feat(physics): wire physx sdk into build

This commit is contained in:
2026-04-15 12:22:15 +08:00
parent 5bf258df6d
commit 31f40e2cbb
2044 changed files with 752623 additions and 1 deletion

View File

@@ -0,0 +1,48 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_CONTACT_MANAGER_H
#define PXG_CONTACT_MANAGER_H
#include "foundation/PxSimpleTypes.h"
namespace physx
{
// Per-pair narrowphase input mirrored to the GPU: identifies the two shapes of
// a contact-manager pair and where their transforms live in the transform cache.
// 16-byte aligned (and 16 bytes in size) so the device can fetch one entry per
// pair as a single aligned load.
struct PX_ALIGN_PREFIX(16) PxgContactManagerInput
{
//Body refs are not needed here. World-space transforms are computed using transformCacheRefs instead!
PxU32 shapeRef0; //Ref to shape 0
PxU32 shapeRef1; //Ref to shape 1
PxU32 transformCacheRef0; //Ref to shape0's transforms in transform cache
PxU32 transformCacheRef1; //Ref to shape1's transform in transform cache
}
PX_ALIGN_SUFFIX(16);
}
#endif

View File

@@ -0,0 +1,69 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_CONVEX_CONVEX_SHAPE_H
#define PXG_CONVEX_CONVEX_SHAPE_H
#include "geometry/PxMeshScale.h"
#define PXG_MAX_PCM_CONTACTS 4
namespace physx
{
// Padding helper that occupies exactly byteSize bytes; the bytes are filler
// and are never read.
template<PxU32 byteSize>
class FlexiblePad
{
PxU8 pad[byteSize];
};
// Specialization for zero padding: an empty class, since a zero-length array
// member is not legal C++.
template<>
class FlexiblePad<0>
{
};
//ML: PxgShape doesn't need to have contactOffset because we will dma a separate contact offset array later
// GPU-mirrored shape record. The trailing comments track the running byte
// offset after each member (the "A or B" forms give the 32-bit / 64-bit
// values, which differ because of the pointer-sized hullOrMeshPtr). The
// struct is padded to exactly 48 bytes on every platform, enforced by the
// compile-time assert below.
struct PX_ALIGN_PREFIX(16) PxgShape
{
PxMeshScale scale; //28
PxU32 materialIndex; //32
size_t hullOrMeshPtr; //36 or 40
PxU32 type; //40 or 44
PxU32 particleOrSoftbodyId; //44 or 48
//FlexiblePad<16 - sizeof(size_t) - sizeof(PxU32)> pad; //48
#if !PX_P64_FAMILY
PxU32 pad0; // explicit pad on 32-bit builds so sizeof(PxgShape) stays 48
#endif
}
PX_ALIGN_SUFFIX(16);
PX_COMPILE_TIME_ASSERT(sizeof(PxgShape) == 48);
}
#endif

View File

@@ -0,0 +1,218 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_GEOMETRY_MANAGER_H
#define PXG_GEOMETRY_MANAGER_H
#include "foundation/PxPreprocessor.h"
#include "foundation/PxArray.h"
#include "foundation/PxHashMap.h"
#include "foundation/PxSimpleTypes.h"
#include "PxgCopyManager.h"
#if PX_LINUX && PX_CLANG
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdocumentation"
#pragma clang diagnostic ignored "-Wdisabled-macro-expansion"
#endif
#include "cuda.h"
#if PX_LINUX && PX_CLANG
#pragma clang diagnostic pop
#endif
// AD: PxgGeometryManager manages the CPU/GPU data transfers and the lifetime of collision geometries: Convex Hulls, Trimeshes, SDFs, Heightfields.
/*
Short overview:
- geometries are added using the add* functions.
- during add, device memory is allocated and indices are given to geometries. We need to allocate the device memory here despite not writing to it
because the pointer is uploaded to GPU together with the shapeInstance data, which happens before we perform the final copy of the geometry in scheduleCopyHtoD.
- We also accumulate the pinned memory requirements for the staging buffer during add.
- scheduleCopyHtoD is performing the bulk of the work:
- we allocate the proper amount of pinned memory
- for each scheduled geometry, we lay it out in the pinned memory
- we push the copy descriptor to the copy manager.
- the pinned memory is deallocated and the queues reset in resetAfterMemCopyCompleted().
Future things:
- look into the sequences of device uploads: we might be able to defer device memory allocations and use pools if we can rearrange things such that
we don't need the device pointer upfront.
- The pinned memory is contiguous, so the copy manager taking pairs of (src, dest) could potentially be replaced by a linear thing.
*/
namespace physx
{
class PxgHeapMemoryAllocatorManager;
class PxgHeapMemoryAllocator;
class PxCudaContext;
namespace Gu
{
struct ConvexHullData;
class TriangleMesh;
struct HeightFieldData;
};
template<typename ArrayT>
class PxgFreeIndicesProvider
{
PX_NOCOPY(PxgFreeIndicesProvider)
public:
PxgFreeIndicesProvider(ArrayT& ref): mDataArray(ref) {}
PxU32 getFreeIndex()
{
if (!mFreeIndices.empty())
{
return mFreeIndices.popBack();
}
else
{
PxU32 currSz = mDataArray.size();
if (currSz + 1 > mDataArray.capacity())
{
mDataArray.reserve(currSz * 2);
}
mDataArray.resize(currSz + 1);
return currSz;
}
}
void setFreeIndex(PxU32 idx)
{
PX_ASSERT(idx < mDataArray.size());
mDeferredFreeIndices.pushBack(idx);
}
void releaseFreeIndices()
{
for (PxU32 a = 0; a < mDeferredFreeIndices.size(); ++a)
{
mFreeIndices.pushBack(mDeferredFreeIndices[a]);
}
mDeferredFreeIndices.forceSize_Unsafe(0);
}
private:
PxArray<PxU32> mDeferredFreeIndices;
PxArray<PxU32> mFreeIndices;
ArrayT& mDataArray;
};
// CUDA array / texture-object pairs backing a mesh's SDF data; the *Subgrids
// members hold the sparse-SDF subgrid texture where one exists. Tracked per
// mesh (see PxgGeometryManager::mMeshToTextureMap) for lifetime management.
struct PxgMeshTextureData
{
CUarray cuArray;
CUtexObject cuTexRef;
CUarray cuArraySubgrids;
CUtexObject cuTexRefSubgrids;
};
// Persistent per-geometry record: where the geometry lives on the device and,
// while an upload is in flight, which scheduled copy descriptor it belongs to.
struct HullOrMeshData
{
void* mDeviceMemPointer; // device allocation holding this geometry
PxU32 mCopyDescIndex; // index of the pending copy, or INVALID_COPY_DESC_INDEX when none
static const PxU32 INVALID_COPY_DESC_INDEX = 0xFFffFFff;
};
class PxgGeometryManager;
// Holds all the data needed to upload a geometry to the GPU. Created when calling add* functions, read during scheduleCopyHtoD.
struct ScheduledCopyData
{
private:
// AD this is local, not intended for widespread usage.
struct UploadGeometryType
{
// Discriminates which geometry kind a scheduled copy refers to, so that
// scheduleCopyHtoD can lay the data out in pinned memory accordingly.
enum Enum
{
eTRIANGLEMESH = 1,
eCONVEXHULL = 2,
eHEIGHTFIELD = 3,
eBOXHULL = 4 // hardcoded convex hull used for boxes (see PxgGeometryManager::mBoxHullIdx)
};
};
public:
PxgCopyManager::CopyDesc mCopyDesc; //!< Copy descriptor for upload to GPU. lives until copy completed.
PxU32 mHullOrTrimeshIdx; //!< Index into mGeometrydata
UploadGeometryType::Enum mType; //!< Geometry type: see UploadGeometryType. Needed for deferred pinned memory allocation.
PxU32 mNumPolyVertices; //!< Number of polygon vertices in case this is a convex hull
const void* mGeometryPtr; //!< Pointer to CPU geometry data.
friend class PxgGeometryManager;
};
class PxgGeometryManager
{
PX_NOCOPY(PxgGeometryManager)
using UploadGeometryType = ScheduledCopyData::UploadGeometryType;
public:
PxgGeometryManager(PxgHeapMemoryAllocatorManager* heapMemoryManager);
virtual ~PxgGeometryManager();
// add*: allocate device memory for the geometry, schedule its upload and
// return the index identifying it (usable with getGeometryDevPtrByIndex).
// See the overview comment at the top of this file.
PxU32 addHull(const Gu::ConvexHullData& hull);
PxU32 addTriMesh(const Gu::TriangleMesh& triMesh);
PxU32 addHeightfield(const Gu::HeightFieldData& hf);
void addBoxHull(); // registers the shared, hardcoded box convex hull (see mBoxHullIdx)
void removeGeometry(PxU32 idx); // releases the geometry at idx; the index is recycled
// Lays out all scheduled geometries in a pinned staging buffer and pushes
// the corresponding copy descriptors to copyMan.
void scheduleCopyHtoD(PxgCopyManager& copyMan, PxCudaContext& cudaContext, CUstream stream);
CUdeviceptr getGeometryDevPtrByIndex(PxU32 idx) const;
CUdeviceptr getBoxHullDevPtr() const;
// Frees the pinned staging memory and resets the copy queues once the
// scheduled copies have completed.
void resetAfterMemcpyCompleted();
private:
PxU32 addGeometryInternal(PxU64 byteSize, const void* geomPtr, UploadGeometryType::Enum type, PxU32 numPolyVertices = 0);
PxgHeapMemoryAllocator* mDeviceMemoryAllocator; //device memory
PxgHeapMemoryAllocator* mPinnedMemoryAllocator; //mapped memory
void* mPinnedMemoryBasePtr; //stores the pointer to the current pinned memory allocation.
PxU64 mPinnedHostMemoryRequirements; // accumulates pinned memory requirements for copies.
PxArray<ScheduledCopyData> mScheduledCopies; // copies in current step, cleared after scheduleCopyHtoD
PxArray<HullOrMeshData> mGeometryData; //persistent geometry data, holds idx -> device pointer mapping
PxgFreeIndicesProvider<PxArray<HullOrMeshData> > mFreeGeometryIndices; //freeIndex provider for mGeometryData.
PxU32 mBoxHullIdx; //index into mGeometryData for hardcoded convex hull for box.
PxHashMap<void*, PxgMeshTextureData> mMeshToTextureMap; //maps meshes to SDF textures for lifetime management
};
};
#endif

View File

@@ -0,0 +1,39 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_NARROWPHASE_H
#define PXG_NARROWPHASE_H
namespace physx
{
//this is needed to force PhysXNarrowphaseGpu linkage as Static Library!
//(referencing this symbol from dependent code makes the linker pull in this
// module -- presumably the definition is otherwise unreferenced; see the .cpp)
void createPxgNarrowphase();
}
#endif

View File

@@ -0,0 +1,817 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_NARROWPHASE_CORE_H
#define PXG_NARROWPHASE_CORE_H
#include "foundation/PxSimpleTypes.h"
#include "foundation/PxTransform.h"
#include "foundation/PxPinnedArray.h"
#include "foundation/PxUserAllocated.h"
#include "task/PxTask.h"
#include "geometry/PxMeshScale.h"
#include "foundation/PxMutex.h"
#include "PxsContactManagerState.h"
#include "PxgContactManager.h"
#include "PxgPersistentContactManifold.h"
#include "PxgCudaBuffer.h"
#include "cudaNpCommon.h"
#include "PxgCudaPagedLinearAllocator.h"
#include "PxgCopyManager.h"
#include "PxgConvexConvexShape.h"
#include "PxgGeometryManager.h"
#include "PxvNphaseImplementationContext.h"
#include "PxgHeapMemAllocator.h"
#include "PxgShapeManager.h"
#include "PxgRadixSortDesc.h"
#include "geometry/PxParticleSystemGeometry.h"
#include "PxvGeometry.h"
namespace physx
{
class PxsTransformCache;
struct PxsMaterialData;
struct PxsCachedTransform;
struct PxcDataStreamPool;
class PxgCudaKernelWranglerManager;
class PxgNphaseImplementationContext;
class PxRenderOutput;
class PxgGpuContext;
struct PxTriangleMeshGeometryLL;
class PxgParticleSystemCore;
struct PxgFemFemContactInfo;
struct PxgFemOtherContactInfo;
namespace Gu
{
struct ConvexHullData;
class TriangleMesh;
class DeformableVolumeMesh;
class BVTetrahedronMesh;
struct HeightFieldData;
class PersistentContactManifold;
}
namespace Cm
{
class FlushPool;
}
namespace IG
{
class IslandSim;
}
// Dispatch buckets for GPU narrowphase pairs, one per geometry-type
// combination. The per-entry comment states which kind of persistent contact
// manifold the bucket uses (single manifold, multi-manifold, or none); the
// enum order must stay in sync with the BUCKET_ManifoldSize table below.
struct GPU_BUCKET_ID
{
enum Enum
{
eFallback = 0,
eConvex = 1, //manifold
eConvexPlane = 2, //manifold
eConvexTrimesh = 3, //multi-manifold
eConvexHeightfield = 4, //multi-manifold
eSphereTrimesh = 5, //multi-manifold
eSphereHeightfield = 6, //multi-manifold
eTrianglePlane = 7, //multi-manifold - technically could be single manifold but needs 6 points
eSphere = 8, //no manifold
eBoxBox = 9,
eConvexCorePlane = 10,
eConvexCoreConvex = 11,
eConvexCoreTrimesh = 12,
eConvexCoreTetmesh = 13,
eConvexCoreClothmesh = 14,
eTriangleHeightfield = 15, //no manifold
eTriangleTriangle = 16, //no manifold
eSoftbody = 17, //no manifold
eSoftbodies = 18, //no manifold
eSoftbodyFemCloth = 19, //no manifold
eSoftbodyTrimesh = 20, //no manifold
eSoftbodySdfTrimesh = 21, //no manifold
eSoftbodyHeightfield = 22, //no manifold
eFemClothSphere = 23, //no manifold
eFemClothPlane = 24, //no manifold
eFemClothBox = 25, //no manifold
eFemClothConvexes = 26, //no manifold
eFemClothes = 27, //no manifold
eFemClothTrimesh = 28, //no manifold
eFemClothSdfTrimesh = 29,
eFemClothHeightfield = 30, //no manifold
eConvexParticle = 31, //no manifold
eParticlesystems = 32, //no manifold
eParticlesystemSoftbody = 33, //no manifold
eParticlesystemFemCloth = 34, //no manifold
eParticlesystemTrimesh = 35, //no manifold
eParticlesystemSdfTrimesh = 36, //no manifold
eParticlesystemHeightfield = 37, //no manifold
eCount
};
};
// Bytes of persistent contact-manifold storage required per pair for each
// bucket, indexed by GPU_BUCKET_ID::Enum (the trailing comment repeats the
// index). Zero means the bucket keeps no persistent manifold. Must stay in
// sync with the GPU_BUCKET_ID enum above.
const PxU32 BUCKET_ManifoldSize[GPU_BUCKET_ID::eCount] = {
0, //0
sizeof(PxgPersistentContactManifold), //1
sizeof(PxgPersistentContactManifold), //2
sizeof(PxgPersistentContactMultiManifold), //3
sizeof(PxgPersistentContactMultiManifold), //4
sizeof(PxgPersistentContactMultiManifold), //5
sizeof(PxgPersistentContactMultiManifold), //6
sizeof(PxgPersistentContactMultiManifold), //7
0, //8
0, //9
0, //10
0, //11
0, //12
0, //13
0, //14
0, //15
0, //16
0, //17
0, //18
0, //19
0, //20
0, //21
0, //22
0, //23
0, //24
0, //25
0, //26
0, //27
0, //28
0, //29
0, //30
0, //31
0, //32
0, //33
0, //34
0, //35
0, //36
0, //37
};
// Host-side (pinned memory, per the PxPinnedArray containers) arrays
// describing one bucket's contact-manager pairs. The five arrays are
// parallel: entry i of each describes pair i.
struct PxgContactManagers : public PxsContactManagerBase
{
PxgContactManagers(const PxU32 bucketId, const PxVirtualAllocator& allocator) : PxsContactManagerBase(bucketId),
mGpuInputContactManagers(allocator),
mCpuContactManagerMapping(allocator),
mShapeInteractions(allocator),
mRestDistances(allocator),
mTorsionalProperties(allocator)
{
}
PxPinnedArray<PxgContactManagerInput> mGpuInputContactManagers; // per-pair shape/transform-cache refs uploaded to the GPU
PxPinnedArray<PxsContactManager*> mCpuContactManagerMapping; // per-pair owning CPU contact manager
PxPinnedArray<const Sc::ShapeInteraction*> mShapeInteractions; // per-pair shape interaction
PxFloatArrayPinned mRestDistances; // per-pair rest distance
PxPinnedArray<PxsTorsionalFrictionData> mTorsionalProperties; // per-pair torsional friction data
// Resets all arrays to size 0 without releasing their capacity.
void clear()
{
mGpuInputContactManagers.forceSize_Unsafe(0);
mCpuContactManagerMapping.forceSize_Unsafe(0);
mShapeInteractions.forceSize_Unsafe(0);
mRestDistances.forceSize_Unsafe(0);
mTorsionalProperties.forceSize_Unsafe(0);
}
// Reserves capacity in all arrays ahead of adding nbToPreallocate pairs.
void preallocateNewBuffers(PxU32 nbToPreallocate)
{
mGpuInputContactManagers.reserve(nbToPreallocate);
mCpuContactManagerMapping.reserve(nbToPreallocate);
mShapeInteractions.reserve(nbToPreallocate);
mRestDistances.reserve(nbToPreallocate);
mTorsionalProperties.reserve(nbToPreallocate);
}
private:
PX_NOCOPY(PxgContactManagers)
};
// Contact managers newly added this frame. Extends PxgContactManagers with a
// parallel pinned array of per-pair GPU output records.
struct PxgNewContactManagers : public PxgContactManagers
{
PxPinnedArray<PxsContactManagerOutput> mGpuOutputContactManagers; // per-pair narrowphase outputs
PxgNewContactManagers(const PxU32 bucketIndex, const PxVirtualAllocator& allocator) : PxgContactManagers(bucketIndex, allocator),
mGpuOutputContactManagers(allocator)
{
}
// Resets all arrays (base + outputs) to size 0, keeping capacity.
void clear()
{
PxgContactManagers::clear();
mGpuOutputContactManagers.forceSize_Unsafe(0);
}
// Reserves capacity in all arrays (base + outputs) for nbToPreallocate pairs.
void preallocateNewBuffers(PxU32 nbToPreallocate)
{
mGpuOutputContactManagers.reserve(nbToPreallocate);
PxgContactManagers::preallocateNewBuffers(nbToPreallocate);
}
};
// Device-side buffers for one bucket's contact-manager pairs, plus scratch
// buffers for pair management and a pinned counter pair for host readback.
struct PxgGpuContactManagers
{
PxgTypedCudaBuffer<PxgContactManagerInput> mContactManagerInputData; // device mirror of the per-pair inputs
PxgTypedCudaBuffer<PxsContactManagerOutput> mContactManagerOutputData; // device mirror of the per-pair outputs
PxgCudaBuffer mPersistentContactManifolds; // manifold storage; element size depends on the bucket (see BUCKET_ManifoldSize)
PxgTypedCudaBuffer<PxU32> mTempRunsumArray; // scratch for prefix-sum passes
PxgTypedCudaBuffer<PxU32> mTempRunsumArray2;
PxgTypedCudaBuffer<PxU32> mBlockAccumulationArray;
PxgTypedCudaBuffer<PxsContactManagerOutputCounts> mLostFoundPairsOutputData;
PxgTypedCudaBuffer<PxsContactManager*> mLostFoundPairsCms;
PxgTypedCudaBuffer<PxsContactManager*> mCpuContactManagerMapping;
PxgTypedCudaBuffer<Sc::ShapeInteraction*> mShapeInteractions;
PxgTypedCudaBuffer<PxReal> mRestDistances;
PxgTypedCudaBuffer<PxsTorsionalFrictionData> mTorsionalProperties;
uint2* mLostAndTotalReportedPairsCountPinned; // pinned readback counters; x/y are read by PxgGpuContactManagerPair's accessors
const PxU32 mBucketIndex; // GPU_BUCKET_ID::Enum value this set belongs to
PxgGpuContactManagers(const PxU32 bucketIndex, PxgHeapMemoryAllocatorManager* manager) :
mContactManagerInputData(manager, PxsHeapStats::eNARROWPHASE),
mContactManagerOutputData(manager, PxsHeapStats::eNARROWPHASE),
mPersistentContactManifolds(manager, PxsHeapStats::eNARROWPHASE),
mTempRunsumArray(manager, PxsHeapStats::eNARROWPHASE),
mTempRunsumArray2(manager, PxsHeapStats::eNARROWPHASE),
mBlockAccumulationArray(manager, PxsHeapStats::eNARROWPHASE),
mLostFoundPairsOutputData(manager, PxsHeapStats::eNARROWPHASE),
mLostFoundPairsCms(manager, PxsHeapStats::eNARROWPHASE),
mCpuContactManagerMapping(manager, PxsHeapStats::eNARROWPHASE),
mShapeInteractions(manager, PxsHeapStats::eNARROWPHASE),
mRestDistances(manager, PxsHeapStats::eNARROWPHASE),
mTorsionalProperties(manager, PxsHeapStats::eNARROWPHASE),
mLostAndTotalReportedPairsCountPinned(NULL),
mBucketIndex(bucketIndex)
{
PX_ASSERT(bucketIndex < (1 << PxsContactManagerBase::MaxBucketBits));
}
private:
PX_NOCOPY(PxgGpuContactManagers)
};
// Device scratch buffers shared by the pair-management (lost-pair removal)
// kernels; all allocated from the narrowphase heap.
struct PxgGpuPairManagementBuffers
{
PxgCudaBuffer mTempRunsumArray;
PxgCudaBuffer mBlockAccumulationArray;
PxgCudaBuffer mRemovedIndicesArray;
PxgGpuPairManagementBuffers(PxgHeapMemoryAllocatorManager* manager):
mTempRunsumArray(manager, PxsHeapStats::eNARROWPHASE), mBlockAccumulationArray(manager, PxsHeapStats::eNARROWPHASE), mRemovedIndicesArray(manager, PxsHeapStats::eNARROWPHASE)
{
}
private:
PX_NOCOPY(PxgGpuPairManagementBuffers)
};
// Host-side pairing of a bucket's persistent (existing) contact managers with
// those newly added this frame; CPU counterpart of PxgGpuContactManagerPair.
class PxgMirroredContactManagerPair : public PxUserAllocated
{
public:
PxgMirroredContactManagerPair(const PxU32 bucketId, const PxVirtualAllocator& allocator) :
mContactManagers(bucketId, allocator),
mNewContactManagers(bucketId, allocator)
{
}
// Pair counts: first pass tests existing pairs, second pass the new ones.
PX_FORCE_INLINE PxU32 getNbFirstPassTests() { return mContactManagers.mCpuContactManagerMapping.size(); }
PX_FORCE_INLINE PxU32 getNbSecondPassTests() { return mNewContactManagers.mCpuContactManagerMapping.size(); }
PX_FORCE_INLINE PxU32 getNbPassTests() { return mContactManagers.mCpuContactManagerMapping.size() + mNewContactManagers.mCpuContactManagerMapping.size(); }
// Grows the existing set's input array so it can absorb the new pairs,
// doubling capacity to amortize growth.
// NOTE(review): only mGpuInputContactManagers is reserved here (the mapping
// reserve is commented out) -- confirm the sibling arrays are grown elsewhere.
PX_FORCE_INLINE void allocateContactManagers()
{
const PxU32 newPairCount = mContactManagers.mGpuInputContactManagers.size() + mNewContactManagers.mGpuInputContactManagers.size();
if (newPairCount > mContactManagers.mGpuInputContactManagers.capacity())
{
PxU32 newSize = PxMax(newPairCount, mContactManagers.mGpuInputContactManagers.capacity() * 2);
mContactManagers.mGpuInputContactManagers.reserve(newSize);
//mContactManagers.mCpuContactManagerMapping.reserve(newSize);
}
}
PxgContactManagers mContactManagers; //existing contact managers
PxgNewContactManagers mNewContactManagers; //new contact managers
};
// Device-side counterpart of PxgMirroredContactManagerPair: existing and new
// GPU contact-manager buffer sets for one bucket, plus helpers that allocate
// shared scratch/readback storage and sum the readback counters.
class PxgGpuContactManagerPair : public PxUserAllocated
{
public:
PxgGpuContactManagerPair(const PxU32 bucketId, PxgHeapMemoryAllocatorManager* manager) :
mContactManagers(bucketId, manager),
mNewContactManagers(bucketId, manager)
{
}
// Allocates the per-set block accumulation scratch (size PxU32 entries each).
PX_FORCE_INLINE void allocateBlockAccumulationArray(const PxU32 size)
{
mContactManagers.mBlockAccumulationArray.allocate(sizeof(PxU32) * size, PX_FL);
mNewContactManagers.mBlockAccumulationArray.allocate(sizeof(PxU32) * size, PX_FL);
}
// Allocates the pinned uint2 readback counters for both sets.
PX_FORCE_INLINE void allocateLostAndTotalReportedPairsCount(PxsHeapMemoryAllocator* mappedMemoryAllocators)
{
mContactManagers.mLostAndTotalReportedPairsCountPinned = reinterpret_cast<uint2*>(mappedMemoryAllocators->allocate(sizeof(uint2), PxsHeapStats::eNARROWPHASE, PX_FL));
mNewContactManagers.mLostAndTotalReportedPairsCountPinned = reinterpret_cast<uint2*>(mappedMemoryAllocators->allocate(sizeof(uint2), PxsHeapStats::eNARROWPHASE, PX_FL));
}
// Sum over both sets of the counters' y component (pairs).
PX_FORCE_INLINE PxU32 getTotalLostFoundPairs()
{
return mContactManagers.mLostAndTotalReportedPairsCountPinned->y + mNewContactManagers.mLostAndTotalReportedPairsCountPinned->y;
}
// Sum over both sets of the counters' x component (patches).
PX_FORCE_INLINE PxU32 getTotalLostFoundPatches()
{
return mContactManagers.mLostAndTotalReportedPairsCountPinned->x + mNewContactManagers.mLostAndTotalReportedPairsCountPinned->x;
}
PxgGpuContactManagers mContactManagers;
PxgGpuContactManagers mNewContactManagers;
};
struct PxsShapeCore;
struct PxgKernelIds;
class PxgGpuNarrowphaseCore : public PxUserAllocated
{
PX_NOCOPY(PxgGpuNarrowphaseCore)
public:
PxgMirroredContactManagerPair* mContactManagers[GPU_BUCKET_ID::eCount]; //some order as BUCKET_ID. 0 is fallback and it will be NULL in this array
PxgGpuContactManagerPair* mGpuContactManagers[GPU_BUCKET_ID::eCount]; //some order as BUCKET_ID. 0 is fallback and it will be NULL in this array
PxgGpuPairManagementBuffers mPairManagementBuffers;
PxgCudaBuffer mGpuTransformCache;
PxgCudaBuffer mGpuContactDistance;
typedef PxInt32ArrayPinned RemovedIndicesArray;
RemovedIndicesArray* mRemovedIndices[GPU_BUCKET_ID::eCount];
PxBitMap mKeepMap;
PxPinnedArray<PxsContactManagerOutputCounts> mLostFoundPairsOutputData;
PxPinnedArray<PxsContactManager*> mLostFoundPairsCms;
PxU32 mTotalLostFoundPairs;
PxU32 mTotalLostFoundPatches;
PxU32 mTotalNumPairs;
PxgPairManagementData* mPairManagementData[GPU_BUCKET_ID::eCount]; //mapped memory
PxgCudaBuffer mGpuPairManagementData;
PxPinnedArray<PxgRadixSortDesc> mRSDesc;
PxgCudaBufferN<2> mRadixSortDescBuf; //radix sort with rank
PxgCudaBuffer mTempGpuRigidIndiceBuf;
PxgCudaBuffer mTempGpuShapeIndiceBuf;
PxgCudaBuffer mRadixCountTotalBuf;
CUdeviceptr mContactStream;
CUdeviceptr mPatchStream;
CUdeviceptr mForceAndIndiceStream;
PxgTypedCudaBuffer<PxgPatchAndContactCounters> mPatchAndContactCountersOnDevice; //device memory
PxgPatchAndContactCounters* mPatchAndContactCountersReadback; //host memory
PxgShapeManager mGpuShapesManager;
PxgMaterialManager mGpuMaterialManager;
PxgFEMSoftBodyMaterialManager mGpuFEMMaterialManager;
PxgFEMClothMaterialManager mGpuFEMClothMaterialManager;
PxgPBDMaterialManager mGpuPBDMaterialManager;
//device memory
PxgCudaPagedLinearAllocator<PxgHeapMemoryAllocator> mIntermStackAlloc;
CUstream mStream;
CUstream mSolverStream; //this is the stream handle belong to the solver, we can't destroy the solver stream
PxgCudaKernelWranglerManager* mGpuKernelWranglerManager;
PxCudaContextManager* mCudaContextManager;
PxCudaContext* mCudaContext;
PxgHeapMemoryAllocatorManager* mHeapMemoryManager;
PxgCopyManager mCopyMan;
PxgCopyManager mCopyManBp;
PxgGeometryManager mGeometryManager;
IG::IslandSim* mIslandSim;
PxgNphaseImplementationContext* mNphaseImplContext;
PxgGpuContext* mGpuContext;
PxU32 mCollisionStackSizeBytes;
PxU32* mMaxConvexMeshTempMemory;
struct RefcountedRecord
{
PxU32 refCnt;
PxU32 idx;
};
typedef PxHashMap<size_t, RefcountedRecord> RefcountedRecordsMap;
RefcountedRecordsMap* mShapesMap;
RefcountedRecordsMap* mGeometriesMap;
RefcountedRecordsMap* mMaterialsMap;
RefcountedRecordsMap* mFEMMaterialsMap;
RefcountedRecordsMap* mFEMClothMaterialsMap;
RefcountedRecordsMap* mPBDMaterialsMap;
PxgCudaBuffer mGpuMultiManifold;
PxgCudaBuffer mGpuManifold;
CUevent mParticleEvent;
CUevent mSoftbodyEvent;
CUevent mFemClothEvent;
CUevent mDirectApiDmaEvent;
#if PX_ENABLE_SIM_STATS
PxU32 mGpuDynamicsRigidContactCountStats;
PxU32 mGpuDynamicsRigidPatchCountStats;
PxU32 mGpuDynamicsCollisionStackSizeStats;
#else
PX_CATCH_UNDEFINED_ENABLE_SIM_STATS
#endif
void createGpuStreamsAndEvents();
void releaseGpuStreamsAndEvents();
void removeLostPairsGpu(const PxU32 bucketID, const PxU16 stage5KernelID, const bool copyManifold);
void appendContactManagersGpu(PxU32 nbExistingManagers, PxU32 nbNewManagers, PxgGpuContactManagers& gpuContactManagers, PxgGpuContactManagers& newGpuContactManagers, PxU32 manifoldSize);
void drawManifold(PxgPersistentContactManifold* manifolds, PxgContactManagerInput* cmInput, PxsCachedTransform* cachedTransform, const PxU32 numTests, PxRenderOutput& renderOutput,
const PxU32 color, const PxF32 size);
void drawPoint(PxRenderOutput& renderOutput, PxVec3 point, const PxU32 color, const PxF32 size);
void drawLine(PxRenderOutput& out, PxVec3 a, const PxVec3 b, const PxU32 color);
template <typename ManagementData, typename Manifold>
void removeLostPairsGpuInternal( ManagementData& cpuBuffer, CUdeviceptr gpuBuffer,
PxgContactManagers& contactManagers, PxgGpuContactManagers& gpuContactManagers, PxInt32ArrayPinned& removedIndices, PxgGpuPairManagementBuffers& pairManagementBuffers,
PxU16 stage5KernelID, const bool copyManifold = true);
public:
PxgGpuNarrowphaseCore(PxgCudaKernelWranglerManager* gpuKernelWrangler, PxCudaContextManager* cudaContextManager, const PxGpuDynamicsMemoryConfig& gpuDynamicsConfig, void* contactStreamBase,
void* patchStreamBase, void* forceAndIndiceStreamBase, IG::IslandSim* islandSim, CUstream solverStream, PxgHeapMemoryAllocatorManager* heapMemoryManager,
PxgNphaseImplementationContext* nphaseImplContext);
virtual ~PxgGpuNarrowphaseCore();
// --- Narrowphase dispatch entry points ---------------------------------------
// One testSDK* method per shape-pair bucket of contact managers. The
// baseContact* pointers with their matching *BytesLimit values describe the
// contact/patch/force output streams results are written into; renderOutput,
// where present, is presumably for debug visualization — confirm at call sites.
void testSDKSphereGpu(PxgGpuContactManagers& gpuManagers, const PxU32 numTests,
PxU8* baseContactPatches, PxU8* baseContactPoints, PxU8* baseContactForces, PxU32 patchBytesLimit, PxU32 contactBytesLimit, PxU32 forceBytesLimit,
PxRenderOutput* renderOutput);
void testSDKBoxBoxGpu(PxgGpuContactManagers& gpuManagers, const PxU32 numTests,
PxU8* baseContactPatches, PxU8* baseContactPoints, PxU8* baseContactForces, PxU32 patchBytesLimit, PxU32 contactBytesLimit, PxU32 forceBytesLimit,
PxRenderOutput* renderOutput);
// GJK/EPA-based convex-vs-convex/plane/core pairs.
void testSDKConvexConvexGjkEpaGpu(PxgGpuContactManagers& gpuManagers, bool insertAveragePoint, const PxU32 numTests,
PxU8* baseContactPatches, PxU8* baseContactPoints, PxU8* baseContactForces, PxU32 patchBytesLimit, PxU32 contactBytesLimit, PxU32 forceBytesLimit,
PxRenderOutput* renderOutput);
void testSDKConvexPlaneGjkEpaGpu(PxgGpuContactManagers& gpuManagers, bool insertAveragePoint, const PxU32 numTests,
PxU8* baseContactPatches, PxU8* baseContactPoints, PxU8* baseContactForces, PxU32 patchBytesLimit, PxU32 contactBytesLimit, PxU32 forceBytesLimit,
PxRenderOutput* renderOutput);
void testSDKConvexCorePlaneGjkEpaGpu(PxgGpuContactManagers& gpuManagers, bool insertAveragePoint, const PxU32 numTests,
PxU8* baseContactPatches, PxU8* baseContactPoints, PxU8* baseContactForces, PxU32 patchBytesLimit, PxU32 contactBytesLimit, PxU32 forceBytesLimit,
PxRenderOutput* renderOutput);
void testSDKConvexCoreConvexGjkEpaGpu(PxgGpuContactManagers& gpuManagers, bool insertAveragePoint, const PxU32 numTests,
PxU8* baseContactPatches, PxU8* baseContactPoints, PxU8* baseContactForces, PxU32 patchBytesLimit, PxU32 contactBytesLimit, PxU32 forceBytesLimit,
PxRenderOutput* renderOutput);
void testSDKConvexCoreTrimeshGjkEpaGpu(PxgGpuContactManagers& gpuManagers, bool insertAveragePoint, const PxU32 numTests,
PxU8* baseContactPatches, PxU8* baseContactPoints, PxU8* baseContactForces, PxU32 patchBytesLimit, PxU32 contactBytesLimit, PxU32 forceBytesLimit,
PxRenderOutput* renderOutput);
// Convex-core vs deformable meshes: no explicit contact stream parameters here.
void testSDKConvexCoreTetmeshGjkEpaGpu(PxgGpuContactManagers& gpuManagers, const PxU32 numTests, PxRenderOutput* renderOutput);
void testSDKConvexCoreClothmeshGjkEpaGpu(PxgGpuContactManagers& gpuManagers, const PxU32 numTests, PxRenderOutput* renderOutput);
// SAT-based mesh/heightfield pairs.
void testSDKSphereTriMeshSATGpu(PxgGpuContactManagers& gpuManagers, bool insertAveragePoint, const PxU32 numTests,
PxU8* baseContactPatches, PxU8* baseContactPoints, PxU8* baseContactForces,
PxU32 patchBytesLimit, PxU32 contactBytesLimit, PxU32 forceBytesLimit);
void testSDKSphereHeightfieldGpu(PxgGpuContactManagers& gpuManagers, bool insertAveragePoint, const PxU32 numTests,
PxU8* baseContactPatches, PxU8* baseContactPoints, PxU8* baseContactForces,
PxU32 patchBytesLimit, PxU32 contactBytesLimit, PxU32 forceBytesLimit);
void testSDKTriMeshPlaneGpu(PxgGpuContactManagers& gpuManagers, const PxU32 numTests,
PxU8* baseContactPatches, PxU8* baseContactPoints, PxU8* baseContactForces,
PxU32 patchBytesLimit, PxU32 contactBytesLimit, PxU32 forceBytesLimit);
void testSDKTriMeshHeightfieldGpu(PxgGpuContactManagers& gpuManagers, const PxU32 numTests,
PxU8* baseContactPatches, PxU8* baseContactPoints, PxU8* baseContactForces,
PxU32 patchBytesLimit, PxU32 contactBytesLimit, PxU32 forceBytesLimit);
void testSDKTriMeshTriMeshGpu(PxgGpuContactManagers& gpuManagers,const PxU32 numTests,
PxU8* baseContactPatches, PxU8* baseContactPoints, PxU8* baseContactForces,
PxU32 patchBytesLimit, PxU32 contactBytesLimit, PxU32 forceBytesLimit);
void testSDKConvexTriMeshSATGpu(PxgGpuContactManagers& gpuManagers, bool insertAveragePoint, const PxU32 numTests,
PxU8* baseContactPatches, PxU8* baseContactPoints, PxU8* baseContactForces,
PxU32 patchBytesLimit, PxU32 contactBytesLimit, PxU32 forceBytesLimit);
void testSDKConvexHeightfieldGpu(PxgGpuContactManagers& gpuManagers, bool insertAveragePoint, const PxU32 numTests,
PxU8* baseContactPatches, PxU8* baseContactPoints, PxU8* baseContactForces,
PxU32 patchBytesLimit, PxU32 contactBytesLimit, PxU32 forceBytesLimit);
// Particle-system pairs.
void testSDKParticleSystemGpu(PxgGpuContactManagers& gpuManagers, const PxU32 numTests);
void testSDKParticleSoftbody(PxgGpuContactManagers& gpuManagers, const PxU32 numTests,
PxRenderOutput* renderOutput);
void testSDKParticleFemCloth(PxgGpuContactManagers& gpuManagers, const PxU32 numTests,
PxRenderOutput* renderOutput);
void testSDKConvexParticle(PxgGpuContactManagers& gpuManagers, const PxU32 numTests);
void testSDKParticleSdfTriMesh(PxgGpuContactManagers& gpuManagers, const PxU32 numTests);
void testSDKParticleTriMesh(PxgGpuContactManagers& gpuManagers,const PxU32 numTests);
void testSDKParticleHeightfield(PxgGpuContactManagers& gpuManagers, const PxU32 numTests);
// Softbody pairs.
void testSDKSoftbody(PxgGpuContactManagers& gpuManagers, const PxU32 numTests, PxRenderOutput* renderOutput);
void testSDKSoftbodies(PxgGpuContactManagers& gpuManagers, const PxU32 numTests, PxRenderOutput* renderOutput);
void testSDKSoftbodyCloth(PxgGpuContactManagers& gpuManagers, const PxU32 numTests, PxRenderOutput* renderOutput);
void testSDKSoftbodyTrimesh(PxgGpuContactManagers& gpuManagers, const PxU32 numTests, PxRenderOutput* renderOutput);
void testSDKSoftbodySdfTrimesh(PxgGpuContactManagers& gpuManagers, const PxU32 numTests, PxRenderOutput* renderOutput);
void testSDKSoftbodyHeightfield(PxgGpuContactManagers& gpuManagers, const PxU32 numTests, PxRenderOutput* renderOutput);
// FEM cloth pairs.
void testSDKFemClothSphere(PxgGpuContactManagers& gpuManagers, const PxU32 numTests, PxRenderOutput* renderOutput);
void testSDKFemClothPlane(PxgGpuContactManagers& gpuManagers, const PxU32 numTests, PxRenderOutput* renderOutput);
void testSDKFemClothBox(PxgGpuContactManagers& gpuManagers, const PxU32 numTests, PxRenderOutput* renderOutput);
void testSDKFemClothConvexes(PxgGpuContactManagers& gpuManagers, const PxU32 numTests, PxRenderOutput* renderOutput);
void testSDKFemClothCloth(PxgGpuContactManagers& gpuManagers, const PxU32 numTests, PxRenderOutput* renderOutput);
void testSDKFemClothSdfTrimesh(PxgGpuContactManagers& gpuManagers, const PxU32 numTests, PxRenderOutput* renderOutput);
void testSDKFemClothTrimesh(PxgGpuContactManagers& gpuManagers, const PxU32 numTests, PxRenderOutput* renderOutput);
void testSDKFemClothHeightfield(PxgGpuContactManagers& gpuManagers, const PxU32 numTests, PxRenderOutput* renderOutput);
void updateFrictionPatches(PxgGpuContactManagers& gpuManagers, PxU32 count, PxU8* baseContactPatches, PxU8* baseFrictionPatches);
// Copies narrowphase results back for CPU consumption, merging GPU outputs with
// the fallback pairs and found-patch managers.
void fetchNarrowPhaseResults(PxcDataStreamPool* contactStreamPool, PxcDataStreamPool* patchStreamPool, PxcDataStreamPool* forceStreamPool,
PxsContactManagerOutput* cmOutputs, const Sc::ShapeInteraction*const* shapeInteractions, const PxReal* restDistances, const PxsTorsionalFrictionData* torsionalData, PxU32 nbFallbackPairs,
const PxsContactManagerOutputCounts* foundPatchCounts, const PxsContactManager*const* foundPatchManagers, PxU32 nbFoundPatchManagers);
void syncNotRigidWithNp();
// CUDA context scoping around GPU work.
void acquireContext();
void releaseContext();
CUstream getStream();
// --- Contact-manager lifecycle (bucketId selects the pair-type bucket) -------
void registerContactManager(PxsContactManager* cm, const Sc::ShapeInteraction* shapeInteraction, PxsContactManagerOutput& output, const PxU32 bucketId);
void unregisterContactManager(PxsContactManager* manager, const PxU32 bucketId);
void refreshContactManager(PxsContactManager* manager, PxsContactManagerOutput* cmOutputs, PxgContactManagerInput& input, const PxU32 bucketId);
void removeLostPairs();
void appendContactManagers(PxsContactManagerOutput* cmOutputs, PxU32 nbFallbackPairs);
void prepareTempContactManagers();
template <GPU_BUCKET_ID::Enum>
void prepareTempContactManagers();
void prepareTempContactManagersTasks(Cm::FlushPool& flushPool, PxBaseTask* continuation);
// GPU-compatibility queries for mesh geometry.
bool isMeshGPUCompatible(const PxTriangleMeshGeometryLL& meshData);
bool isClothMeshGPUCompatible(const PxTriangleMeshGeometryLL& meshData); // skipping material counts
bool isTetMeshGPUCompatible(const Gu::BVTetrahedronMesh* meshData);
void computeRigidsToShapes();
void prepareGpuNarrowphase(PxsTransformCache& cache, const PxReal* contactDistances, bool hasContactDistanceChanged);
// Device-pointer accessors into the eConvex bucket's per-pair data.
PxsContactManagerOutput* getGPUContactManagerOutputBase() { return reinterpret_cast<PxsContactManagerOutput*>(mGpuContactManagers[GPU_BUCKET_ID::eConvex]->mContactManagers.mContactManagerOutputData.getDevicePtr()); }
PxReal* getGPURestDistances() { return reinterpret_cast<PxReal*>(mGpuContactManagers[GPU_BUCKET_ID::eConvex]->mContactManagers.mRestDistances.getDevicePtr()); }
PxsTorsionalFrictionData* getGPUTorsionalData() { return reinterpret_cast<PxsTorsionalFrictionData*>(mGpuContactManagers[GPU_BUCKET_ID::eConvex]->mContactManagers.mTorsionalProperties.getDevicePtr()); }
Sc::ShapeInteraction** getGPUShapeInteractions() { return reinterpret_cast<Sc::ShapeInteraction**>(mGpuContactManagers[GPU_BUCKET_ID::eConvex]->mContactManagers.mShapeInteractions.getDevicePtr()); }
// Host-side and GPU-side manager accessors per bucket.
PxgContactManagers& getExistingContactManagers(GPU_BUCKET_ID::Enum type) { return mContactManagers[type]->mContactManagers; }
PxgNewContactManagers& getNewContactManagers(GPU_BUCKET_ID::Enum type) { return mContactManagers[type]->mNewContactManagers; }
PxgGpuContactManagers& getExistingGpuContactManagers(GPU_BUCKET_ID::Enum type) { return mGpuContactManagers[type]->mContactManagers; }
PxgGpuContactManagers& getNewGpuContactManagers(GPU_BUCKET_ID::Enum type) { return mGpuContactManagers[type]->mNewContactManagers; }
PxPinnedArray<PxsContactManagerOutputCounts>& getLostFoundPairsOutput() { return mLostFoundPairsOutputData; }
PxPinnedArray<PxsContactManager*>& getLostFoundPairsCms() { return mLostFoundPairsCms; }
PxgCudaBuffer& getTransformCache() { return mGpuTransformCache; }
PxU32 getTotalNbLostFoundPairs() const { return mTotalLostFoundPairs; }
PxU32 getTotalNbLostFoundPatches() const { return mTotalLostFoundPatches; }
void preallocateNewBuffers(PxU32 nbNewPairs);
void pushBuffer();
// --- Geometry registration (hulls, meshes, heightfields) ---------------------
PxU32 addHull(const Gu::ConvexHullData& hull);
PxU32 getHullIdxByHostPtr(const Gu::ConvexHullData* hull);
void removeGeometry(PxU32 idx);
//schedules data copies on the near phase stream
void uploadDataChunksToGpu();
void waitAndResetCopyQueues();
//schedules particle material copies on the broad phase stream,
//because particle materials are already accessed in pre integration
//- we might be able to remove this dependency
void uploadDataChunksToGpuBp();
void waitAndResetCopyQueuesBp();
PxU32 addTriMesh(const Gu::TriangleMesh& mesh);
PxU32 getTriMeshIdxByHostPtr(const Gu::TriangleMesh* mesh);
PxU32 addHeightfield(const Gu::HeightFieldData& hf);
PxU32 getHeightfieldIdxByHostPtr(const Gu::HeightFieldData* hf);
// --- Shape and aggregate registration ----------------------------------------
void registerShape(const PxNodeIndex& nodeIndex, const PxsShapeCore& shapeCore, const PxU32 transformCacheID, const bool isFemCloth, PxActor* actor);
void updateShapeMaterial(const PxsShapeCore& shapeCore);
PxU32 getShapeIndex(const PxsShapeCore& shapeCore);
void unregisterShape(const PxsShapeCore& shapeCore, const PxU32 transformCacheID, const bool isFemCloth);
void registerAggregate(const PxU32 transformCacheID);
// --- Material registration (rigid, FEM surface/volume, particle) -------------
void registerMaterial(const PxsMaterialCore& materialCore);
void updateMaterial(const PxsMaterialCore& materialCore);
void unregisterMaterial(const PxsMaterialCore& materialCore);
void registerFEMMaterial(const PxsDeformableSurfaceMaterialCore& materialCore);
void updateFEMMaterial(const PxsDeformableSurfaceMaterialCore& materialCore);
void unregisterFEMMaterial(const PxsDeformableSurfaceMaterialCore& materialCore);
void registerFEMMaterial(const PxsDeformableVolumeMaterialCore& materialCore);
void updateFEMMaterial(const PxsDeformableVolumeMaterialCore& materialCore);
void unregisterFEMMaterial(const PxsDeformableVolumeMaterialCore& materialCore);
void registerParticleMaterial(const PxsPBDMaterialCore& materialCore);
void updateParticleMaterial(const PxsPBDMaterialCore& materialCore);
void unregisterParticleMaterial(const PxsPBDMaterialCore& materialCore);
//direct gpu contact access
bool copyContactData(void* data, PxU32* numContactPairs, const PxU32 maxContactPairs, CUevent startEvent,
CUevent finishEvent, PxU8* baseContactPatches, PxU8* baseContactPoints, PxU8* baseContactForces);
bool evaluateSDFDistances(PxVec4* localGradientAndSDFConcatenated, const PxShapeGPUIndex* shapeIndices, const PxVec4* localSamplePointsConcatenated, const PxU32* samplePointCountPerShape, PxU32 nbElements, PxU32 maxPointCount, CUevent startEvent, CUevent finishEvent);
void synchronizedStreams(CUstream artiStream);
PxgShapeManager& getGpuShapeManager() { return mGpuShapesManager; }
// Maps SDK material indices to GPU material indices for a given material type.
template <class MaterialData>
void mapMaterialIndices(PxU16* gpuMaterialIndices, const PxU16* materialIndices, PxU32 numMaterialIndices);
private:
void compactLostFoundPairs(PxgGpuContactManagers& gpuManagers, const PxU32 numTests, PxU32* touchChangeFlags, PxsContactManagerOutput* cmOutputs);
// Remaps collision-mesh contact data onto the simulation mesh for softbody
// contacts against non-FEM objects.
void softbodyOtherContactApplyCollisionToSimMeshMapping(
PxgDevicePointer<float4> contactsd,
PxgDevicePointer<float4> barycentricsd,
PxgDevicePointer<PxgFemOtherContactInfo> contactInfosd,
PxgDevicePointer<PxU32> totalNumCountsd,
PxgDevicePointer<PxU32> prevNumCountsd);
public:
// Same collision-to-sim-mesh remapping for FEM-vs-FEM contacts.
void softbodyFemContactApplyCollisionToSimMeshMapping(
PxgDevicePointer<float4> barycentrics0d,
PxgDevicePointer<float4> barycentrics1d,
PxgDevicePointer<PxgFemFemContactInfo> contactInfosd,
PxgDevicePointer<PxU32> totalNumCountsd,
PxgDevicePointer<PxU32> prevNumCountsd, bool isSelfCollision, bool isCloth);
private:
// Internal helpers for merging newly-registered pairs into the main arrays.
void adjustNpIndices(PxgNewContactManagers& newContactManagers, PxPinnedArray<PxgContactManagerInput>& itMainInputs,
PxPinnedArray<PxsContactManager*>& itCms, PxPinnedArray<const Sc::ShapeInteraction*>& itSIs,
PxFloatArrayPinned& itR, PxPinnedArray<PxsTorsionalFrictionData>& itTor,
PxPinnedArray<PxgContactManagerInput>& itNewInputs,
PxPinnedArray<PxsContactManager*>& itNewCms,
PxPinnedArray<const Sc::ShapeInteraction*>& itNewSIs, PxFloatArrayPinned& itNewR,
PxPinnedArray<PxsTorsionalFrictionData>& itNewTor);
void registerContactManagerInternal(PxsContactManager* cm, const Sc::ShapeInteraction* shapeInteraction, PxgContactManagerInput* input, PxsContactManagerOutput& output, PxgNewContactManagers& newContactManagers);
void unregisterContactManagerInternal(PxsContactManager* cm, PxInt32ArrayPinned& removedIndices, PxgNewContactManagers& newContactManagers);
void refreshContactManagerInternal(PxsContactManager* cm, PxsContactManagerOutput* cmOutputs, const Sc::ShapeInteraction** shapeInteractions, PxgContactManagerInput& input, PxgNewContactManagers& newContactManagers,
PxInt32ArrayPinned& removedIndices);
template <typename Manifold>
void prepareTempContactManagers(PxgGpuContactManagers& gpuManagers, PxgNewContactManagers& newManagers, Manifold* emptyManifold);
void prepareTempContactManagers(PxgGpuContactManagers& gpuManagers, PxgNewContactManagers& newManagers);
void removeLostPairsInternal(PxInt32ArrayPinned& removedIndices, PxgContactManagers& contactManagers);
void prepareTempContactManagersInternal(PxgNewContactManagers& newManagers, Cm::FlushPool& flushPool, PxBaseTask* continuation);
void drawContacts(PxRenderOutput& out, CUdeviceptr contactsd, CUdeviceptr normalPensd, const PxU32 numContacts);
// Material bookkeeping shared by the typed register/update/unregister methods.
template <typename MaterialCore, typename MaterialData>
PxU32 registerMaterialInternal(const MaterialCore& materialCore, RefcountedRecordsMap* materialsMap, PxgMaterialManager& materialManager);
template <typename MaterialCore, typename MaterialData >
void updateMaterialInternal(const MaterialCore& materialCore, RefcountedRecordsMap* materialsMap, PxgMaterialManager& materialManager);
// Thin typed wrapper: forwards to the untyped internal with the material's byte size.
template <typename MaterialData>
PxU16 mapMaterialIndex(PxU16 sdkMaterialIndex, RefcountedRecordsMap* materialsMap, PxgMaterialManager& materialManager)
{
return mapMaterialIndexInternal(sdkMaterialIndex, materialsMap, materialManager, sizeof(MaterialData));
}
PxU16 mapMaterialIndexInternal(PxU16 sdkMaterialIndex, RefcountedRecordsMap* materialsMap,
PxgMaterialManager& materialManager, PxU32 materialDataByteSize);
template <typename MaterialCore>
void unregisterMaterialInternal(const MaterialCore& materialCore, RefcountedRecordsMap* materialsMap, PxgMaterialManager& materialManager);
void updateContactDistance(const PxReal* contactDistances, const PxU32 numContactDistance);
};
// Specialization for PBD particle materials: translates each SDK-side material
// index in materialIndices into its GPU-side index, writing the result into
// gpuMaterialIndices (one output per input entry).
template <>
PX_FORCE_INLINE void PxgGpuNarrowphaseCore::mapMaterialIndices<PxsPBDMaterialData>(PxU16* gpuMaterialIndices,
	const PxU16* materialIndices, PxU32 numMaterialIndices)
{
	RefcountedRecordsMap* const records = mPBDMaterialsMap;
	PxgPBDMaterialManager& pbdManager = mGpuPBDMaterialManager;
	for(PxU32 idx = 0; idx < numMaterialIndices; idx++)
		gpuMaterialIndices[idx] = mapMaterialIndex<PxsPBDMaterialData>(materialIndices[idx], records, pbdManager);
}
}
#endif

View File

@@ -0,0 +1,71 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_NP_KERNEL_INDICES_H
#define PXG_NP_KERNEL_INDICES_H
#define MIDPHASE_WARPS_PER_BLOCK 2
#define NP_TRIMESH_WARPS_PER_BLOCK 2
#define CORRELATE_WARPS_PER_BLOCK 2
#define PS_MIDPHASE_COLLISION_WAPRS_PER_BLOCK 1
#define SB_REFIT_WAPRS_PER_BLOCK 32
namespace physx
{
// Per-kernel thread-block sizes for the narrowphase launches (presumably
// threads per CUDA block — confirm against the kernel launch sites).
struct PxgNarrowPhaseBlockDims
{
enum
{
REMOVE_CONTACT_MANAGERS = 512,
COMPACT_LOST_FOUND_PAIRS = 512,
FINISH_CONTACTS = 128,
EARLY_OUT_KERNEL = 128,
COLLIDE_KERNEL = 128,
INITIALIZE_MANIFOLDS = 512,
COMPRESS_CONTACT = 256
};
};
// Per-kernel grid sizes for the narrowphase launches (presumably the number of
// blocks per launch, paired with PxgNarrowPhaseBlockDims above — confirm at
// the launch sites).
struct PxgNarrowPhaseGridDims
{
enum
{
REMOVE_CONTACT_MANAGERS = 32,
COMPACT_LOST_FOUND_PAIRS = 32,
EARLY_OUT_KERNEL = 32,
COLLIDE_KERNEL = 32,
INITIALIZE_MANIFOLDS = 128,
COMPRESS_CONTACT = 32,
};
};
}
#endif

View File

@@ -0,0 +1,305 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_NPHASE_IMPLEMENTATION_CONTEXT_H
#define PXG_NPHASE_IMPLEMENTATION_CONTEXT_H
#include "CmTask.h"
#include "foundation/PxMutex.h"
#include "PxvNphaseImplementationContext.h"
#include "PxgNarrowphaseCore.h"
#include "PxgSimulationCore.h"
#include "foundation/PxHashMap.h"
#include "PxsContactManagerState.h"
#include "PxgContactManager.h"
#include "PxgPersistentContactManifold.h"
#include "PxgHeapMemAllocator.h"
namespace physx
{
struct PxcNpWorkUnit;
class PxgNphaseImplementationContext;
class PxgGpuContext;
class PxgParticleSystemCore;
class PxsKernelWranglerManager;
namespace Bp
{
class BpNonVirtualMemAllocator;
}
namespace Dy
{
class Context;
}
// Shared base for the GPU discrete narrowphase update tasks: holds the
// timestep and a pointer back to the owning narrowphase context, and provides
// the per-bucket processContactManagers work routine.
class PxgCMGpuDiscreteUpdateBase: public Cm::Task
{
private:
PX_NOCOPY(PxgCMGpuDiscreteUpdateBase)
public:
// context: owning narrowphase context; not owned by this task.
PxgCMGpuDiscreteUpdateBase(PxgNphaseImplementationContext* context):
Cm::Task(0), mDt(FLT_MAX), mContext(context)
{
}
virtual ~PxgCMGpuDiscreteUpdateBase()
{}
// Sets the timestep used by the next run of the task.
void setDt(PxReal dt)
{
mDt = dt;
}
// Runs the discrete update for one bucket of contact managers.
void processContactManagers(PxgContactManagers& managers, PxgGpuContactManagers& managersGPU, GPU_BUCKET_ID::Enum type);
protected:
PxReal mDt; //we could probably retrieve from context to save space?
PxgNphaseImplementationContext* mContext;
};
class PxgCMGpuDiscreteUpdateFallbackTask : public Cm::Task
{
public:
PxgCMGpuDiscreteUpdateFallbackTask(PxgNphaseImplementationContext* context) :
Cm::Task(0), mDt(FLT_MAX), mContext(context)
{
}
virtual ~PxgCMGpuDiscreteUpdateFallbackTask()
{}
void setDt(PxReal dt)
{
mDt = dt;
}
virtual const char* getName() const
{
return "PxgCMGpuDiscreteUpdateFallbackTask";
}
void runInternal();
protected:
PxReal mDt;
PxgNphaseImplementationContext* mContext;
};
// First-pass GPU discrete narrowphase update task. Holds (non-owning, except
// for the reference count on the continuation) pointers to the first-pass
// narrowphase continuation and the post-broadphase task.
class PxgCMGpuDiscreteUpdateTask : public PxgCMGpuDiscreteUpdateBase
{
	PxBaseTask* mFirstPassNpContinuation;
	PxBaseTask* mPostBroadPhaseTask;
public:
	// Both pointer members are initialized in the initializer list so neither
	// is ever left indeterminate.
	PxgCMGpuDiscreteUpdateTask(PxgNphaseImplementationContext* context) : PxgCMGpuDiscreteUpdateBase(context), mFirstPassNpContinuation(NULL), mPostBroadPhaseTask(NULL)
	{
	}
	virtual ~PxgCMGpuDiscreteUpdateTask()
	{
	}
	virtual void setPostBroadPhaseTask(PxBaseTask* postBroadPhaseTask)
	{
		mPostBroadPhaseTask = postBroadPhaseTask;
	}
	// Drops the reference held on the first-pass continuation (if any) before
	// releasing the base task.
	virtual void release()
	{
		if(mFirstPassNpContinuation)
			mFirstPassNpContinuation->removeReference();
		PxgCMGpuDiscreteUpdateBase::release();
	}
	void setFirstPassContinuation(PxBaseTask* firstPassContinuation) { mFirstPassNpContinuation = firstPassContinuation; }
	void runInternal();
	virtual const char* getName() const
	{
		return "PxgNphaseImplementationContext.firstContactManagerDiscreteUpdate";
	}
};
// Second-pass GPU discrete narrowphase update task; all state lives in the
// shared base class. (A dangling empty "protected:" label was removed.)
class PxgCMGpuDiscreteSecondPassUpdateTask : public PxgCMGpuDiscreteUpdateBase
{
public:
	// context: owning narrowphase context; not owned by this task.
	PxgCMGpuDiscreteSecondPassUpdateTask(PxgNphaseImplementationContext* context): PxgCMGpuDiscreteUpdateBase(context)
	{
	}
	virtual ~PxgCMGpuDiscreteSecondPassUpdateTask()
	{}
	void runInternal();
	virtual const char* getName() const
	{
		return "PxgNphaseImplementationContext.secondContactManagerDiscreteUpdate";
	}
};
// GPU implementation of the narrowphase context interface. Owns the GPU
// narrowphase core, bridges to a CPU fallback implementation for unsupported
// contact-manager pairs, and schedules the first/second-pass update tasks.
class PxgNphaseImplementationContext : public PxvNphaseImplementationContext
{
private:
PX_NOCOPY(PxgNphaseImplementationContext)
public:
PxgNphaseImplementationContext(PxsContext& context, PxsKernelWranglerManager* gpuKernelWrangler, PxvNphaseImplementationFallback* fallbackForUnsupportedCMs,
const PxGpuDynamicsMemoryConfig& gpuDynamicsConfig, void* contactStreamBase, void* patchStreamBase, void* forceAndIndiceStreamBase,
PxBoundsArrayPinned& bounds, IG::IslandSim* islandSim,
physx::Dy::Context* dynamicsContext, PxgHeapMemoryAllocatorManager* heapMemoryManager,
bool useGPUNP);
virtual ~PxgNphaseImplementationContext();
virtual void destroy() PX_OVERRIDE PX_FINAL;
// --- Per-frame update pipeline -----------------------------------------------
virtual void updateContactManager(PxReal dt, bool hasContactDistanceChanged, PxBaseTask* continuation, PxBaseTask* firstPassNpContinuation, Cm::FanoutTask* updateBoundAndShapeTask) PX_OVERRIDE PX_FINAL;
// Runs the CPU fallback path for pairs the GPU does not handle.
void updateContactManagersFallback(PxReal dt, PxBaseTask* continuation);
virtual void postBroadPhaseUpdateContactManager(PxBaseTask* continuation) PX_OVERRIDE PX_FINAL;
virtual void secondPassUpdateContactManager(PxReal dt, PxBaseTask* continuation) PX_OVERRIDE PX_FINAL;
virtual void fetchUpdateContactManager() PX_OVERRIDE PX_FINAL;
virtual void preallocateNewBuffers(PxU32 nbNewPairs, PxU32 maxIndex) PX_OVERRIDE PX_FINAL;
// --- Contact-manager lifecycle -----------------------------------------------
virtual void registerContactManager(PxsContactManager* cm, const Sc::ShapeInteraction* shapeInteraction, PxI32 touching, PxU32 numPatches) PX_OVERRIDE PX_FINAL;
virtual void unregisterContactManager(PxsContactManager* cm) PX_OVERRIDE PX_FINAL;
virtual void refreshContactManager(PxsContactManager* cm) PX_OVERRIDE PX_FINAL;
// --- Shape / aggregate registration ------------------------------------------
virtual void registerShape(const PxNodeIndex& nodeIndex, const PxsShapeCore& shapeCore, const PxU32 transformCacheID, PxActor* actor, const bool isFemCloth) PX_OVERRIDE PX_FINAL;
virtual void updateShapeMaterial(const PxsShapeCore& shapeCore) PX_OVERRIDE PX_FINAL;
virtual void unregisterShape(const PxsShapeCore& shapeCore, const PxU32 transformCacheID, const bool isFemCloth) PX_OVERRIDE PX_FINAL;
virtual void registerAggregate(const PxU32 transformCacheID) PX_OVERRIDE PX_FINAL;
// --- Material registration (rigid, FEM surface/volume, PBD particle) ---------
virtual void registerMaterial(const PxsMaterialCore& materialCore) PX_OVERRIDE PX_FINAL;
virtual void updateMaterial(const PxsMaterialCore& materialCore) PX_OVERRIDE PX_FINAL;
virtual void unregisterMaterial(const PxsMaterialCore& materialCore) PX_OVERRIDE PX_FINAL;
virtual void registerMaterial(const PxsDeformableSurfaceMaterialCore& materialCore) PX_OVERRIDE PX_FINAL;
virtual void updateMaterial(const PxsDeformableSurfaceMaterialCore& materialCore) PX_OVERRIDE PX_FINAL;
virtual void unregisterMaterial(const PxsDeformableSurfaceMaterialCore& materialCore) PX_OVERRIDE PX_FINAL;
virtual void registerMaterial(const PxsDeformableVolumeMaterialCore& materialCore) PX_OVERRIDE PX_FINAL;
virtual void updateMaterial(const PxsDeformableVolumeMaterialCore& materialCore) PX_OVERRIDE PX_FINAL;
virtual void unregisterMaterial(const PxsDeformableVolumeMaterialCore& materialCore) PX_OVERRIDE PX_FINAL;
virtual void registerMaterial(const PxsPBDMaterialCore& materialCore) PX_OVERRIDE PX_FINAL;
virtual void updateMaterial(const PxsPBDMaterialCore& materialCore) PX_OVERRIDE PX_FINAL;
virtual void unregisterMaterial(const PxsPBDMaterialCore& materialCore) PX_OVERRIDE PX_FINAL;
// --- Results access -----------------------------------------------------------
virtual PxsContactManagerOutput& getNewContactManagerOutput(PxU32 npId) PX_OVERRIDE PX_FINAL;
virtual void appendContactManagers() PX_OVERRIDE PX_FINAL;
virtual PxsContactManagerOutputIterator getContactManagerOutputs() PX_OVERRIDE PX_FINAL;
// Contact modification/locking are delegated to the CPU fallback implementation.
virtual void setContactModifyCallback(PxContactModifyCallback* callback) PX_OVERRIDE PX_FINAL { mFallbackForUnsupportedCMs->setContactModifyCallback(callback); }
void updateNarrowPhaseShape();
void mergeContactManagers(PxBaseTask* continuation);
void processResults();
virtual void startNarrowPhaseTasks() PX_OVERRIDE PX_FINAL;
PxReal getToleranceLength();
PxgGpuNarrowphaseCore* getGpuNarrowphaseCore() { return mGpuNarrowphaseCore; }
PxgSimulationCore* getSimulationCore() { return mGpuSimulationCore; }
void setSimulationCore(PxgSimulationCore* sc) { mGpuSimulationCore = sc; }
virtual void acquireContext() PX_OVERRIDE PX_FINAL;
virtual void releaseContext() PX_OVERRIDE PX_FINAL;
virtual void lock() PX_OVERRIDE PX_FINAL { mFallbackForUnsupportedCMs->lock();}
virtual void unlock() PX_OVERRIDE PX_FINAL { mFallbackForUnsupportedCMs->unlock(); }
PxsContext& getContext() { return mContext; }
virtual PxsContactManagerOutputCounts* getLostFoundPatchOutputCounts() PX_OVERRIDE PX_FINAL;
virtual PxsContactManager** getLostFoundPatchManagers() PX_OVERRIDE PX_FINAL;
virtual PxU32 getNbLostFoundPatchManagers() PX_OVERRIDE PX_FINAL;
virtual PxsContactManagerOutput* getGPUContactManagerOutputBase() PX_OVERRIDE PX_FINAL;
virtual PxReal* getGPURestDistances() PX_OVERRIDE PX_FINAL;
virtual Sc::ShapeInteraction** getGPUShapeInteractions() PX_OVERRIDE PX_FINAL;
virtual PxsTorsionalFrictionData* getGPUTorsionalData() PX_OVERRIDE PX_FINAL;
protected:
void prepareTempContactManagers(PxBaseTask* continuation);
void removeLostPairs();
void registerContactManagerInternal(PxsContactManager* cm, const PxcNpWorkUnit& workUnit, PxU32 patchCount, PxI32 touching,
const Sc::ShapeInteraction* shapeInteraction, GPU_BUCKET_ID::Enum bucketId);
PxvNphaseImplementationFallback* mFallbackForUnsupportedCMs;
//these arrays logically belong to the tasks, but we keep them here to avoid reallocations each frame
friend class PxgCMGpuDiscreteUpdateBase;
friend class PxgCMGpuDiscreteUpdateTask;
friend class PxgCMGpuDiscreteSecondPassUpdateTask;
friend class PxgCMGpuDiscreteUpdateConvexTriMeshTask;
friend class PxgCMGpuDiscreteSecondPassUpdateConvexTriMeshTask;
PxgCMGpuDiscreteUpdateTask mUpdateCMsFirstPassTask;
PxgCMGpuDiscreteSecondPassUpdateTask mUpdateCMsSecondPassTask;
PxgCMGpuDiscreteUpdateFallbackTask mUpdateFallbackPairs;
PxPinnedArray<PxsContactManagerOutput> mContactManagerOutputs;
// Per-bucket bookkeeping of registered GPU pairs.
PxBitMap mGpuContactManagerBitMap[GPU_BUCKET_ID::eCount];
PxU32 mRecordedGpuPairCount[GPU_BUCKET_ID::eCount];
PxU32 mNbPairCount[GPU_BUCKET_ID::eCount];
PxgGpuNarrowphaseCore* mGpuNarrowphaseCore;
PxgSimulationCore* mGpuSimulationCore;
PxgGpuContext* mGpuDynamicContext;
PxU32 mTotalNbPairs;
PxBoundsArrayPinned& mBounds;
bool mHasContactDistanceChanged;
bool mUseGPUBP;
PxU32 mMaxPatches;
};
}
#endif

View File

@@ -0,0 +1,98 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_PERSISTENT_CONTACT_MANIFOLD_H
#define PXG_PERSISTENT_CONTACT_MANIFOLD_H
#include "cutil_math.h"
#include "AlignedTransform.h"
#define PXG_MAX_PCM_CONTACTS 4
#define PXG_MULTIMANIFOLD_MAX_SUBMANIFOLDS 4
#define PXG_SUBMANIFOLD_MAX_CONTACTS 6
namespace physx
{
// GPU persistent contact manifold (PCM) for a shape pair: up to
// PXG_MAX_PCM_CONTACTS cached contacts plus the cached relative pose used to
// decide whether the manifold can be reused.
struct PX_ALIGN_PREFIX(16) PxgPersistentContactManifold
{
float4 mLocalNormal_PenW[PXG_MAX_PCM_CONTACTS]; //local normal + penetration
float4 mLocalContactA[PXG_MAX_PCM_CONTACTS]; //local contact in body A's space
float4 mLocalContactB[PXG_MAX_PCM_CONTACTS]; //local contact in body B's space
float4 mRelativePos; //Cached relative transforms
PxAlignedQuat mQuatA; //Cached body A's quaternion
PxAlignedQuat mQuatB; //Cached body B's quaternion
PxU32 mNbContacts; //NbContacts
PxU32 mNbWarmStartPoints; //Nb warm start points
PxU32 mWarmStartA; //GJK warm-start mask for shape A
PxU32 mWarmStartB; //GJK warm-start mask for shape B
// Resets the manifold; mRelativePos is set to PX_MAX_REAL so the cached pose
// never matches and the next pass rebuilds contacts.
// NOTE(review): mWarmStartA/mWarmStartB are left untouched — confirm intentional.
PX_FORCE_INLINE void clear()
{
mNbContacts = 0;
mNbWarmStartPoints = 0;
mRelativePos = make_float4(PX_MAX_REAL, PX_MAX_REAL, PX_MAX_REAL, PX_MAX_REAL);
mQuatA = PxAlignedQuat(0.f, 0.f, 0.f, 1.f);
mQuatB = PxAlignedQuat(0.f, 0.f, 0.f, 1.f);
}
//KS - TODO - extend and add functionality as is required...
}
PX_ALIGN_SUFFIX(16);
// One GPU contact point: positions on both bodies, the contact normal with its
// penetration depth, and the triangle index for mesh contacts.
struct PX_ALIGN_PREFIX(16) PxgContact
{
PxVec3 pointA;
PxVec3 pointB;
PxVec3 normal;
PxReal penetration;
PxU32 triIndex;
PxU32 pad;
} // 48 due to padding
PX_ALIGN_SUFFIX(16);
// GPU multi-manifold: up to PXG_MULTIMANIFOLD_MAX_SUBMANIFOLDS sub-manifolds of
// up to PXG_SUBMANIFOLD_MAX_CONTACTS contacts each, plus the cached relative
// transform used to decide whether the manifolds can be reused.
// The trailing byte-offset comments track the running struct size.
struct PX_ALIGN_PREFIX(16) PxgPersistentContactMultiManifold
{
PxgContact mContacts[PXG_MULTIMANIFOLD_MAX_SUBMANIFOLDS][PXG_SUBMANIFOLD_MAX_CONTACTS]; // 1152
PxU32 mNbContacts[PXG_MULTIMANIFOLD_MAX_SUBMANIFOLDS]; // 1168
PxAlignedTransform mRelativeTransform; // 1200
PxU32 mNbManifolds; // 1216 due to padding
// Resets the multi-manifold; the cached position is set to PX_MAX_REAL so the
// next pass never matches the cached pose and rebuilds contacts.
PX_FORCE_INLINE void clear()
{
mNbManifolds = 0;
mRelativeTransform.p = make_float4(PX_MAX_REAL, PX_MAX_REAL, PX_MAX_REAL, PX_MAX_REAL);
mRelativeTransform.q = PxAlignedQuat(0.f, 0.f, 0.f, 1.f);
}
}
PX_ALIGN_SUFFIX(16);
}
#endif

View File

@@ -0,0 +1,163 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_SHAPE_MANAGER_H
#define PXG_SHAPE_MANAGER_H
#include "foundation/PxPreprocessor.h"
#include "PxSceneDesc.h"
#include "foundation/PxPinnedArray.h"
#include "CmIDPool.h"
#include "PxgCudaBuffer.h"
#include "foundation/PxBitMap.h"
#include "PxsMaterialCore.h"
#include "PxsDeformableSurfaceMaterialCore.h"
#include "PxsDeformableVolumeMaterialCore.h"
#include "PxsPBDMaterialCore.h"
#include "PxgConvexConvexShape.h"
namespace physx
{
class PxCudaContext;
class PxgHeapMemoryAllocatorManager;
class PxgCopyManager;
class PxNodeIndex;
// Manages the GPU mirror of all shape data: host pinned staging arrays,
// device-side buffers, dirty-state tracking, and the remap tables between
// shape ids, transform-cache ids, node indices and actors.
class PxgShapeManager
{
public:
	PxgShapeManager(PxgHeapMemoryAllocatorManager* heapManager);
	~PxgShapeManager(){}

	// Initializes the device-side buffers on the given CUDA stream.
	void initialize(PxCudaContext* cudaContext, CUstream stream);
	// Pushes host-to-device CopyDescs for dirty shape data to the PxgCopyManager.
	void scheduleCopyHtoD(PxgCopyManager& copyManager, PxCudaContext* cudaContext, CUstream stream);

	// Registers a shape and returns its id (allocated from mIdPool).
	PxU32 registerShape(PxgShape& shape);
	// Associates a transform-cache entry with a node index and actor;
	// aggregate marks shapes belonging to an aggregate.
	void registerShapeInstance(const PxNodeIndex& nodeIndex, const PxU32 transformCacheID, PxActor* actor, bool aggregate = false);
	void unregisterShape(const PxU32 shapeId);
	void unregisterShapeInstance(const PxU32 transformCacheID);
	// NOTE: fixed parameter-name typo (was "materalIndex"); declaration-only
	// rename, no caller impact.
	void updateShapeMaterial(const PxU32 materialIndex, const PxU32 id);
	// Recycles ids whose release was deferred until it is safe to reuse them.
	void releaseIDs() { mIdPool.processDeferredIds(); }

	Cm::DeferredIDPool mIdPool;
	PxgHeapMemoryAllocatorManager* mHeapManager;
	PxPinnedArray<PxgShape> mHostShapes;
	PxPinnedArray<PxNodeIndex> mHostShapesRemapTable;//remap table from shape to node index
	PxInt32ArrayPinned mHostShapeIdTable;
	PxPinnedArray<PxActor*> mHostTransformCacheIdToActorTable; // remap table from transformCacheId to PxActor
	PxgCudaBuffer mGpuShapesBuffer;
	PxgCudaBuffer mGpuShapesRemapTableBuffer; // shape to rigid
	PxgCudaBuffer mGpuTransformCacheIdToActorTableBuffer; // transformCacheId to PxActor
	PxgCudaBuffer mGpuRigidIndiceBuffer; // 00011 nodeIndex 0 has shape(0, 2, 3), nodeIndex 1 has shape(1, 4)(it can be either articulation link or rigid body)
	PxgCudaBuffer mGpuShapeIndiceBuffer; // 02314
	PxgCudaBuffer mGpuUnsortedShapeIndicesBuffer; // needed when recomputing the shape to rigids mapping
	PxgCudaBuffer mGpuTempRigidBitIndiceBuffer; //either the lower 32 bits or higher 32 bits of mGpuRigidIndiceBuffer
	PxgCudaBuffer mGpuTempRigidIndiceBuffer;
	PxBitMap mDirtyShapeMap;				// shapes whose data must be re-uploaded
	PxBitMap mDirtyTransformCacheMap;		// transform-cache entries needing re-upload
	bool mResizeRequired;
	bool mTransformCacheResizeRequired;
	PxI32 mMaxShapeId;
	PxI32 mMaxTransformCacheID;
	bool mHasShapeChanged;
	bool mHasShapeInstanceChanged;
private:
	PX_NOCOPY(PxgShapeManager)
};
// Base manager for material data mirrored to the GPU. Materials are stored as
// raw byte records of elemSize bytes in a pinned host array, with dirty
// tracking and a device buffer updated via PxgCopyManager.
class PxgMaterialManager
{
	PX_NOCOPY(PxgMaterialManager)
public:
	// elemSize is the per-material record size; defaults to rigid-body
	// material data (PxsMaterialData).
	PxgMaterialManager(PxgHeapMemoryAllocatorManager* heapManager, const PxU32 elemSize = sizeof(PxsMaterialData));
	virtual ~PxgMaterialManager(){}
	// Pushes host-to-device CopyDescs for dirty materials to the PxgCopyManager.
	virtual void scheduleCopyHtoD(PxgCopyManager& copyManager, PxCudaContext* cudaContext, CUstream stream, const PxU32 elemSize);
	// Copies elemSize bytes of material data in and returns the new material id.
	PxU32 registerMaterial(const PxU8* materialData, const PxU32 elemSize);
	// Overwrites the record for an existing material id and marks it dirty.
	void updateMaterial(const PxU8* materialData, const PxU32 elemSize, const PxU32 id);
	void unregisterMaterial(const PxU32 materialId);
	// Recycles ids whose release was deferred until it is safe to reuse them.
	void releaseIDs() { mIdPool.processDeferredIds(); }
	PxgCudaBuffer mGpuMaterialBuffer;
protected:
	Cm::DeferredIDPool mIdPool;
	PxgHeapMemoryAllocatorManager* mHeapManager;
	PxInt8ArrayPinned mHostMaterial;	// raw material records, elemSize bytes each
	PxBitMap mDirtyMaterialMap;			// material ids needing re-upload
	bool mResizeRequired;
};
// FEM (deformable) material manager; overrides the upload scheduling.
class PxgFEMMaterialManager : public PxgMaterialManager
{
public:
	PxgFEMMaterialManager(PxgHeapMemoryAllocatorManager* heapManager, const PxU32 elemSize);
	virtual void scheduleCopyHtoD(PxgCopyManager& copyManager, PxCudaContext* cudaContext, CUstream stream, const PxU32 elemSize);
};
// Deformable-volume (soft body) materials.
class PxgFEMSoftBodyMaterialManager : public PxgFEMMaterialManager
{
public:
	PxgFEMSoftBodyMaterialManager(PxgHeapMemoryAllocatorManager* heapManager);
};
// Deformable-surface (cloth) materials; fixes the record size to
// PxsDeformableSurfaceMaterialData.
class PxgFEMClothMaterialManager : public PxgMaterialManager
{
public:
	PxgFEMClothMaterialManager(PxgHeapMemoryAllocatorManager* heapManager) :
		PxgMaterialManager(heapManager, sizeof(PxsDeformableSurfaceMaterialData))
	{
	}
	// Reinterprets the raw host byte array as typed cloth material records.
	PxsDeformableSurfaceMaterialData* getCPUMaterialData()
	{
		return reinterpret_cast<PxsDeformableSurfaceMaterialData*>(mHostMaterial.begin());
	}
};
// Position-based-dynamics (particle) materials; fixes the record size to
// PxsPBDMaterialData.
class PxgPBDMaterialManager : public PxgMaterialManager
{
public:
	PxgPBDMaterialManager(PxgHeapMemoryAllocatorManager* heapManager) :
		PxgMaterialManager(heapManager, sizeof(PxsPBDMaterialData))
	{
	}
};
}
#endif

View File

@@ -0,0 +1,137 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef CONVEX_FORMAT_H
#define CONVEX_FORMAT_H
#include "foundation/PxVec3.h"
#include "foundation/PxVec4.h"
#include "foundation/PxBounds3.h"
#include "foundation/PxPlane.h"
#include "foundation/PxMat44.h"
#include "foundation/PxMemory.h"
#include "foundation/PxUtilities.h"
namespace physx
{
// Cooked convex hull data layout: vertices, polygons, edge/vertex adjacency,
// optional hill-climbing acceleration data for large hulls, and internal
// bounds used for margin computation. Pointer members reference cooked data
// stored elsewhere; this struct does not own them.
struct ConvexHullCooked
{
	// Per-vertex valency record: number of neighbors and offset into the
	// adjacency list (see BigConvexRawData::mAdjacentVerts).
	struct Valency
	{
		PxU16 mCount;
		PxU16 mOffset;
	};
	// Acceleration data built only for large hulls: a "gaussmap" support-vertex
	// map plus per-vertex valency/adjacency used for hill climbing.
	struct BigConvexRawData
	{
		// Support vertex map
		PxU16 mSubdiv; // "Gaussmap" subdivision
		PxU16 mNbSamples; // Total #samples in gaussmap PT: this is not even needed at runtime!
		PxU8* mSamples; //mNbSamples * 2 elements
		// Returns the second half of the samples array (the two halves are
		// stored back to back, mNbSamples entries each).
		PX_FORCE_INLINE const PxU8* getSamples2() const
		{
			return mSamples + mNbSamples;
		}
		//~Support vertex map
		// Valencies data
		PxU32 mNbVerts; //!< Number of vertices
		PxU32 mNbAdjVerts; //!< Total number of adjacent vertices ### PT: this is useless at runtime and should not be stored here
		Valency* mValencies; //!< A list of mNbVerts valencies (= number of neighbors) mNbVerts elements
		PxU8* mAdjacentVerts; //!< List of adjacent vertices
		//~Valencies data
	};
	// Inscribed-sphere radius and extents of an internal AABB, used for
	// margin/tolerance computation.
	struct InternalObjectsData
	{
		PxReal mRadius;
		PxReal mExtents[3];
		PX_FORCE_INLINE void reset()
		{
			mRadius = 0.0f;
			mExtents[0] = 0.0f;
			mExtents[1] = 0.0f;
			mExtents[2] = 0.0f;
		}
	};
	// One face of the hull: plane equation plus a slice of the hull's shared
	// vertex-reference array.
	struct HullPolygon
	{
		PxPlane mPlane; //!< Plane equation for this polygon //Could drop 4th elem as it can be computed from any vertex as: d = - p.dot(n);
		PxU16 mVRef8; //!< Offset of vertex references in hull vertex data (CS: can we assume indices are tightly packed and offsets are ascending?? DrawObjects makes and uses this assumption)
		PxU8 mNbVerts; //!< Number of vertices/edges in the polygon
		PxU8 mMinIndex; //!< Index of the polygon vertex that has minimal projection along this plane's normal.
		PX_FORCE_INLINE PxReal getMin(const PxVec3* PX_RESTRICT hullVertices) const //minimum of projection of the hull along this plane normal
		{
			return mPlane.n.dot(hullVertices[mMinIndex]);
		}
		PX_FORCE_INLINE PxReal getMax() const { return -mPlane.d; } //maximum of projection of the hull along this plane normal
	};
	PxBounds3 mAABB;
	PxVec3 mCenterOfMass;
	PxU16 mNbEdges;
	PxU8 mNbHullVertices;
	PxU8 mNbPolygons;
	HullPolygon* mPolygons; //!< Array of mNbPolygons structures
	PxVec3* mVertices; //mNbHullVertices elements
	PxU8* mFacesByEdges8; //mNbEdges * 2 elements
	PxU8* mFacesByVertices8; //mNbHullVertices * 3 elements
	PxU16* mVerticesByEdges16; //mNbEdges * 2 elements
	PxU8* mVertexData8; //G-d knows how many elements
	BigConvexRawData* mBigConvexRawData; //!< Hill climbing data, only for large convexes! else NULL.
	InternalObjectsData mInternal;

	// A hull is GPU-friendly when it fits the GPU narrowphase limits: at most
	// 64 vertices and at most 31 vertices per polygon. Early-exits on the
	// first violation (the original scanned every polygon regardless).
	bool isGpuFriendly()
	{
		if(mNbHullVertices > 64)
			return false;
		for(PxU32 i = 0; i < mNbPolygons; ++i)
		{
			if(mPolygons[i].mNbVerts > 31)
				return false;
		}
		return true;
	}
};
}
#endif

View File

@@ -0,0 +1,306 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __CONVEXNPCOMMON_H__
#define __CONVEXNPCOMMON_H__
#define NUM_TMP_CONTACTS_PER_PAIR 32
#define CVX_TRI_MAX_CONTACTS 5
#define MAX_MESH_MESH_PATCHES 128
// Patches correlation - same normal threshold
// 8 degrees
//#define PATCH_ACCEPTANCE_EPS 0.990268f
// 5 degrees
#define PATCH_ACCEPTANCE_EPS 0.996194698f
// 3 degrees
//#define PATCH_ACCEPTANCE_EPS 0.99862953f
#define CONVEX_TRIMESH_CACHED 0xFFffFFff
// AD: used to specify sizes of shared memory structs
// for collisions involving convexes. Make sure you
// know what you are doing if you change this.
#define CONVEX_MAX_VERTICES_POLYGONS 64
#include "foundation/PxPreprocessor.h"
#include "foundation/PxSimpleTypes.h"
#include "foundation/PxVec3.h"
#include "foundation/PxQuat.h"
#include "foundation/PxTransform.h"
#include "geometry/PxMeshScale.h"
#include "GuBV32.h"
#include "PxgCommonDefines.h"
#include <vector_types.h>
namespace physx
{
// Per-pair record for convex-vs-mesh narrowphase. The trailing numbers in the
// member comments are running byte offsets.
struct ConvexMeshPair
{
	physx::PxTransform aToB; // pB = aToB.transform(pA + normal * separation) 28
	int startIndex; // start index in the ConvexTriContacts and ConvexTriPairs arrays 32
	int count; // the number of triangles in the ConvexTriContacts and ConvexTriPairs 36
	int cmIndex; // index of the original CM, to index into the output buffer and pass on to contact reporting 40
	int roundedStartIndex; // start index in the sorted triangle index array. For each pair, the number of indices 44
							// assigned to is the number of triangles padded to a multiple of 4 and duplicated to have
							// space for the temp buffer for the radix sort. The start index is aligned to a multiple of 4.
	int pad; // pad 48
	uint2 materialIndices; // material index for shape0 and shape1 56
};
// Per-triangle contact normal plus a packed index word:
//   bit 31      -> deferred-contact flag (DeferredContactMask)
//   bits 27..30 -> number of contacts for this triangle
//   bits 0..26  -> CPU mesh triangle index
struct ConvexTriNormalAndIndex
{
	static const PxU32 DeferredContactMask = 0x80000000;
	static const PxU32 NbContactsShift = 27;

	physx::PxVec3 normal; // normal
	int index; // isDeferredContact is in bit 31, nbContacts is in bits 27-30, rest is cpu mesh index for the triangle

	// Extracts the low 27 bits: the triangle index.
	static PX_CUDA_CALLABLE PX_FORCE_INLINE PxU32 getTriangleIndex(PxU32 index)
	{
		return (index & ((1<<NbContactsShift)-1));
	}
	// Extracts bits 27..30: the contact count (deferred flag masked off first).
	static PX_CUDA_CALLABLE PX_FORCE_INLINE PxU32 getNbContacts(PxU32 index)
	{
		return (index & (~DeferredContactMask)) >> NbContactsShift;
	}
	// Non-zero iff bit 31 (the deferred-contact flag) is set.
	static PX_CUDA_CALLABLE PX_FORCE_INLINE PxU32 isDeferred(const PxU32 index)
	{
		return index & DeferredContactMask;
	}
};
// Packed per-triangle contact reference (index into the contact stream).
struct ConvexTriContacts
{
	PxU32 index;
};
// A single convex-vs-triangle contact: point (xyz) + separation (w).
struct ConvexTriContact
{
	float4 contact_sepW;
};
//each convex triangle pairs will generate the intermediate data in case we need to
//do post process of this pair
//each convex triangle pairs will generate the intermediate data in case we need to
//do post process of this pair
struct ConvexTriIntermediateData
{
	// Enums to indicate the barycentric weighting of the contacts. The state
	// is stored in the top 3 bits (eMASK) of gpuTriIndex: eV0..eV2 are vertex
	// regions, eE01..eE12 are edge regions. All edge codes (4,5,6 << 29) have
	// bit 31 set, which is what isEdgeContact() tests; vertex codes do not.
	enum VoronoiState: PxU32
	{
		eV0 = 1u << 29,
		eV1 = 2u << 29,
		eV2 = 3u << 29,
		eE01 = 4u << 29,
		eE02 = 5u << 29,
		eE12 = 6u << 29,
		eMASK = 0xe0000000
	};

	physx::PxU32 gpuTriIndex; //gpu triangle index (low bits) + VoronoiState (top 3 bits)

	// True when the Voronoi state encoded in mask is an edge region (eE01/eE02/eE12).
	static PX_CUDA_CALLABLE PX_FORCE_INLINE bool isEdgeContact(const PxU32 mask)
	{
		return mask & 0x80000000;
	}
};
// The sphere-vs-mesh narrowphase reuses the convex-vs-mesh data layouts verbatim.
typedef ConvexMeshPair SphereMeshPair;
typedef ConvexTriContacts SphereTriContacts;
typedef ConvexTriNormalAndIndex SphereTriNormalAndIndex;
typedef ConvexTriIntermediateData SphereTriIntermediateData;
}
#if PX_CUDA_COMPILER
#include "PxgCommonDefines.h"
namespace physx
{
// Scratch layout for BV32 midphase traversal. Sized to fit the per-warp
// scratch budget (see the compile-time assert below: WARP_SIZE * 16 PxU32s).
PX_ALIGN_PREFIX(16)
struct MidphaseScratch
{
	const float4 * PX_RESTRICT trimeshVerts;		// mesh vertex array
	const uint4 * PX_RESTRICT trimeshTriIndices;	// mesh triangle index array
	PxTransform shapeToMeshNoScale;					// query shape pose in mesh space (scale excluded)
	PxTransform meshToWorld;
	PxMeshScale trimeshScale;
	PxVec3 inflatedExtents;							// query AABB half-extents, inflated by contact distance
	PxVec3 center;									// query AABB center
	PxU32 shape_materialIndex;
	PxU32 trimeshShape_materialIndex;
	//bv32 tree
	const Gu::BV32DataPacked* bv32PackedNodes;
	//stack for traversal
	int sBv32Nodes[320]; //10 depth of the bv32 tree
}PX_ALIGN_SUFFIX(16);
PX_COMPILE_TIME_ASSERT(sizeof(MidphaseScratch) <= WARP_SIZE * 16 * sizeof(PxU32));
}
#include "schlockShared.h"
//0 is primitive, 1 is tetrahedron world space
//0 is primitive, 1 is tetrahedron world space
// Warp-shared scratch for GJK/EPA between a convex primitive (vertex set vA,
// up to CONVEX_MAX_VERTICES_POLYGONS vertices) and a tetrahedron (vB, 4
// world-space vertices), plus the GJK warm-start cache and output.
struct TetCollideScratch
{
	physx::PxVec3 vA[CONVEX_MAX_VERTICES_POLYGONS];	// primitive vertices
	physx::PxVec3 vB[4];							// tetrahedron vertices (world space)
	schlock::GjkCachedData cachedData;				// GJK warm-start cache
	schlock::GjkOutput gjkOutput;					// shared GJK/EPA result
	physx::PxQuat rot0;								// primitive mesh-scale rotation
	physx::PxVec3 scale0;							// primitive mesh scale
	physx::PxReal contactDistance;
	physx::PxReal inSphereRadius0;					// inscribed-sphere radius, shape 0
	physx::PxReal inSphereRadius1;					// inscribed-sphere radius, shape 1
	physx::PxVec3 searchDir;						// initial GJK search direction
	physx::PxU8 nbVertices0;
	physx::PxU8 nbVertices1;
	physx::PxU32 tetIndex0;
	physx::PxU32 tetIndex1;

	// Warp-cooperative reset: lane 0 invalidates the GJK cache, then all lanes
	// synchronize so every lane observes the cleared state.
	__device__ void Clear()
	{
		if (threadIdx.x == 0)
			cachedData.size = 0;
		__syncwarp();
	}
};
#include "epa.cuh"
#include "gjk.cuh"
// Warp-cooperative GJK/EPA collision between the vertex sets staged in
// ss_scratch (vA = primitive, vB = tetrahedron; see TetCollideScratch).
// Returns eDISTANT when the shapes are farther apart than contactDistance;
// otherwise runs EPA when GJK reports overlap (or a suspect witness
// direction) and returns the EPA result. The __syncwarp() calls order the
// shared-memory writes of gjkOutput/cachedData across the warp — do not
// reorder statements around them.
__device__ static schlock::GjkResult::Enum tetPrimitivesCollide2(
	squawk::EpaScratch& ss_epa_scratch,
	TetCollideScratch& ss_scratch,
	const physx::PxU32 globalWarpIndex)	// NOTE(review): currently unused — presumably kept for debugging/profiling hooks
{
	typedef schlock::GjkResult GjkResult;
	const physx::PxReal convergenceRatio = 1 - 0.000225f;
	assert(ss_scratch.nbVertices0 <= 64);
	assert(ss_scratch.nbVertices1 <= 64);

	// Seed GJK with the cached search direction when available; otherwise (1,0,0).
	physx::PxVec3 initialDir(1, 0, 0);
	physx::PxVec3 aToB_p = ss_scratch.searchDir; // ss_scratch.aToB.p;
	if (aToB_p.magnitudeSquared() > 0)
		initialDir = aToB_p; // GJK's initial start dir is usually from the warm cache => lazy normalize inside GJK

	GjkResult::Enum gjkResult = squawk::gjk(
		ss_scratch.vA, ss_scratch.nbVertices0,
		ss_scratch.vB, ss_scratch.nbVertices1,
		initialDir,
		ss_scratch.contactDistance,
		convergenceRatio,
		ss_scratch.gjkOutput, ss_scratch.cachedData);
	__syncwarp();

	if (gjkResult == GjkResult::eDISTANT)
	{
		return GjkResult::eDISTANT;
	}

	bool separated = (gjkResult == GjkResult::eCLOSE);
	// "Anomaly": GJK claims separation but the witness direction disagrees
	// with the closest-point direction — fall through to EPA to resolve.
	bool anomaly = separated && ss_scratch.gjkOutput.closestPointDir.dot(ss_scratch.gjkOutput.direction) < 0.999f;
	GjkResult::Enum result = GjkResult::eCLOSE;
	if (!separated || anomaly)
	{
		GjkResult::Enum epaResult = squawk::epa(ss_epa_scratch,
			ss_scratch.vA, ss_scratch.nbVertices0,
			ss_scratch.vB, ss_scratch.nbVertices1,
			&(ss_scratch.cachedData),
			convergenceRatio,
			0.5f * (ss_scratch.inSphereRadius0 + ss_scratch.inSphereRadius1),
			ss_scratch.gjkOutput);
		//separated = epaResult == GjkResult::eCLOSE;
		result = epaResult;
		__syncwarp();
		if (ss_scratch.gjkOutput.degenerate)
		{
			//we need to re-run gjk epa with other configurations
			ss_scratch.cachedData.size = 0;
			//we need to re-run gjk epa with other configurations
			initialDir = physx::PxVec3(0.f, 1.f, 0.f);
			// NOTE(review): this inner gjkResult shadows the outer one and its
			// value is never read — EPA is re-run unconditionally after the
			// retry GJK (presumably to rebuild cachedData); confirm intent.
			GjkResult::Enum gjkResult = squawk::gjk(
				ss_scratch.vA, ss_scratch.nbVertices0,
				ss_scratch.vB, ss_scratch.nbVertices1,
				initialDir,
				ss_scratch.contactDistance,
				convergenceRatio,
				ss_scratch.gjkOutput, ss_scratch.cachedData);
			__syncwarp();
			GjkResult::Enum epaResult = squawk::epa(ss_epa_scratch,
				ss_scratch.vA, ss_scratch.nbVertices0,
				ss_scratch.vB, ss_scratch.nbVertices1,
				&(ss_scratch.cachedData),
				convergenceRatio,
				0.5f * (ss_scratch.inSphereRadius0 + ss_scratch.inSphereRadius1),
				ss_scratch.gjkOutput);
			//separated = epaResult == GjkResult::eCLOSE;
			result = epaResult;
		}
		__syncwarp();
	}
	return result;
}
#endif
#endif

View File

@@ -0,0 +1,424 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef CUDA_NP_COMMON_H
#define CUDA_NP_COMMON_H
#include "foundation/PxVec3.h"
#include <stdio.h>
#include "convexFormat.h"
#include "cutil_math.h"
#include "PxgCommonDefines.h"
#include "PxgPersistentContactManifold.h"
#include "assert.h"
#define CONVEX_PLANE_WARPS_PER_BLOCK 4
#define GJK_EPA_WARPS_PER_BLOCK 1
#define NUM_TMP_CONTACTS_PER_PAIR 32
//The smallest epsilon we will permit (scaled by PxTolerancesScale.length)
#define PCM_WITNESS_POINT_LOWER_EPS 1e-2f
//The largest epsilon we will permit (scaled by PxTolerancesScale.length)
#define PCM_WITNESS_POINT_UPPER_EPS 5e-2f
namespace physx
{
struct PxgPersistentContactManifold;
struct PxgPersistentContactMultiManifold;
struct PxgContactManagerInput;
struct PxsContactManagerOutput;
class PxsContactManager;
struct PxsTorsionalFrictionData;
namespace Sc
{
class ShapeInteraction;
}
struct PxgPairManagementData
{
PxgContactManagerInput* mContactManagerInputData;
PxsContactManagerOutput* mContactManagerOutputData;
PxsContactManager** mCpuContactManagerMapping;
Sc::ShapeInteraction** mShapeInteractions;
PxReal* mRestDistances;
PxsTorsionalFrictionData* mTorsionalData;
PxU32* mTempAccumulator;
PxU32* mBlockSharedAccumulator;
PxU32* mRemoveIndices;
PxU32 mNbPairs;
PxU32 mNbToRemove;
void* mPersistentContactManagers;// either PxgPersistentContactManifold or PxgPersistentMultiManagementData
PxgPairManagementData() : mContactManagerInputData(NULL), mContactManagerOutputData(NULL), mCpuContactManagerMapping(NULL), mShapeInteractions(NULL),
mRestDistances(NULL), mTorsionalData(NULL), mTempAccumulator(NULL), mBlockSharedAccumulator(NULL), mNbPairs(0)
{
}
};
// Byte counters for the patch/contact/force output streams plus an overflow
// flag word, updated atomically from device code.
struct PX_ALIGN_PREFIX(16) PxgPatchAndContactCounters
{
	// Bit flags ORed into overflowError when an output buffer runs out of space.
	enum OverflowError
	{
		NO_OVERFLOW = 0,
		CONTACT_BUFFER_OVERFLOW = 1 << 0,
		FORCE_BUFFER_OVERFLOW = 1 << 1,
		PATCH_BUFFER_OVERFLOW = 1 << 2
	};

	PxU32 patchesBytes;			// bytes written to the patch stream
	PxU32 contactsBytes;		// bytes written to the contact stream
	PxU32 forceAndIndiceBytes;	// bytes written to the force/index stream
	PxU32 overflowError;		// ORed OverflowError flags

	// Atomically ORs the error flag in (device compilation only; a no-op
	// when compiled for host, hence the PX_UNUSED).
	__device__
	void setOverflowError(const OverflowError& err)
	{
		PX_UNUSED(err);
#if __CUDA_ARCH__
		atomicOr(&overflowError, PxU32(err));
#endif
	}

	__host__ __device__
	PxU32 getOverflowError() const
	{
		return overflowError;
	}

} PX_ALIGN_SUFFIX(16);
}
// Unpacks a packed 32-bit count/offset word into a Valency record:
// high 16 bits -> mCount, low 16 bits -> mOffset (inverse of valencyToPxU32).
__host__ __device__ inline
physx::ConvexHullCooked::Valency u32ToValency(physx::PxU32 countOffset)
{
	physx::ConvexHullCooked::Valency result;
	result.mOffset = u32Low(countOffset);
	result.mCount = u32High(countOffset);
	return result;
}
// Packs a Valency record back into a 32-bit word: mCount in the high 16 bits,
// mOffset in the low 16 bits (inverse of u32ToValency).
__host__ __device__ inline
physx::PxU32 valencyToPxU32(const physx::ConvexHullCooked::Valency& v)
{
	return merge(v.mCount, v.mOffset);
}
// Accessors for 32-bit words that pack two or three cooked-hull fields.
// Layout convention: one 16-bit field in the high half (u32High), and either
// a 16-bit field or two 8-bit fields (u16High/u16Low) in the low half.

// Low byte of the low half-word.
// NOTE(review): returns PxU8 while u16Low presumably yields 16 bits — the
// implicit truncation assumes the value fits in 8 bits; verify at pack time.
__host__ __device__ inline
physx::PxU8 getNbAdjVerts(physx::PxU32 val)
{
	return u16Low(u32Low(val));
}

// High half-word: edge count.
__host__ __device__ inline
physx::PxU16 getNbEdges(physx::PxU32 val)
{
	return u32High(val);
}

// Low byte of the low half-word: polygon count.
__host__ __device__ inline
physx::PxU8 getNbPolygons(physx::PxU32 val)
{
	return u16Low(u32Low(val));
}

// High half-word: polygon vertex-reference offset (HullPolygon::mVRef8).
__host__ __device__ inline
physx::PxU16 getVRef8(physx::PxU32 val)
{
	return u32High(val);
}

// High byte of the low half-word: polygon vertex count (HullPolygon::mNbVerts).
__host__ __device__ inline
physx::PxU8 getNbVerts(physx::PxU32 val)
{
	return u16High(u32Low(val));
}

// Low byte of the low half-word: min-projection vertex index (HullPolygon::mMinIndex).
__host__ __device__ inline
physx::PxU8 getMinIndex(physx::PxU32 val)
{
	return u16Low(u32Low(val));
}

// Packs a 16-bit field plus two 8-bit fields into one 32-bit word
// (hi in the high half, lohi/lolo in the low half).
__host__ __device__ inline
physx::PxU32 merge(physx::PxU16 hi, physx::PxU8 lohi, physx::PxU8 lolo)
{
	return merge(hi, merge(lohi, lolo));
}
// A tetrahedron is considered valid when its signed volume (times 6, i.e. the
// scalar triple product of the edges from v0) is positive — the vertices are
// positively oriented and the tet is not (near-)degenerate.
__host__ __device__ inline bool isValidTet(const physx::PxVec3& worldV0, const physx::PxVec3& worldV1, const physx::PxVec3& worldV2, const physx::PxVec3& worldV3)
{
	const physx::PxVec3 edge1 = worldV1 - worldV0;
	const physx::PxVec3 edge2 = worldV2 - worldV0;
	const physx::PxVec3 edge3 = worldV3 - worldV0;
	return edge1.dot(edge2.cross(edge3)) > 1e-9f;
}
//device only functions
#if PX_CUDA_COMPILER
#include "reduction.cuh"
// Device-side helper wrapping a PxMeshScale-style (scale, rotation) pair.
// Scaling is applied in the rotated frame: rotate into the scale frame, scale
// component-wise, rotate back. Also handles triangle winding flips for
// negative-determinant (mirroring) scales.
class MeshScaling
{
public:
	__device__ MeshScaling(const physx::PxVec3& scale_, const physx::PxQuat& rotation_)
	{
		scale = scale_;
		rotation = rotation_;
		idtMeshScale = (scale.x == 1.0f && scale.y == 1.0f && scale.z == 1.0f);
		// A negative scale determinant mirrors the mesh, so triangle winding
		// (and hence normals) must be flipped.
		flipNormal = ((scale.x * scale.y * scale.z) < 0.0f);
	}

	// Applies the scale to a point: vertex space -> shape space.
	__device__ inline physx::PxVec3 vertex2Shape(const physx::PxVec3& v) const
	{
		using namespace physx;
		PxVec3 temp = rotation.rotate(v);
		temp.x *= scale.x;
		temp.y *= scale.y;
		temp.z *= scale.z;
		return rotation.rotateInv(temp);
	}

	// Removes the scale from a point: shape space -> vertex space.
	__device__ inline physx::PxVec3 shape2Vertex(const physx::PxVec3& v) const
	{
		using namespace physx;
		PxVec3 temp = rotation.rotate(v);
		temp.x /= scale.x;
		temp.y /= scale.y;
		temp.z /= scale.z;
		return rotation.rotateInv(temp);
	}

	//Transforms a normal vector from vertex to shape space. This keeps the normal vector perpendicular to surfaces that get scaled with the same transform.
	//Does not preserve length. Is applicable to other covariant vectors too.
	// (Covariant vectors transform with the inverse scale, hence shape2Vertex.)
	__device__ inline physx::PxVec3 vertex2ShapeNormalVector(const physx::PxVec3& normal) const
	{
		return shape2Vertex(normal);
	}

	//Transforms a normal vector from shape to vertex space. This keeps the normal vector perpendicular to surfaces that get scaled with the same transform.
	//Does not preserve length. Is applicable to other covariant vectors too.
	__device__ inline physx::PxVec3 shape2VertexNormalVector(const physx::PxVec3& normal) const
	{
		return vertex2Shape(normal);
	}

	// Scales a triangle into shape space in-place (triV0..2) and writes the
	// winding-corrected copy to v0..2: when flipNormal is set, v1/v2 are
	// swapped so the shape-space triangle keeps outward-facing normals.
	__device__ inline void getShapeSpaceVert(physx::PxVec3& triV0, physx::PxVec3& triV1, physx::PxVec3& triV2,
		physx::PxVec3& v0, physx::PxVec3& v1, physx::PxVec3& v2) const
	{
		if (idtMeshScale)
		{
			v0 = triV0;
			v1 = triV1;
			v2 = triV2;
		}
		else
		{
			// Triangle scaling, triangle verts in shape space
			triV0 = vertex2Shape(triV0);
			triV1 = vertex2Shape(triV1);
			triV2 = vertex2Shape(triV2);
			v0 = triV0;
			v1 = flipNormal ? triV2 : triV1;
			v2 = flipNormal ? triV1 : triV2;
		}
	}

	physx::PxVec3 scale;		// per-axis scale factors (in the rotated frame)
	physx::PxQuat rotation;		// frame in which the scale is applied
	bool idtMeshScale;			// true when scale is exactly (1,1,1)
	bool flipNormal;			// true for mirroring (negative-determinant) scales
};
//Applies a potentially non-uniform scaling to the point v. The scaling can be expressed in a rotated coordinate frame defined by the quaternion called rotation.
// (Free-function counterpart of MeshScaling::vertex2Shape.)
__device__ inline
physx::PxVec3 vertex2Shape(const physx::PxVec3& v, const physx::PxVec3& scale, const physx::PxQuat& rotation)
{
	using namespace physx;
	PxVec3 temp = rotation.rotate(v);
	temp.x *= scale.x;
	temp.y *= scale.y;
	temp.z *= scale.z;
	return rotation.rotateInv(temp);
}
//Removes a potentially non-uniform scaling from the point v. The scaling can be expressed in a rotated coordinate frame defined by the quaternion called rotation.
// (Free-function counterpart of MeshScaling::shape2Vertex; inverse of vertex2Shape.)
__device__ inline
physx::PxVec3 shape2Vertex(const physx::PxVec3& v, const physx::PxVec3& scale, const physx::PxQuat& rotation)
{
	using namespace physx;
	PxVec3 temp = rotation.rotate(v);
	temp.x /= scale.x;
	temp.y /= scale.y;
	temp.z /= scale.z;
	return rotation.rotateInv(temp);
}
//Transforms a normal vector from vertex to shape space. This keeps the normal vector perpendicular to surfaces that get scaled with the same transform.
//Does not preserve length. Is applicable to other covariant vectors too.
// (Covariant vectors transform with the inverse scale, hence shape2Vertex.)
__device__ inline
physx::PxVec3 vertex2ShapeNormalVector(const physx::PxVec3& normal, const physx::PxVec3& scale, const physx::PxQuat& rotation)
{
	return shape2Vertex(normal, scale, rotation);
}
//Transforms a normal vector from shape to vertex space. This keeps the normal vector perpendicular to surfaces that get scaled with the same transform.
//Does not preserve length. Is applicable to other covariant vectors too.
// (Inverse of vertex2ShapeNormalVector: applies the forward scale.)
__device__ inline
physx::PxVec3 shape2VertexNormalVector(const physx::PxVec3& normal, const physx::PxVec3& scale, const physx::PxQuat& rotation)
{
	return vertex2Shape(normal, scale, rotation);
}
// Warp-cooperative vertex staging: each lane handles every WARP_SIZE-th
// vertex, applying the mesh scale (vertex -> shape space) and then the given
// transform (shape -> e.g. world space), writing the result to s_vertices
// (typically shared memory).
__device__ inline static
void prepareVertices(const PxTransform& transf,
	PxVec3 scale, PxQuat rot,
	PxU32 nbHullVertices,
	const float4* pVertices,
	PxVec3* s_vertices
)
{
	assert(transf.isFinite());

	for (PxU32 i = threadIdx.x; i < nbHullVertices; i += WARP_SIZE)
	{
		float4 f = pVertices[i];
		s_vertices[i] = transf.transform(vertex2Shape(PxVec3(f.x, f.y, f.z), scale, rot));
		assert(s_vertices[i].isFinite());
	}
}
// Overload without a rigid transform: stages vertices in shape space only
// (mesh scale applied, no pose). Same warp-cooperative access pattern.
__device__ inline static
void prepareVertices(PxVec3 scale, PxQuat rot,
	PxU32 nbHullVertices,
	const float4* pVertices,
	PxVec3* s_vertices
)
{
	for (PxU32 i = threadIdx.x; i < nbHullVertices; i += WARP_SIZE)
	{
		float4 f = pVertices[i];
		s_vertices[i] = vertex2Shape(PxVec3(f.x, f.y, f.z), scale, rot);
	}
}
#define MESH_MANIFOLD_EPSILON 0.05f
#define BOX_MARGIN_RATIO 0.25f
// PCM margin for a convex hull given its internal AABB extents (float4, as
// stored in cooked data) and mesh scale: 25% of the smallest scaled extent,
// capped by a tolerance-length-derived margin.
__device__ inline static
PxReal calculatePCMConvexMargin(const float4& extents_, const PxVec3& scale, const PxReal toleranceLength)
{
	using namespace physx;
	const PxVec3 extents = PxVec3(extents_.x * scale.x,
		extents_.y * scale.y,
		extents_.z * scale.z);
	const PxReal min = fminf(fminf(extents.x, extents.y), extents.z);
	const PxReal toleranceMargin = toleranceLength * MESH_MANIFOLD_EPSILON;
	//ML: 25% of the minimum extents of the internal AABB as this convex hull's margin
	return fminf((min * BOX_MARGIN_RATIO), toleranceMargin);
}
// Overload for pre-scaled PxVec3 extents; same margin rule as above.
__device__ inline static
PxReal calculatePCMConvexMargin(const PxVec3& extents, const PxReal toleranceLength)
{
	using namespace physx;
	const PxReal min = fminf(fminf(extents.x, extents.y), extents.z);
	const PxReal toleranceMargin = toleranceLength * MESH_MANIFOLD_EPSILON;
	//ML: 25% of the minimum extents of the internal AABB as this convex hull's margin
	return fminf((min * BOX_MARGIN_RATIO), toleranceMargin);
}
// Chebyshev (max-component) distance between two positions — the largest
// absolute per-axis translation delta between the current and previous pose.
__device__ inline static physx::PxReal maxTransformPositionDelta(const physx::PxVec3& curP, const physx::PxVec3& preP)
{
	using namespace physx;
	const PxReal dx = PxAbs(curP.x - preP.x);
	const PxReal dy = PxAbs(curP.y - preP.y);
	const PxReal dz = PxAbs(curP.z - preP.z);
	return PxMax(dx, PxMax(dy, dz));
}
// Manifold-invalidation threshold tables, indexed by the current number of
// manifold contacts (see invalidate_BoxConvex). The "Thresholds" tables scale
// the objects' margins for the translation test; the "QuatThresholds" tables
// are minimum quaternion dot products between frames for the rotation test.
// Stored in constant memory for fast uniform access from device code.
__constant__ __device__ PxF32 local_invalidateThresholdsConvex[5] = { 0.5f, 0.125f, 0.25f, 0.375f, 0.375f };
__constant__ __device__ PxF32 local_invalidateQuatThresholdsConvex[5] = { 0.9998f, 0.9999f, 0.9999f, 0.9999f, 0.9999f };
__constant__ __device__ PxF32 local_invalidateThresholdsSphere[3] = { 0.5f, 0.1f, 0.75f };
__constant__ __device__ PxF32 local_invalidateQuatThresholdsSphere[3] = { 0.9995f, 0.9999f, 0.9997f };
__constant__ __device__ PxF32 local_invalidateQuatThresholdsConvexPlane[5] = { 0.99996f, 0.99996f, 0.99996f, 0.99996f, 0.99996f };
// Decides whether a persistent manifold must be regenerated, by comparing the
// current relative pose against the cached one. Returns non-zero when new
// contacts should be generated. Thresholds come from the tables above,
// indexed by the current manifold contact count; radiusA/radiusB convert
// rotational deltas into surface travel distances for the fallback test.
__device__ inline static
PxU32 invalidate_BoxConvex(const PxVec3& curRelPos, const PxQuat& curQuatA, const PxQuat& curQuatB,
	const PxVec3& preRelPos, const PxQuat& preRelQuatA, const PxQuat& preRelQuatB,
	const physx::PxReal minMargin, const physx::PxReal radiusA, const physx::PxReal radiusB,
	const physx::PxU8 manifold_NumContacts, PxF32* local_invalidateThresholds, PxF32* local_invalidateQuatThresholds)
{
	using namespace physx;
	//This is the translational threshold used in invalidate_BoxConvexHull. 0.5 is 50% of the object margin. we use different threshold between
	//0 and 4 points. This threshold is a scale that is multiplied by the objects' margins.
	const PxReal ratio = local_invalidateThresholds[manifold_NumContacts];
	const PxReal thresholdP = minMargin * ratio;
	// Chebyshev distance between the cached and current relative positions.
	const PxReal deltaP = maxTransformPositionDelta(curRelPos, preRelPos);

	//This is the rotational threshold used in invalidate_BoxConvexHull. 0.9998 is a threshold for quat difference
	//between previous and current frame
	const PxReal thresholdQ = local_invalidateQuatThresholds[manifold_NumContacts];
	// Quaternion dot products: close to 1 means little rotation since caching.
	const PxReal deltaQA = curQuatA.dot(preRelQuatA);
	const PxReal deltaQB = curQuatB.dot(preRelQuatB);

	bool generateContacts = (deltaP > thresholdP) || (thresholdQ > deltaQA) || (thresholdQ > deltaQB);
	if (!generateContacts)
	{
		// Secondary test: convert the rotation angles into arc lengths on each
		// body's bounding radius and compare against the translation threshold,
		// so large bodies invalidate on smaller angular changes.
		const PxReal aRadian = deltaQA < 1.0f ? acosf(deltaQA) : 0.f;
		const PxReal bRadian = deltaQB < 1.0f ? acosf(deltaQB) : 0.f;

		const PxReal travelDistA = aRadian * radiusA;
		const PxReal travelDistB = bRadian * radiusB;

		generateContacts = (travelDistA > thresholdP) || (travelDistB > thresholdP);
	}

	return generateContacts;
}
#endif
#endif

View File

@@ -0,0 +1,230 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef GU_SCHLOCKSHARED_H
#define GU_SCHLOCKSHARED_H
#include "foundation/PxVec3.h"
#include "foundation/PxMat33.h"
#include "foundation/PxTransform.h"
#include "geometry/PxMeshScale.h"
#include <assert.h>
#include <stdio.h>
#define GJK_LOGGING 0
#define EPA_REPLACE_TRIANGLE_LOGGING 0
#define EPA_LOGGING 0
#define EPA_LOG_DEGENERATE_CASES 0
#if GJK_LOGGING
#define GJK_LOG(...) if(gjkDebug) printf(__VA_ARGS__)
#else
#define GJK_LOG(...)
#endif
namespace schlock
{
extern bool epaDebug;
using namespace physx;
template<typename T>
PX_CUDA_CALLABLE PX_FORCE_INLINE void swap(T& a, T& b)
{
T tmp = a;
a = b;
b = tmp;
}
// Packs a pair of per-shape vertex indices into one 32-bit word: a in the low
// byte, b in the second byte. The constant 0x3f800000 is the IEEE-754 bit
// pattern of 1.0f -- presumably so the packed word remains a benign value when
// stored in float-typed storage; TODO confirm against the GJK cache layout.
PX_FORCE_INLINE PX_CUDA_CALLABLE PxU32 encodeIndices(PxU8 a, PxU8 b) { return 0x3f800000u | b<<8 | a;}
// Extracts the shape-A vertex index (low byte).
PX_FORCE_INLINE PX_CUDA_CALLABLE PxU8 aIndex(PxU32 indices) { return indices&0xff; }
// Extracts the shape-B vertex index (second byte).
PX_FORCE_INLINE PX_CUDA_CALLABLE PxU8 bIndex(PxU32 indices) { return (indices>>8)&0xff; }
// A cached GJK vertex: its position plus the packed indices of the source
// vertices on shapes A and B that produced it (see encodeIndices above).
struct CachedVertex
{
	PxVec3 v;		// vertex position
	PxU32 indices;	// packed A/B source indices
	PX_FORCE_INLINE PX_CUDA_CALLABLE CachedVertex() {}
	PX_FORCE_INLINE PX_CUDA_CALLABLE CachedVertex(PxU8 iA, PxU8 iB, const PxVec3& iV): v(iV), indices(encodeIndices(iA,iB)) {}
	PX_FORCE_INLINE PX_CUDA_CALLABLE CachedVertex(const CachedVertex& other): v(other.v), indices(other.indices) {}
	PX_FORCE_INLINE PX_CUDA_CALLABLE void operator=(const CachedVertex& other) { v = other.v, indices = other.indices; }
	// Source vertex index on shape A / shape B.
	PX_FORCE_INLINE PX_CUDA_CALLABLE PxU8 a() const { return aIndex(indices); }
	PX_FORCE_INLINE PX_CUDA_CALLABLE PxU8 b() const { return bIndex(indices); }
	// volatile overloads so the accessors can be called on volatile-qualified
	// instances (presumably shared-memory copies -- TODO confirm at the callers).
	PX_FORCE_INLINE PX_CUDA_CALLABLE PxU8 a() const volatile { return aIndex(indices); }
	PX_FORCE_INLINE PX_CUDA_CALLABLE PxU8 b() const volatile { return bIndex(indices); }
	// Member-wise copy out of a volatile instance (PxVec3 itself provides no
	// volatile copy constructor).
	PX_FORCE_INLINE PX_CUDA_CALLABLE PxVec3 getV() const volatile { return PxVec3(v.x, v.y, v.z); }
};
// Simplex data produced by GJK (up to 4 vertices, each with an associated
// plane) that can be handed to EPA as a warm start -- see the GJK/EPA routine
// notes at the end of this header.
struct GjkCachedData
{
	CachedVertex vertices[4];	// simplex vertices
	struct
	{
		PxVec3 n;	// plane normal
		PxReal d;	// plane distance -- NOTE(review): sign convention not visible here; confirm at the writer
	} normals[4];
	PxU32 size;		// number of valid entries in vertices/normals
	PX_FORCE_INLINE PX_CUDA_CALLABLE GjkCachedData(){}
	// Empties the cache. The __syncwarp makes the cleared size visible to the
	// whole warp before any thread continues.
	__device__ void Reset() { size = 0; __syncwarp(); }
};
// Classification returned by the GJK/EPA queries (see the routine documentation
// at the end of this header).
struct GjkResult
{
	enum Enum
	{
		eDISTANT,	// lower bound on distance exceeds minSep
		eCLOSE,		// neither provably distant nor overlapping
		eOVERLAP	// provably overlapping
	};
};
// Output of a GJK query. Which fields are meaningful depends on the returned
// GjkResult -- see the routine documentation at the end of this header.
struct GjkOutput
{
	PxVec3 direction; // unit witness for lower bound
	PxReal lowerBound; // provable lower bound
	PxReal upperBound; // provable upper bound
	PxVec3 closestPointA; // witnesses for upper bound
	PxVec3 closestPointB;
	bool degenerate; // raised on geometric instability near convergence
	PxVec3 closestPointDir;
};
// Barycentric coordinates of p with respect to triangle (a, b, c): on exit
// p ~= u*a + v*b + w*c (up to numerical error). The sub-triangle area ratios
// are evaluated in the coordinate plane where the triangle normal has its
// largest component, for numerical robustness.
PX_CUDA_CALLABLE PX_FORCE_INLINE void barycentrics(const PxVec3& a, const PxVec3& b, const PxVec3& c, float &u, float &v, float &w, const PxVec3& p)
{
	const PxVec3 n = (b - a).cross(c - a);		// twice the signed triangle area (as a vector)
	const PxVec3 areaU = (b - p).cross(c - p);	// sub-triangle opposite vertex a
	const PxVec3 areaV = (c - p).cross(a - p);	// sub-triangle opposite vertex b
	const PxVec3 nAbs = n.abs();

	// Take the area ratios along the axis with the largest normal component.
	if (nAbs.x >= nAbs.y && nAbs.x >= nAbs.z)
	{
		const float inv = 1.0f / n.x;
		u = areaU.x * inv;
		v = areaV.x * inv;
	}
	else if (nAbs.y >= nAbs.z)
	{
		const float inv = 1.0f / n.y;
		u = areaU.y * inv;
		v = areaV.y * inv;
	}
	else
	{
		const float inv = 1.0f / n.z;
		u = areaU.z * inv;
		v = areaV.z * inv;
	}
	w = 1.0f - u - v;
}
// top-level functions
// unit-level functions for testing
static const PxU32 EPA_MAX_VERTS = 18, EPA_MAX_FACES = 2*EPA_MAX_VERTS-4; // for N verts, we'll have 2(N-2) faces.
typedef int EPABitMap;
#define MAKE_Vec3I8(A, B, C) (physx::PxU32(A) | (physx::PxU32(B) << 8) | (physx::PxU32(C) << 16))
// Three 8-bit indices packed into the low 24 bits of a 32-bit word
// (index i lives in byte i). The top byte is scratch: setInt/getInt move the
// whole word, while operator== compares only the low 24 bits.
struct Vec3I8
{
	PxU32 a;	// packed storage
	// Reads the i-th packed index (i in [0, 2]).
	PX_FORCE_INLINE PX_CUDA_CALLABLE PxU8 get(PxU32 i) const { return (a >> (i << 3)) & 0xFFul; }
	// Overwrites the i-th packed index, preserving the other bytes.
	PX_FORCE_INLINE PX_CUDA_CALLABLE void set(PxU32 i, PxU8 v) { a = (a & ~(0xFFul << (i << 3))) | (PxU32(v) << (i << 3)); }
	// Raw whole-word access (includes the scratch byte).
	PX_FORCE_INLINE PX_CUDA_CALLABLE void setInt(int i) { a = i; }
	// Fixed: marked const so it can be called on const instances.
	PX_FORCE_INLINE PX_CUDA_CALLABLE int getInt() const { return a; }
	PX_FORCE_INLINE PX_CUDA_CALLABLE const Vec3I8& operator=(const Vec3I8& other) {a = other.a; return *this;}
	// Fixed: marked const -- equality comparison must not require a mutable
	// left-hand side. Only the three packed indices participate.
	bool operator==(const Vec3I8& other) const { return (a & 0xFFffFFul) == (other.a & 0xFFffFFul); }
};
// One face of the EPA polytope.
struct EPATriangle
{
	PxVec3 normal; // unit normal
	PxReal distance; // *plane* distance from origin
	Vec3I8 v; // vertices (three packed indices into the hull vertex arrays)
};
// Storage for the polytope expanded by EPA. Face capacity follows from Euler's
// formula: a closed triangulated hull with N vertices has 2(N-2) faces
// (see the EPA_MAX_FACES definition above).
struct EPAHullData
{
	PxVec3 v[EPA_MAX_VERTS];		// hull vertex positions
	PxU32 i[EPA_MAX_VERTS];			// per-vertex word; presumably packed A/B source indices (see encodeIndices) -- TODO confirm
	EPATriangle t[EPA_MAX_FACES];	// hull faces
	Vec3I8 a[EPA_MAX_FACES];		// per-face index triple; presumably face adjacency -- TODO confirm
	PxU8 nbVertices;				// number of vertices currently in use
	EPABitMap liveMap;				// per-face bits; presumably marks faces still on the hull -- TODO confirm
};
// GJK routine.
//
// Input:
// convexA, convexB: two convexes in the same space
// initialDir: an unnormalized start direction for GJK iteration
// minSep: the minimum distance at which the two objects are judged to be separated. Must be positive.
// convergenceRatio: minimum ratio of lowerBound/upperBound for convergence (a number close to, but less than, 1)
//
// returns DISTANT if a lower bound on distance is found that is > minSep
// OVERLAP if the objects are provably overlapping
// else CLOSE.
//
// for OVERLAP, the cachedData (vertex indices & search directions) can be passed to EPA
// for DISTANT, a direction is generated which provides a witness for the lower bound
// for CLOSE, a pair of closest points is generated, which provide (in MS space) a witness for the upper bound, and
// if nondegenerate, also a direction which witnesses the lower bound
//
// the degenerate flag is raised in the output if the algorithm encountered geometric instability while the minSep value is in [lower, upper].
// In particular, we are neither separated (which would require lower > minSep) nor overlapping (upper == 0), so this can only accompany a result of CLOSE.
// geometric instability can only happen very close to convergence, so unless very high tolerances are necessary, convergence can be assumed.
// in this case if the lower bound is zero, then the lower bound witness direction is not generated.
// So when CLOSE is returned, the upper bound should be used as the authoritative distance, and the direction should be taken from the closest points
// EPA routine.
//
// Input:
// convexA, convexB: two convexes in the same space
// cache: a cache of initial vertices produced by GJK. Must have size at least 2
// convergenceRatio: minimum ratio of lowerBound/upperBound for convergence (a number close to, but less than, 1)
//
// on output, the following are emitted:
// * a pair of closest points which are the best depenetration found in the allowed space
// * a lower bound and upper bound on penetration depth.
// * a direction corresponding to the lower bound (this direction *opposes* closestB - closestA), so if we translate object B by lowerBound * direction, we
// should approximately separate the shapes
//
// the degenerate flag is raised in the output if the algorithm encountered geometric instability before convergence, or ran out of space.
// in this case, the bounds, direction and closest points are still generated.
//
// The algorithm returns GjkResult::eOVERLAP in almost all cases. However, it's possible for EPA to find separation even if GJK found penetration,
// and in this anomalous case it returns GjkResult::eCLOSE
//
// // There is currently no direction for the upper bound, which would allow for reliable (if not minimal) separation in degenerate cases
}
#endif

View File

@@ -0,0 +1,87 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __TRI32DATA_H__
#define __TRI32DATA_H__
// A triangle referencing its three vertices by 32-bit index.
struct Triangle32Data
{
	PxU32 v[3];	// vertex indices

	// Default construction leaves the indices uninitialized on purpose.
	Triangle32Data() {}

	Triangle32Data(PxU32 v0, PxU32 v1, PxU32 v2) : v{ v0, v1, v2 }
	{
	}
};
// A Triangle32Data padded to 16 bytes; the pad word doubles as an optional
// user payload.
struct Triangle32DataPad
{
	PxU32 v[3];	// vertex indices
	PxU32 pad;	// padding / optional payload
	// Default construction leaves all members uninitialized on purpose.
	PX_CUDA_CALLABLE Triangle32DataPad() {}
	PX_CUDA_CALLABLE Triangle32DataPad(PxU32 v0, PxU32 v1, PxU32 v2, PxU32 v3)
	{
		v[0] = v0;
		v[1] = v1;
		v[2] = v2;
		pad = v3;
	}
	PX_CUDA_CALLABLE Triangle32DataPad(PxU32 v0, PxU32 v1, PxU32 v2)
	{
		v[0] = v0;
		v[1] = v1;
		v[2] = v2;
		pad = 0;
	}
	PX_CUDA_CALLABLE Triangle32DataPad(Triangle32Data t)
	{
		v[0] = t.v[0];
		v[1] = t.v[1];
		v[2] = t.v[2];
		pad = 0;
	}
	// Fixed: removed the "ipad = 0" default value. With it, a one-argument call
	// Triangle32DataPad(t) was ambiguous between this constructor and the
	// single-argument one above (which already yields pad = 0, so dropping the
	// default changes no behavior).
	PX_CUDA_CALLABLE Triangle32DataPad(Triangle32Data t, PxU32 ipad)
	{
		v[0] = t.v[0];
		v[1] = t.v[1];
		v[2] = t.v[2];
		pad = ipad;
	}
};
#endif

View File

@@ -0,0 +1,62 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __TYPEHELPERS_H__
#define __TYPEHELPERS_H__
// Converters between CUDA's float4 and the PhysX math types.
// float4 -> PxVec3 drops w; PxVec3 -> float4 writes w = 0.
PX_CUDA_CALLABLE static inline PxVec3 float4ToVec3(const float4 & data)
{
	return PxVec3(data.x, data.y, data.z);
}
PX_CUDA_CALLABLE static inline float4 vec3ToFloat4(const PxVec3 & data)
{
	return make_float4(data.x, data.y, data.z, 0.0f);
}
// Quaternions map (x, y, z, w) <-> (x, y, z, w) directly.
PX_CUDA_CALLABLE static inline PxQuat float4ToQuat(const float4 & data)
{
	return PxQuat(data.x, data.y, data.z, data.w);
}
PX_CUDA_CALLABLE static inline float4 quatToFloat4(const PxQuat & data)
{
	return make_float4(data.x, data.y, data.z, data.w);
}
// A PxTransform is split across two float4s: position (w = 0) and rotation quat.
PX_CUDA_CALLABLE static inline void transformToFloat4s(float4 & pos, float4 & rot, const PxTransform & transform)
{
	pos = make_float4(transform.p.x, transform.p.y, transform.p.z, 0.0f);
	rot = make_float4(transform.q.x, transform.q.y, transform.q.z, transform.q.w);
}
PX_CUDA_CALLABLE static inline void float4sToTransform(PxTransform & transform, const float4 & pos, const float4 & rot)
{
	transform.p = PxVec3(pos.x, pos.y, pos.z);
	transform.q = PxQuat(rot.x, rot.y, rot.z, rot.w);
}
#endif

View File

@@ -0,0 +1,208 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __BV32_TRAVERSAL_CUH__
#define __BV32_TRAVERSAL_CUH__
#include "copy.cuh"
#include "reduction.cuh"
// Not used directly -- a reference "callback" shape for bv32TreeTraversal below.
// Copy this struct when creating a new type of tree traversal and fill in the
// two intersect functions.
struct BV32TraversalTemplate
{
	PX_FORCE_INLINE __device__ BV32TraversalTemplate()
	{
	}
	//Called on leaf nodes to process them
	//A full warp will call the function. If primitiveIndex is 0xFFFFFFFF, then the thread has no leaf node to check.
	PX_FORCE_INLINE __device__ void intersectPrimitiveFullWarp(PxU32 primitiveIndex, PxU32 idxInWarp) const
	{
	}
	//Called to determine if the traversal should continue or stop on a branch node
	//A full warp will call the function. If hasBox is false, then the thread has no branch node to check.
	PX_FORCE_INLINE __device__ bool intersectBoxFullWarp(bool hasBox, const PxVec3& min, const PxVec3& max) const
	{
		if (hasBox)
			return true; //Actually test against the axis aligned box defined by min and max
		return false;
	}
};
// TW: On my laptop RTX 3080 the FPS of some FEM cloth VTs actually drops by about 10% when enabling optimized traversal...
#define OPTIMIZE_TRAVERSAL 0
// Warp-cooperative traversal of a BV32 tree (32-wide bounding-volume hierarchy).
//
// All 32 threads of a warp walk the tree together: for a branch node each
// thread tests one child box via callback.intersectBoxFullWarp, for a leaf
// each thread processes one primitive via callback.intersectPrimitiveFullWarp
// (see BV32TraversalTemplate above for the expected callback shape).
// sBv32Nodes is a warp-shared stack of encoded node indices for pending nodes;
// it must be large enough for the tree being traversed -- TODO confirm the
// sizing convention at the call sites.
template<typename T, unsigned int WarpsPerBlock>
__device__ static void bv32TreeTraversal(
	const physx::Gu::BV32DataPacked* bv32PackedNodes,
	int* sBv32Nodes,
	T& callback
)
{
#if OPTIMIZE_TRAVERSAL
	__shared__ Gu::BV32DataPacked shNode[WarpsPerBlock];
	const unsigned int warpId = threadIdx.y;
#endif
	//thread index in warp
	const unsigned int idxInWarp = threadIdx.x;
	PxU32 currentNodeIndex = 0; // start at the root
	const Gu::BV32DataPacked* PX_RESTRICT currentNode = &bv32PackedNodes[0];
	PxU32 nbStack = 0; // number of nodes on the stack of to-be-expanded nodes
	while (currentNode)
	{
#if OPTIMIZE_TRAVERSAL
		//This optimization was found in cudaParticleSystem.cu, line 2373
		// VR: This variant works ~6 times *faster* on my GTX 1080
		Gu::BV32DataPacked& node = shNode[warpId];
		warpCopy((uint*)&node, (uint*)currentNode, sizeof(Gu::BV32DataPacked));
		__syncwarp();
#else
		// VR: This variant works ~6 times *slower* on my GTX 1080
		const Gu::BV32DataPacked& node = *currentNode;
#endif
		__syncwarp(); //sBv32Nodes (shared memory) is read and written in the same loop - read and write must be separated with a sync
		//check to see whether this is a leaf or not (bit 0 of the encoded index)
		if (currentNodeIndex & 1)
		{
			const PxU32 leafOffset = (currentNodeIndex >> 1) & 31;
			//at max 32 primitives at a leaf node
			const PxU32 nbPrimitives = node.getNbReferencedPrimitives(leafOffset);
			const PxU32 primStartIndex = node.getPrimitiveStartIndex(leafOffset);
			// now every thread checks a primitive (triangle, tetrahedron etc) against the query geometry
			// result variables:
			PxU32 primitiveIdx = 0xffffffff;	// 0xffffffff = this thread has no primitive
			if (idxInWarp < nbPrimitives)
			{
				primitiveIdx = idxInWarp + primStartIndex;
			}
			callback.intersectPrimitiveFullWarp(primitiveIdx, idxInWarp);
		}
		else // not a leaf node
		{
			const PxU32 nbChildren = node.mNbNodes;
			// there are max 32 children nodes. Every thread checks whether its child's bounding box
			// would intersect with the query object (which can be a box, a ray or anything)
			PxVec3 min, max;
			bool hasBox = idxInWarp < nbChildren;
			if (hasBox)
			{
				min = PxLoad3(reinterpret_cast<const float4&>(node.mMin[idxInWarp]));
				max = PxLoad3(reinterpret_cast<const float4&>(node.mMax[idxInWarp]));
			}
			bool intersect = callback.intersectBoxFullWarp(hasBox, min, max);
			//if (!hasBox)
			//	intersect = false;
			/*if (workIndex == 0 )
			{
				printf("newActiveParticles %x idx %i Finished\n", newActiveParticles, idxInWarp);
			}*/
			// ML: the first one bit is to decide whether the node is a leaf or not. The subsequent 5 bits are used to
			//store the row index of mData so that if the node is a leaf we can get the reference triangle. The subsequent 26 bits
			//are used to store either the next node index if the node isn't a leaf or store the parent node index if the node is a leaf node.
			//Masking currentNodeIndex with 0xFFFFFFC0 clears the 6 lowest bits, so we can mask in our idxInWarp<<1 | 1 for leaf masks
			PxU32 currentIndex = ((currentNodeIndex) & 0xFFFFFFC0) | (idxInWarp << 1) | 1;
			//create node bitmap
			bool isValidNode = false;
			if (idxInWarp < nbChildren)
			{
				const PxU32 childOffset = node.getChildOffset(idxInWarp);
				isValidNode = intersect;
				if (isValidNode && !node.isLeaf(idxInWarp))
				{
					// branch child: store its node index in the high bits (leaf bit stays 0)
					currentIndex = (childOffset) << 6;
				}
			}
			// warp-wide compaction: each intersecting child gets a unique stack slot
			PxU32 resultWarp = __ballot_sync(FULL_MASK, isValidNode);
			/*if (idxInWarp == 0)
			{
				printf("ResultWarp = %x\n", resultWarp);
			}*/
			//PxU32 resultWarp = __ballot_sync(FULL_MASK, intersect);
			PxU32 offset = warpScanExclusive(resultWarp, idxInWarp);
			PxU32 validCount = __popc(resultWarp);
			if (isValidNode)
			{
				//printf("Push Bv32Nodes[%i] = %i mask = %x\n", nbStack + offset, currentIndex, newActiveParticles);
				sBv32Nodes[nbStack + offset] = currentIndex;
			}
			nbStack += validCount;
		}
		__syncwarp();
		// select the next node from the stack, same for all threads of the warp
		if (nbStack > 0)
		{
			currentNodeIndex = sBv32Nodes[--nbStack];
			currentNode = &bv32PackedNodes[currentNodeIndex >> 6];
			/*if (idxInWarp == 0)
			{
				printf("Pop CurrentIndex = %i, currentNode = %i, currentActiveParticles = %x\n", currentNodeIndex, currentNodeIndex >> 6, currentNodeActiveParticles);
			}*/
		}
		else
			currentNode = NULL;
	}
}
// Warp-cooperative push onto a global stack: every thread with an element
// obtains a unique slot via a warp-wide exclusive scan over the shared atomic
// counter, then writes its item. Items that would land at or beyond stackSize
// are silently dropped (the counter still advances).
PX_FORCE_INLINE __device__ void pushOntoStackFullWarp(bool threadPushesElement, PxU32* atomicCounter, uint4* PX_RESTRICT stackPtr, PxU32 stackSize, const uint4& stackItemToPush)
{
	const PxU32 slot = globalScanExclusiveSingleWarp(threadPushesElement, atomicCounter);
	const bool fits = slot < stackSize;
	if (threadPushesElement && fits)
		stackPtr[slot] = stackItemToPush;
}
#endif

View File

@@ -0,0 +1,476 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __CAPSULE_COLLISION_CUH__
#define __CAPSULE_COLLISION_CUH__
#include "distanceSegmentSegment.cuh"
#define ETD_CONVEX_EDGE_01 (1 << 3) // PT: important value, don't change
#define ETD_CONVEX_EDGE_12 (1 << 4) // PT: important value, don't change
#define ETD_CONVEX_EDGE_20 (1 << 5) // PT: important value, don't change
// Given barycentric coordinates (u along edge 0-1, v along edge 0-2) of a
// contact on a triangle, decides whether the triangle's face normal may be
// used for that contact. A face-interior hit always qualifies; a hit on a
// vertex or an edge qualifies only when none of the adjacent edges are marked
// convex in triFlags (ETD_CONVEX_EDGE_*), since convex features need a
// feature-specific normal.
__device__ __forceinline__ bool selectNormal(const PxReal u,
	const PxReal v, PxU8 triFlags)
{
	const PxReal lo = 1e-6f;
	const PxReal hi = 0.999999f;

	if (u < lo)
	{
		if (v < lo)
			return (triFlags & (ETD_CONVEX_EDGE_01 | ETD_CONVEX_EDGE_20)) == 0;	// vertex 0
		if (v > hi)
			return (triFlags & (ETD_CONVEX_EDGE_12 | ETD_CONVEX_EDGE_20)) == 0;	// vertex 2
		return (triFlags & ETD_CONVEX_EDGE_20) == 0;							// edge 0-2
	}

	if (u > hi)
	{
		if (v < lo)
			return (triFlags & (ETD_CONVEX_EDGE_01 | ETD_CONVEX_EDGE_12)) == 0;	// vertex 1
		return false;
	}

	if (v < lo)
		return (triFlags & ETD_CONVEX_EDGE_01) == 0;							// edge 0-1

	if (u + v >= 0.9999f)
		return (triFlags & ETD_CONVEX_EDGE_12) == 0;							// edge 1-2

	return true;	// face-interior contact: the face normal is always valid
}
// True when (v, w) are valid barycentric coordinates of a point inside (or
// within a small epsilon of) a triangle: both components in [-eps, 1 + eps]
// and their sum no greater than 1 + eps.
__device__ __forceinline__ bool isValidTriangleBarycentricCoord(
	const PxReal v, const PxReal w)
{
	const PxReal tolerance = 1e-6f;
	const PxReal lower = -tolerance;
	const PxReal upper = 1.f + tolerance;
	const bool vInRange = v >= lower && v <= upper;
	const bool wInRange = w >= lower && w <= upper;
	return vInRange && wInRange && (v + w) <= upper;
}
/*
	Squared distance between segment (p, q) and triangle (a, b, c).
	On exit:
	t is the barycentric coordinate of the closest point along the segment (0 at p, 1 at q)
	u is the barycentric coordinate of the closest triangle point along edge ab
	v is the barycentric coordinate of the closest triangle point along edge ac
	Returns 0 when the segment intersects the triangle.
*/
__device__ __forceinline__ PxReal distanceSegmentTriangleSquared(
	const PxVec3& p, const PxVec3& q,
	const PxVec3& a, const PxVec3& b,
	const PxVec3& c, PxReal& t, PxReal& u,
	PxReal& v)
{
	const PxVec3 pq = q - p;
	const PxVec3 ab = b - a;
	const PxVec3 ac = c - a;
	const PxVec3 bc = c - b;
	const PxVec3 ap = p - a;
	const PxVec3 aq = q - a;
	//This is used to calculate the barycentric coordinate
	const PxReal d00 = ab.dot(ab);
	const PxReal d01 = ab.dot(ac);
	const PxReal d11 = ac.dot(ac);
	const PxReal tDenom = d00 * d11 - d01 * d01;
	const PxReal bdenom = tDenom > 0.f ? 1.f / tDenom : PX_MAX_F32;// FSel(FIsGrtr(tDenom, zero), FRecip(tDenom), zero);
	const PxVec3 n = (ab.cross(ac)).getNormalized();// V3Normalize(V3Cross(ab, ac)); // normalize vector
	//compute the closest point of p and triangle plane abc
	const PxReal dist3 = ap.dot(n);// V3Dot(ap, n);
	const PxReal sqDist3 = dist3 * dist3;
	//compute the closest point of q and triangle plane abc
	const PxReal dist4 = aq.dot(n);// V3Dot(aq, n);
	const PxReal sqDist4 = dist4 * dist4;// FMul(dist4, dist4);
	const PxReal dMul = dist3 * dist4;//FMul(dist3, dist4);
	//const BoolV con = FIsGrtr(zero, dMul);
	// p and q on opposite sides of (or on) the plane: the segment may intersect the triangle
	if (dMul <= 0.f)
	{
		//compute the intersect point
		const PxReal nom = -n.dot(ap); // FNeg(V3Dot(n, ap));
		const PxReal denom = 1.f / n.dot(pq);// FRecip(V3Dot(n, pq));
		const PxReal t0 = nom * denom; // FMul(nom, denom);
		const PxVec3 ip = p + pq * t0;// V3ScaleAdd(pq, t, p);//V3Add(p, V3Scale(pq, t));
		const PxVec3 v2 = ip - a;// V3Sub(ip, a);
		const PxReal d20 = v2.dot(ab);// V3Dot(v2, ab);
		const PxReal d21 = v2.dot(ac);// V3Dot(v2, ac);
		const PxReal v00 = (d11 * d20 - d01 * d21) * bdenom; //FMul(FSub(FMul(d11, d20), FMul(d01, d21)), bdenom);
		const PxReal w00 = (d00 * d21 - d01 * d20) * bdenom; //FMul(FSub(FMul(d00, d21), FMul(d01, d20)), bdenom);
		// intersection point inside the triangle: distance is exactly zero
		if (isValidTriangleBarycentricCoord(v00, w00))
		{
			t = t0;
			u = v00;
			v = w00;
			return 0.f;
		}
	}
	const PxVec3 closestP31 = p - n * dist3;// V3NegScaleSub(n, dist3, p);//V3Sub(p, V3Scale(n, dist3));
	//const PxVec3 closestP30 = p;
	//Compute the barycentric coordinate for project point of p
	const PxVec3 pV20 = closestP31 - a;
	const PxReal pD20 = pV20.dot(ab);
	const PxReal pD21 = pV20.dot(ac);
	const PxReal v0 = (d11 * pD20 - d01 * pD21) * bdenom;// FMul(FSub(FMul(d11, pD20), FMul(d01, pD21)), bdenom);
	const PxReal w0 = (d00 * pD21 - d01 * pD20) * bdenom;// FMul(FSub(FMul(d00, pD21), FMul(d01, pD20)), bdenom);
	//check closestP3 is inside the triangle
	const bool con0 = isValidTriangleBarycentricCoord(v0, w0);
	const PxVec3 closestP41 = q - n * dist4;// V3NegScaleSub(n, dist4, q);// V3Sub(q, V3Scale(n, dist4));
	//const PxVec3 closestP40 = q;
	//Compute the barycentric coordinate for project point of q
	const PxVec3 qV20 = closestP41 - a;
	const PxReal qD20 = qV20.dot(ab);
	const PxReal qD21 = qV20.dot(ac);
	const PxReal v1 = (d11 * qD20 - d01 * qD21) * bdenom;// FMul(FSub(FMul(d11, qD20), FMul(d01, qD21)), bdenom);
	const PxReal w1 = (d00 * qD21 - d01 * qD20) * bdenom;// FMul(FSub(FMul(d00, qD21), FMul(d01, qD20)), bdenom);
	const bool con1 = isValidTriangleBarycentricCoord(v1, w1);
	if (con0 && con1)
	{
		/*
			both p and q project points are interior points:
			the closer endpoint-to-plane distance is the answer
		*/
		if (sqDist4 > sqDist3)
		{
			t = 0.f;
			u = v0;
			v = w0;
			return sqDist3;
		}
		else
		{
			t = 1.f;
			u = v1;
			v = w1;
			return sqDist4;
		}
	}
	else
	{
		// otherwise the closest triangle point may lie on one of the three edges:
		// compute segment-vs-edge distances for ab, bc and ac
		PxReal t00, t01, t02;
		PxReal t10, t11, t12;
		distanceSegmentSegmentSquared(p, pq, a, ab, t00, t10);
		distanceSegmentSegmentSquared(p, pq, b, bc, t01, t11);
		distanceSegmentSegmentSquared(p, pq, a, ac, t02, t12);
		const PxVec3 closestP00 = p + pq * t00;// V3ScaleAdd(pq, t00, p);
		const PxVec3 closestP01 = a + ab * t10;// V3ScaleAdd(ab, t01, a);
		PxVec3 tempV = closestP00 - closestP01;
		const PxReal sqDist0 = tempV.dot(tempV);
		const PxVec3 closestP10 = p + pq * t01;// V3ScaleAdd(pq, t10, p);
		const PxVec3 closestP11 = b + bc * t11;// V3ScaleAdd(bc, t11, b);
		tempV = closestP10 - closestP11;
		const PxReal sqDist1 = tempV.dot(tempV);
		const PxVec3 closestP20 = p + pq * t02;// V3ScaleAdd(pq, t20, p);
		const PxVec3 closestP21 = a + ac * t12;// V3ScaleAdd(ac, t21, a);
		tempV = closestP20 - closestP21;
		const PxReal sqDist2 = tempV.dot(tempV);
		// convert each edge parameter into triangle barycentric coordinates
		//edge ab
		const PxReal u10 = t10;
		const PxReal v10 = 0.f;
		//edge bc
		const PxReal u11 = 1.f - t11;
		const PxReal v11 = t11;
		//edge ac
		const PxReal u12 = 0.f;
		const PxReal v12 = t12;
		// pick the nearest of the three edges
		PxReal sqDistPE;
		PxReal uEdge, vEdge, tSeg;
		if (sqDist1 > sqDist0 && sqDist2 > sqDist0)
		{
			//edge ab
			sqDistPE = sqDist0;
			uEdge = u10;
			vEdge = v10;
			tSeg = t00;
		}
		else if (sqDist2 > sqDist1)
		{
			//edge bc
			sqDistPE = sqDist1;
			uEdge = u11;
			vEdge = v11;
			tSeg = t01;
		}
		else
		{
			//edge ac
			sqDistPE = sqDist2;
			uEdge = u12;
			vEdge = v12;
			tSeg = t02;
		}
		if (con0)
		{
			//p's project point is an interior point: compare against the best edge
			if (sqDistPE > sqDist3)
			{
				t = 0.f;
				u = v0;
				v = w0;
				return sqDist3;
			}
			else
			{
				t = tSeg;
				u = uEdge;
				v = vEdge;
				return sqDistPE;
			}
		}
		else if (con1)
		{
			//q's project point is an interior point: compare against the best edge
			if (sqDistPE > sqDist4)
			{
				t = 1.f;
				u = v1;
				v = w1;
				return sqDist4;
			}
			else
			{
				t = tSeg;
				u = uEdge;
				v = vEdge;
				return sqDistPE;
			}
		}
		else
		{
			// neither projection is interior: the edge result is the answer
			t = tSeg;
			u = uEdge;
			v = vEdge;
			return sqDistPE;
		}
	}
}
// Casts the two segment endpoints p and q along -normal onto the plane of
// triangle (a, b, c) (plane normal planeNormal). An endpoint generates a
// contact when its cast hits inside the triangle and the cast distance is
// below inflatedRadius. Writes up to two entries to outContacts/outPen and
// returns how many were written; outPen receives the cast distance along
// normal -- NOTE(review): the sign/penetration convention depends on the
// caller's normal orientation; confirm at the call sites.
__device__ __forceinline__ PxU32 generateContacts(
	const PxVec3& a, const PxVec3& b,
	const PxVec3& c, const PxVec3& planeNormal,
	const PxVec3& normal, const PxVec3& p,
	const PxVec3& q, const PxReal inflatedRadius,
	PxVec3* outContacts, PxReal* outPen)
{
	const PxVec3 ab = b - a;
	const PxVec3 ac = c - a;
	const PxVec3 ap = p - a;
	const PxVec3 aq = q - a;
	//This is used to calculate the barycentric coordinate
	const PxReal d00 = ab.dot(ab);// V3Dot(ab, ab);
	const PxReal d01 = ab.dot(ac);// V3Dot(ab, ac);
	const PxReal d11 = ac.dot(ac);// V3Dot(ac, ac);
	const PxReal tdenom = (d00 * d11 - d01 * d01);
	const PxReal bdenom = (tdenom > 0.f) ? (1.f / tdenom) : PX_MAX_F32;// FRecip(FSub(FMul(d00, d11), FMul(d01, d01)));
	//compute the intersect point of p and triangle plane abc
	const PxReal inomp = planeNormal.dot(-ap);// V3Dot(planeNormal, V3Neg(ap));
	const PxReal ideom = planeNormal.dot(normal);// V3Dot(planeNormal, normal);
	// ray parameter along "normal" to reach the plane (PX_MAX_F32 when parallel)
	const PxReal ipt = ideom > 0.f ? -(inomp / ideom) : PX_MAX_F32;// FSel(FIsGrtr(ideom, FZero()), FDiv(inomp, ideom), FZero());
	const PxVec3 closestP31 = p - normal * ipt;// V3ScaleAdd(normal, ipt, p);
	//const PxVec3 closestP30 = p;
	//Compute the barycentric coordinate for project point of p
	const PxVec3 pV20 = closestP31 - a;
	const PxReal pD20 = pV20.dot(ab);
	const PxReal pD21 = pV20.dot(ac);
	const PxReal v0 = (d11 * pD20 - d01 * pD21) * bdenom;// FMul(FSub(FMul(d11, pD20), FMul(d01, pD21)), bdenom);
	const PxReal w0 = (d00 * pD21 - d01 * pD20) * bdenom;// FMul(FSub(FMul(d00, pD21), FMul(d01, pD20)), bdenom);
	//check closestP3 is inside the triangle
	const bool con0 = isValidTriangleBarycentricCoord(v0, w0);
	//printf("v0 %f, w0 %f\n", v0, w0);
	PxU32 numContacts = 0;
	if (con0 && (inflatedRadius > ipt))
	{
		outContacts[numContacts] = p;
		outPen[numContacts] = ipt;
		numContacts++;
		//printf("con0 %i inflatedRadius %f dist %f pen %f\n", con0, inflatedRadius, dist3, -ipt);
	}
	// repeat for the second endpoint q
	const PxReal inomq = planeNormal.dot(-aq);
	const PxReal iqt = ideom > 0.f ? -(inomq / ideom) : PX_MAX_F32;// FSel(FIsGrtr(ideom, FZero()), FDiv(inomq, ideom), FZero());
	const PxVec3 closestP41 = q - normal * iqt;// V3ScaleAdd(normal, iqt, q);
	//const PxVec3 closestP40 = q;
	//Compute the barycentric coordinate for project point of q
	const PxVec3 qV20 = closestP41 - a;// V3Sub(closestP41, a);
	const PxReal qD20 = qV20.dot(ab);// V3Dot(qV20, ab);
	const PxReal qD21 = qV20.dot(ac);// V3Dot(qV20, ac);
	const PxReal v1 = (d11 * qD20 - d01 * qD21) * bdenom;// FMul(FSub(FMul(d11, qD20), FMul(d01, qD21)), bdenom);
	const PxReal w1 = (d00 * qD21 - d01 * qD20) * bdenom;// FMul(FSub(FMul(d00, qD21), FMul(d01, qD20)), bdenom);
	const bool con1 = isValidTriangleBarycentricCoord(v1, w1);
	if (con1 && (inflatedRadius > iqt))
	{
		outContacts[numContacts] = q;
		outPen[numContacts] = iqt;
		numContacts++;
		//printf("con1 %i inflatedRadius %f dist %f pen %f\n", con1, inflatedRadius, dist4, -iqt);
	}
	return numContacts;
}
__device__ __forceinline__ void generateEE(const PxVec3& p, const PxVec3& q, const PxReal sqInflatedRadius,
	const PxVec3& normal, const PxVec3& a, const PxVec3& b,
	PxVec3* outContacts, PxReal* outPen, PxU32& numContacts)
{
	// Build a plane through edge (a,b) whose normal lies in the triangle plane;
	// segment pq can only cross the edge if its endpoints straddle this plane.
	const PxVec3 edgeAB = b - a;
	const PxVec3 edgePlaneN = edgeAB.cross(normal);
	const PxReal planeD = edgePlaneN.dot(a);
	const PxReal projP = edgePlaneN.dot(p);
	const PxReal projQ = edgePlaneN.dot(q);
	const PxReal sideP = projP - planeD;
	const PxReal sideQ = projQ - planeD;
	// Both endpoints strictly on the same side => the segment cannot intersect the plane.
	if (sideP * sideQ > 0.f)
		return;
	const PxVec3 dir = q - p;
	const PxReal projDir = edgePlaneN.dot(dir);
	// Segment parallel to the edge plane => no single crossing point.
	if (projDir == 0.f)
		return;
	// Parametric crossing point of pq with the edge plane.
	const PxReal sVal = (planeD - projP) / projDir;
	const PxVec3 crossing = p + dir * sVal;
	// Reduce to a 2D segment/segment test: clamp the crossing point against the
	// edge extent using a direction perpendicular to the ray (crossing + normal).
	const PxVec3 sepN = normal.cross(dir);
	const PxVec3 rel = crossing - a;
	const PxReal num = sepN.dot(rel);
	const PxReal den = sepN.dot(edgeAB);
	const PxReal tVal = num / den;
	// Reject crossings outside the edge's [0,1] parametric range.
	// (Kept in this exact negated form so a NaN tVal falls through, matching
	// the original/CPU behavior.)
	if (tVal > 1.f || tVal < 0.f)
		return;
	// Vector from the closest point on the edge to the crossing point.
	const PxVec3 offset = rel - edgeAB * tVal;
	const PxReal dist2 = offset.dot(offset);
	if (sqInflatedRadius > dist2)
	{
		outContacts[numContacts] = crossing;
		outPen[numContacts] = offset.dot(normal); // signed distance along the contact normal
		numContacts++;
	}
}
__device__ __forceinline__ void generateEEContacts(const PxVec3& a, const PxVec3& b,
	const PxVec3& c, const PxVec3& normal,
	const PxVec3& p, const PxVec3& q, const PxReal sqInflatedRadius,
	PxVec3* outContacts, PxReal* outPens, PxU32& numContacts)
{
	// Test segment pq against the triangle edges in the fixed order
	// (a,b), (b,c), (a,c), stopping as soon as the 2-contact budget is filled.
	const PxVec3* edgeStart[3] = { &a, &b, &a };
	const PxVec3* edgeEnd[3]   = { &b, &c, &c };
	for (PxU32 e = 0; e < 3 && numContacts < 2; ++e)
		generateEE(p, q, sqInflatedRadius, normal, *edgeStart[e], *edgeEnd[e], outContacts, outPens, numContacts);
}
#endif

View File

@@ -0,0 +1,292 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include <cuda.h>
#include <cuda_runtime.h>
#include <assert.h>
//#include "foundation/PxVec3.h"
//#include "foundation/PxTransform.h"
#include "PxgContactManager.h"
#include "PxsContactManagerState.h"
#include "PxContact.h"
#include "PxgNpKernelIndices.h"
#include "PxgCommonDefines.h"
#include "reduction.cuh"
#include "stdio.h"
#include "PxNodeIndex.h"
using namespace physx;
extern "C" __host__ void initNarrowphaseKernels17() {}
// Stage 1 of contact-stream compression: one thread per narrowphase pair.
// Counts, per CUDA block, how many pairs produced at least one contact and
// writes that per-block total to gBlockNumPairs, so stage 2 can build an
// exclusive prefix sum over blocks and compact the touching pairs.
extern "C" __global__ void compressContactStage1(
	PxsContactManagerOutput* outputData, //input
	PxU32 numTotalPairs, //input
	PxU32* gBlockNumPairs //output
)
{
	const PxU32 warpPerBlock = PxgNarrowPhaseBlockDims::COMPRESS_CONTACT / WARP_SIZE;
	__shared__ PxU32 sWarpAccum[warpPerBlock]; // per-warp touching-pair count for the current iteration
	__shared__ PxU32 sAccum; // running block-wide total across iterations
	const PxU32 block_size = PxgNarrowPhaseGridDims::COMPRESS_CONTACT;
	// Each block strides over numIterationPerBlock chunks of blockDim.x pairs.
	const PxU32 totalBlockRequired = (numTotalPairs + (blockDim.x - 1)) / blockDim.x;
	const PxU32 numIterationPerBlock = (totalBlockRequired + (block_size - 1)) / block_size;
	const PxU32 idx = threadIdx.x;
	const PxU32 threadIndexInWarp = threadIdx.x&(WARP_SIZE - 1);
	const PxU32 warpIndex = threadIdx.x / WARP_SIZE;
	// Single thread zeroes the block accumulator before the loop.
	if (idx == (WARP_SIZE - 1))
	{
		sAccum = 0;
	}
	__syncthreads();
	for (PxU32 i = 0; i < numIterationPerBlock; ++i)
	{
		const PxU32 workIndex = i * blockDim.x + idx + numIterationPerBlock * blockIdx.x * blockDim.x;
		PxU32 nbContacts = 0;
		if (workIndex < numTotalPairs)
		{
			nbContacts = outputData[workIndex].nbContacts;
		}
		//we don't care about how many contacts are in the header; a pair either
		//has contacts or it doesn't
		const PxU32 contactAccum = __popc(__ballot_sync(FULL_MASK, nbContacts));
		// Last lane of each warp publishes the warp's count to shared memory.
		if (threadIndexInWarp == (WARP_SIZE - 1))
		{
			sWarpAccum[warpIndex] = contactAccum;
			//printf("blockIdx %i warpIndex %i contactAccum %i\n", blockIdx.x, warpIndex, contactAccum);
		}
		__syncthreads();
		// First warpPerBlock threads reduce the per-warp counts into sAccum.
		unsigned mask_idx = __ballot_sync(FULL_MASK, threadIndexInWarp < warpPerBlock);
		if (idx < warpPerBlock)
		{
			const PxU32 value = sWarpAccum[idx];
			const PxU32 res = warpReduction<AddOpPxU32, PxU32>(mask_idx, value);
			if (idx == (warpPerBlock - 1))
			{
				sAccum += res;
			}
		}
		__syncthreads();
	}
	// Publish this block's total touching-pair count.
	if (idx == (warpPerBlock - 1))
	{
		gBlockNumPairs[blockIdx.x] = sAccum;
		//printf("blockIdx.x %i numPairs %i \n", blockIdx.x, sAccum);
	}
}
//PxgNarrowPhaseGridDims::COMPRESS_CONTACT is 32, which means we can use one warp to do the run sum for
//block
// Stage 2 of contact-stream compression. Using the per-block counts from
// stage 1, it computes an exclusive prefix sum over blocks (one warp does
// this, since the grid size equals WARP_SIZE), then compacts every touching
// pair into outputPatchs, translating CPU-side stream pointers into GPU-side
// pointers via the (cpu base, gpu base) pairs passed in.
extern "C" __global__ void compressContactStage2(
	const PxgContactManagerInput* inputData, //input
	const PxsContactManagerOutput* outputData, //input
	const PxU32 numTotalPairs, //input
	PxNodeIndex* shapeToRigidRemapTable, //input
	PxActor** transformCacheIdToActorTable, //input
	PxU32* gBlockNumPairs, //input
	PxU8* cpuCompressedPatchesBase, //input
	PxU8* cpuCompressedContactsBase, //input
	PxReal* cpuForceBufferBase, //input
	PxU8* gpuCompressedPatchesBase, //input
	PxU8* gpuCompressedContactsBase, //input
	PxReal* gpuForceBufferBase, //input
	PxU8* gpuFrictionPatchesBase, //input
	PxU32* numPairs, //input
	PxGpuContactPair* outputPatchs, //output
	const PxU32 maxOutputPatches //input
)
{
	const PxU32 warpPerBlock = PxgNarrowPhaseBlockDims::COMPRESS_CONTACT / WARP_SIZE;
	const PxU32 block_size = PxgNarrowPhaseGridDims::COMPRESS_CONTACT;
	__shared__ PxU32 sWarpAccum[warpPerBlock]; // per-warp exclusive offsets within the current iteration
	__shared__ PxU32 sBlockHistogram[block_size]; // exclusive prefix sum of per-block pair counts
	__shared__ PxU32 sAccum; // pairs already emitted by this block in earlier iterations
	const PxU32 idx = threadIdx.x;
	const PxU32 threadIndexInWarp = threadIdx.x&(WARP_SIZE - 1);
	const PxU32 warpIndex = threadIdx.x / WARP_SIZE;
	if (idx == (WARP_SIZE - 1))
	{
		sAccum = 0;
	}
	//accumulate num pairs per block and compute exclusive run sum
	unsigned mask_idx = __ballot_sync(FULL_MASK, threadIndexInWarp < block_size);
	if (warpIndex == 0 && threadIndexInWarp < block_size)
	{
		const PxU32 blockNumPairs = gBlockNumPairs[threadIndexInWarp];
		const PxU32 result = warpScan<AddOpPxU32, PxU32>(mask_idx, blockNumPairs);
		//store exclusive run sum
		sBlockHistogram[threadIndexInWarp] = result - blockNumPairs;
		//printf("tInd %i blockNumPairs %i result %i\n", threadIndexInWarp, blockNumPairs, result);
		// Last lane holds the inclusive total over all blocks.
		// NOTE(review): this total is written even when it exceeds
		// maxOutputPatches, while writes below are clamped — presumably the
		// consumer clamps against maxOutputPatches again; confirm.
		if (threadIndexInWarp == block_size - 1)
		{
			(*numPairs) = result;
			//printf("blockIdx %i blockNumPairs %i numPairs %i\n", blockIdx.x, blockNumPairs, result);
		}
	}
	__syncthreads();
	const PxU32 totalBlockRequired = (numTotalPairs + (blockDim.x - 1)) / blockDim.x;
	const PxU32 numIterationPerBlock = (totalBlockRequired + (block_size - 1)) / block_size;
	const PxU32 blockStartIndex = sBlockHistogram[blockIdx.x];
	for (PxU32 i = 0; i < numIterationPerBlock; ++i)
	{
		const PxU32 workIndex = i * blockDim.x + idx + numIterationPerBlock * blockIdx.x * blockDim.x;
		PxU32 nbContacts = 0;
		if (workIndex < numTotalPairs)
		{
			nbContacts = outputData[workIndex].nbContacts;
		}
		const PxU32 threadMask = (1 << threadIndexInWarp) - 1; // mask of lanes below this one
		//we don't care about how many contacts are in the header; a pair either
		//has contacts or it doesn't
		const PxU32 contactMask = __ballot_sync(FULL_MASK, nbContacts);
		const PxU32 contactAccum = __popc(contactMask);
		const PxU32 offset = __popc(contactMask & threadMask); // lane's rank among touching lanes in this warp
		if (threadIndexInWarp == (WARP_SIZE - 1))
		{
			sWarpAccum[warpIndex] = contactAccum;
		}
		// Safe to read before the barrier: sAccum was last written before the
		// __syncthreads() that ended the previous iteration (or the initial one).
		const PxU32 prevAccum = sAccum;
		__syncthreads();
		// Exclusive scan of the per-warp counts; last thread bumps the running total.
		unsigned mask_idx = __ballot_sync(FULL_MASK, threadIndexInWarp < warpPerBlock);
		if (idx < warpPerBlock)
		{
			const PxU32 value = sWarpAccum[idx];
			const PxU32 res = warpScan<AddOpPxU32, PxU32>(mask_idx, value);
			sWarpAccum[idx] = res - value;
			if (idx == warpPerBlock - 1)
			{
				sAccum += res;
			}
		}
		__syncthreads();
		//write to output
		if (nbContacts > 0)
		{
			const PxgContactManagerInput& cmInput = inputData[workIndex];
			const PxsContactManagerOutput& cmOutput = outputData[workIndex];
			// Global compacted slot = in-warp rank + prior iterations + prior warps + prior blocks.
			const PxU32 index = offset + prevAccum + sWarpAccum[warpIndex] + blockStartIndex;
			if (index < maxOutputPatches)
			{
				// Convert CPU stream pointers to byte offsets so they can be
				// rebased onto the GPU mirrors of the same streams.
				PxU32 patchStartIndex = cmOutput.contactPatches - cpuCompressedPatchesBase;
				PxU32 contactIndex = cmOutput.contactPoints - cpuCompressedContactsBase;
				PxU32 forceIndex = 0xFFFFFFFF;
				if (cmOutput.contactForces)
					forceIndex = cmOutput.contactForces - cpuForceBufferBase;
				PxGpuContactPair & contact = outputPatchs[index];
				const PxU32 transformCacheRef0 = cmInput.transformCacheRef0;
				const PxU32 transformCacheRef1 = cmInput.transformCacheRef1;
				contact.transformCacheRef0 = transformCacheRef0;
				contact.transformCacheRef1 = transformCacheRef1;
				contact.nodeIndex0 = shapeToRigidRemapTable[transformCacheRef0];
				contact.nodeIndex1 = shapeToRigidRemapTable[transformCacheRef1];
				contact.contactForces = gpuForceBufferBase + forceIndex;
				// Friction patches are parallel to contact patches: same element
				// index, different element size.
				contact.frictionPatches = gpuFrictionPatchesBase + patchStartIndex / sizeof(PxContactPatch) * sizeof(PxFrictionPatch);
				contact.contactPatches = gpuCompressedPatchesBase + patchStartIndex;
				contact.contactPoints = gpuCompressedContactsBase + contactIndex;
				contact.nbPatches = cmOutput.nbPatches;
				contact.nbContacts = cmOutput.nbContacts;
				contact.actor0 = transformCacheIdToActorTable[transformCacheRef0];
				contact.actor1 = transformCacheIdToActorTable[transformCacheRef1];
			}
		}
	}
}
// update frictionPatches CPU pointers from contactPatches CPU pointers
// One thread per pair: mirrors each pair's contact-patch byte offset into the
// friction-patch stream. The two streams are parallel per element, but the
// element sizes differ, hence the sizeof ratio rescale of the byte offset.
extern "C" __global__ void updateFrictionPatches(
	const PxU32 pairCount, //input
	const PxU8* PX_RESTRICT startContactPatches, //input
	PxU8* PX_RESTRICT startFrictionPatches, //input (but we need it non-const)
	PxsContactManagerOutput* PX_RESTRICT outputs //input/output
)
{
	const PxU32 globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
	if (globalIdx >= pairCount)
		return;
	PxsContactManagerOutput& out = outputs[globalIdx];
	out.frictionPatches = startFrictionPatches + (out.contactPatches - startContactPatches) * sizeof(PxFrictionPatch) / sizeof(PxContactPatch);
}

View File

@@ -0,0 +1,182 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __CONTACT_PATCH_UTILS_CUH__
#define __CONTACT_PATCH_UTILS_CUH__
#include "materialCombiner.cuh"
#include "PxsContactManagerState.h"
#include "PxgContactManager.h"
// Reserves room in the shared contact-point and force streams for this pair's
// nbContacts contacts (via atomic byte counters) and stores the resulting
// stream pointers on the pair's PxsContactManagerOutput.
// Returns the byte offset of the reservation in the contact stream, or
// 0xFFFFFFFF if the contact buffer overflowed (or nbContacts == 0, in which
// case nothing is touched).
PX_FORCE_INLINE __device__ PxU32 setContactPointAndForcePointers(PxsContactManagerOutput* PX_RESTRICT cmOutputs, PxgPatchAndContactCounters* PX_RESTRICT patchAndContactCounters,
	PxU8* PX_RESTRICT startContactPoints, PxU8* PX_RESTRICT startContactForces, PxU32 contactBytesLimit, PxU32 forceBytesLimit, PxU32 workIndex, PxU32 nbContacts)
{
	PxU32 contactByteOffset = 0xFFFFFFFF;
	PxU32 forceAndIndiceByteOffset = 0xFFFFFFFF;
	if (nbContacts)
	{
		PxsContactManagerOutput* output = &cmOutputs[workIndex];
		// Atomically claim nbContacts elements in each stream.
		contactByteOffset = atomicAdd(&(patchAndContactCounters->contactsBytes), sizeof(PxContact) * nbContacts);
		forceAndIndiceByteOffset = atomicAdd(&(patchAndContactCounters->forceAndIndiceBytes), sizeof(PxU32) * nbContacts);
		// NOTE(review): these overflow tests only verify that a single element
		// fits, although nbContacts elements were reserved above — presumably
		// the limits carry enough padding for a max-size patch; confirm.
		if ((contactByteOffset + sizeof(PxContact)) > contactBytesLimit)
		{
			patchAndContactCounters->setOverflowError(PxgPatchAndContactCounters::CONTACT_BUFFER_OVERFLOW);
			contactByteOffset = 0xFFFFFFFF; //overflow
		}
		if ((forceAndIndiceByteOffset + sizeof(PxU32)) > forceBytesLimit)
		{
			patchAndContactCounters->setOverflowError(PxgPatchAndContactCounters::FORCE_BUFFER_OVERFLOW);
			forceAndIndiceByteOffset = 0xFFFFFFFF; //overflow
		}
		// NOTE(review): only the contact offset is checked here; if only the
		// force buffer overflowed, contactForces is computed from the
		// 0xFFFFFFFF sentinel — verify downstream consumers tolerate this.
		if (contactByteOffset != 0xFFFFFFFF)
		{
			output->contactForces = reinterpret_cast<PxReal*>(startContactForces + forceAndIndiceByteOffset);
			output->contactPoints = startContactPoints + contactByteOffset;
		}
		else
		{
			output->contactForces = NULL;
			output->contactPoints = NULL;
		}
	}
	return contactByteOffset;
}
// Updates the pair's packed status word (allflagsStart: prevPatches |
// statusFlags | numPatches | 0, packed via merge()), records touch/patch
// change flags for event reporting, and atomically claims one PxContactPatch
// in the patch stream when the pair has contacts.
// Returns the byte offset of the claimed patch, or 0xFFFFFFFF if there are no
// contacts or the patch buffer overflowed (in which case the pair is demoted
// to "no touch" and its contact count cleared).
PX_FORCE_INLINE __device__ PxU32 registerContactPatch(PxsContactManagerOutput* PX_RESTRICT cmOutputs, PxgPatchAndContactCounters* PX_RESTRICT patchAndContactCounters,
	PxU32* PX_RESTRICT touchChangeFlags, PxU32* PX_RESTRICT patchChangeFlags, PxU8* PX_RESTRICT startContactPatches, PxU32 patchBytesLimit, PxU32 workIndex, PxU32 nbContacts)
{
	// Read the packed flags word and extract the previous status byte.
	PxU32 allflags = reinterpret_cast<PxU32*>(&((cmOutputs + workIndex)->allflagsStart))[0];
	PxU8 oldStatusFlags = u16Low(u32High(allflags));
	PxU8 statusFlags = oldStatusFlags;
	// Replace the touch-known bits with the new touch state.
	statusFlags &= (~PxsContactManagerStatusFlag::eTOUCH_KNOWN);
	if (nbContacts)
		statusFlags |= PxsContactManagerStatusFlag::eHAS_TOUCH;
	else
		statusFlags |= PxsContactManagerStatusFlag::eHAS_NO_TOUCH;
	PxU8 prevPatches = u16High(u32Low(allflags)); //Get out the current number of patches to store as the previous frame's number of patches
	bool previouslyHadTouch = oldStatusFlags & PxsContactManagerStatusFlag::eHAS_TOUCH;
	bool prevTouchKnown = oldStatusFlags & PxsContactManagerStatusFlag::eTOUCH_KNOWN;
	// GPU narrowphase emits at most one patch per pair here.
	PxU8 numPatches = (nbContacts > 0) ? 1 : 0;
	bool currentlyHasTouch = nbContacts > 0;
	// Touch changed if the touch state flipped or was never known.
	const bool change = (previouslyHadTouch ^ currentlyHasTouch) || (!prevTouchKnown);
	touchChangeFlags[workIndex] = change;
	patchChangeFlags[workIndex] = (prevPatches != numPatches);
	// Re-pack: [prevPatches | statusFlags | numPatches | 0].
	reinterpret_cast<PxU32*>(&((cmOutputs + workIndex)->allflagsStart))[0] = merge(merge(prevPatches, statusFlags),
		merge(numPatches, 0));
	cmOutputs[workIndex].nbContacts = nbContacts;
	//fill in patch information
	PxU32 patchIndex = 0xFFFFFFFF;
	if (nbContacts)
	{
		patchIndex = atomicAdd(&(patchAndContactCounters->patchesBytes), sizeof(PxContactPatch));
		if ((patchIndex + sizeof(PxContactPatch)) > patchBytesLimit)
		{
			// Overflow: roll the pair back to "no touch, zero patches" and
			// restore the change flags relative to the previous frame.
			patchAndContactCounters->setOverflowError(PxgPatchAndContactCounters::PATCH_BUFFER_OVERFLOW);
			patchIndex = 0xFFFFFFFF; //overflow
			statusFlags &= (~PxsContactManagerStatusFlag::eTOUCH_KNOWN);
			statusFlags |= PxsContactManagerStatusFlag::eHAS_NO_TOUCH;
			reinterpret_cast<PxU32*>(&((cmOutputs + workIndex)->allflagsStart))[0] = merge(merge(prevPatches, statusFlags), 0);
			cmOutputs[workIndex].nbContacts = 0;
			touchChangeFlags[workIndex] = previouslyHadTouch;
			patchChangeFlags[workIndex] = prevPatches != 0;
		}
		else
		{
			(cmOutputs + workIndex)->contactPatches = startContactPatches + patchIndex;
		}
	}
	return patchIndex;
}
// Writes one PxContactPatch into the patch stream at the given byte offset,
// filling in the contact normal, contact count, and the material blended from
// the two shapes. A patchIndex of 0xFFFFFFFF marks a failed allocation
// (buffer overflow) and results in no write.
PX_FORCE_INLINE __device__ void insertIntoPatchStream(const PxsMaterialData* PX_RESTRICT materials, PxU8* PX_RESTRICT patchStream,
	const PxgShape& shape0, const PxgShape& shape1, PxU32 patchIndex, const PxVec3& normal, PxU32 nbContacts)
{
	if (patchIndex == 0xFFFFFFFF)
		return;
	const PxU16 matId0 = shape0.materialIndex;
	const PxU16 matId1 = shape1.materialIndex;
	// Blend the two shapes' surface properties into one contact material.
	PxReal restitution, dynamicFriction, staticFriction, damping;
	PxU32 materialFlags;
	combineMaterials(materials, matId0,
		matId1,
		materialFlags,
		staticFriction,
		dynamicFriction,
		restitution,
		damping
	);
	PxContactPatch& patch = *reinterpret_cast<PxContactPatch*>(patchStream + patchIndex);
	patch.normal = normal;
	patch.nbContacts = nbContacts;
	patch.startContactIndex = 0;
	patch.staticFriction = staticFriction;
	patch.dynamicFriction = dynamicFriction;
	patch.damping = damping;
	patch.restitution = restitution;
	patch.materialIndex0 = matId0;
	patch.materialIndex1 = matId1;
	assert(materialFlags <= PX_MAX_U8);
	patch.materialFlags = PxU8(materialFlags);
	patch.internalFlags = 0;
	// Unit mass modifiers: no per-contact mass scaling applied.
	patch.mMassModification.linear0 = 1.0f;
	patch.mMassModification.linear1 = 1.0f;
	patch.mMassModification.angular0 = 1.0f;
	patch.mMassModification.angular1 = 1.0f;
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,394 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "foundation/PxBounds3.h"
#include "foundation/PxSimpleTypes.h"
#include "foundation/PxTransform.h"
#include "geometry/PxGeometry.h"
#include "geometry/PxHeightFieldSample.h"
#include "PxgContactManager.h"
#include "PxgNpKernelIndices.h"
#include "PxgPersistentContactManifold.h"
#include "PxgSimulationCoreDesc.h"
#include "PxsContactManagerState.h"
#include "PxsTransformCache.h"
#include "convexNpCommon.h"
#include "cudaNpCommon.h"
#include "PxgCommonDefines.h"
#include "dataReadWriteHelper.cuh"
#include "heightfieldUtil.cuh"
#include "manifold.cuh"
#include "midphaseAllocate.cuh"
#include <vector_types.h>
using namespace physx;
extern "C" __host__ void initNarrowphaseKernels0() {}
PX_ALIGN_PREFIX(16)
// Per-warp shared-memory scratch for the heightfield midphase: caches the
// convex shape's transform in the heightfield's local (unscaled) frame plus
// the two shapes' material indices, so all lanes of the warp can read them.
// (The "HeigtField" spelling is a typo, but the name is referenced elsewhere
// and must stay as-is.)
struct HeigtFieldDataScratch
{
	PxTransform convexToHeightfieldNoScale; // convex -> heightfield local space, scale not applied
	PxU32 convexShape_materialIndex;
	PxU32 heightfieldShape_materialIndex;
}PX_ALIGN_SUFFIX(16);
// Midphase for convex/sphere/capsule vs. heightfield, one warp per contact
// manager. If the cached persistent multi-manifold is still valid the pair is
// marked CONVEX_TRIMESH_CACHED; otherwise the warp sweeps the heightfield
// cells overlapping the shape's inflated bounds and appends one
// (cmIdx, triangleIndex, localPairIdx, shapeRef1) tuple per candidate triangle
// to stackBasePtr for the full contact-gen pass, clamped to maxPairs.
// NOTE(review): the WarpsPerBlock template parameter and the cmOutputs
// argument are not referenced in this body.
template<unsigned int WarpsPerBlock>
__device__ static inline void heightfieldMidphaseCore(
	PxU32 numContactManagers,
	const PxReal toleranceLength,
	const PxgContactManagerInput* PX_RESTRICT cmInputs,
	const PxsCachedTransform* PX_RESTRICT transformCache,
	const PxBounds3* PX_RESTRICT bounds,
	const PxReal* PX_RESTRICT contactDistance,
	const PxgShape* PX_RESTRICT gpuShapes,
	ConvexMeshPair* PX_RESTRICT cvxTrimeshPair,
	PxgPersistentContactMultiManifold* PX_RESTRICT multiManifolds,
	PxsContactManagerOutput * PX_RESTRICT cmOutputs,
	uint4* PX_RESTRICT stackBasePtr,
	PxU32* PX_RESTRICT nbPairsFound,
	PxU32* PX_RESTRICT midphasePairsNum,
	PxU32* PX_RESTRICT midphasePairsNumPadded,
	HeigtFieldDataScratch* s_warpScratch,
	const PxU32 maxPairs
)
{
	//thread index in warp
	const unsigned int idxInWarp = threadIdx.x;
	//warp index
	const unsigned int warpIdx = threadIdx.y;
	unsigned int cmIdx = warpIdx + blockIdx.x * blockDim.y;
	//this is number of contact managers
	PxU32 nbPairsPerCM = 0;
	if (cmIdx < numContactManagers)
	{
		PxgContactManagerInput npWorkItem;
		PxgContactManagerInput_ReadWarp(npWorkItem, cmInputs, cmIdx);
		PxsCachedTransform transformCached, heightfieldTransformCached;
		PxsCachedTransform_ReadWarp(transformCached, transformCache + npWorkItem.transformCacheRef0);
		PxsCachedTransform_ReadWarp(heightfieldTransformCached, transformCache + npWorkItem.transformCacheRef1);
		const PxTransform shapeToHeightfieldNoScale = heightfieldTransformCached.transform.transformInv(transformCached.transform);
		//read convex/sphere shape
		PxgShape shape;
		PxgShape_ReadWarp(shape, gpuShapes + npWorkItem.shapeRef0);
		// Manifold-invalidation tolerances depend on the shape type: spheres and
		// capsules use their radius as the margin; convexes derive the margin
		// from their extents.
		PxReal ratio, minMargin, breakingThresholdRatio;
		if (shape.type == PxGeometryType::eSPHERE)
		{
			minMargin = shape.scale.scale.x; //sphere radius
			ratio = 0.02f;
			breakingThresholdRatio = 0.05f;
		}
		else if (shape.type == PxGeometryType::eCAPSULE)
		{
			minMargin = shape.scale.scale.y; //capsule radius
			ratio = 0.02f; // was a double literal (0.02); use a float literal like the other branches
			breakingThresholdRatio = 0.05f;
		}
		else
		{
			PxU8* hullPtr = reinterpret_cast<PxU8*>(shape.hullOrMeshPtr);
			const float4 extents4_f = *reinterpret_cast<float4*>(hullPtr + sizeof(float4) * 2);
			minMargin = calculatePCMConvexMargin(extents4_f, shape.scale.scale, toleranceLength);
			ratio = 0.2f;
			breakingThresholdRatio = 0.8f;
		}
		/*PxU8* hullPtr = reinterpret_cast<PxU8*>(convexShape.hullOrMeshPtr);
		const float4 extents4_f = *reinterpret_cast<float4*>(hullPtr + sizeof(float4) * 2);
		const PxReal minMargin = calculatePCMConvexMargin(extents4_f, convexShape.scale.scale, toleranceLength);*/
		// Decide whether the cached manifold can be reused: if the relative pose
		// moved too far the manifold is invalidated; otherwise refresh it and
		// check whether any cached contacts were lost.
		bool lostContacts = false;
		const bool invalidate = invalidateManifold(shapeToHeightfieldNoScale, multiManifolds[cmIdx], minMargin, ratio);
		if (!invalidate)
		{
			const PxReal projectBreakingThreshold = minMargin * breakingThresholdRatio;
			lostContacts = refreshManifolds(
				shapeToHeightfieldNoScale,
				projectBreakingThreshold,
				multiManifolds + cmIdx
			);
		}
		bool fullContactGen = invalidate || lostContacts;
		//read heightfield shape
		PxgShape heightFieldShape;
		PxgShape_ReadWarp(heightFieldShape, gpuShapes + npWorkItem.shapeRef1);
		// Lane 0 publishes the pair's transform/material data to warp scratch.
		if (idxInWarp == 0)
		{
			s_warpScratch->convexToHeightfieldNoScale = shapeToHeightfieldNoScale;
			s_warpScratch->convexShape_materialIndex = shape.materialIndex;
			s_warpScratch->heightfieldShape_materialIndex = heightFieldShape.materialIndex;
		}
		__syncwarp();
		//if invalidate is true, generate full contacts
		if (fullContactGen)
		{
			// Heightfield layout: [nbRows, nbCols, samples...].
			PxU32* heightfieldData = reinterpret_cast<PxU32*>(heightFieldShape.hullOrMeshPtr);
			const PxU32 nbRows = heightfieldData[0];
			const PxU32 nbCols = heightfieldData[1];
			PxHeightFieldSample* samples = reinterpret_cast<PxHeightFieldSample*>(&heightfieldData[2]);
			const PxReal oneOverHeightScale = 1.f / heightFieldShape.scale.scale.y;
			const PxReal oneOverRowScale = 1.f / PxAbs(heightFieldShape.scale.scale.x);
			const PxReal oneOverlColScale = 1.f / PxAbs(heightFieldShape.scale.scale.z);
			PxBounds3 worldBound;
			PxBounds3_ReadWarp(worldBound, bounds + npWorkItem.transformCacheRef0);
			//bound is in world space, we need to transform the bound to the local space of height field
			PxBounds3 localBound = PxBounds3::transformFast(heightfieldTransformCached.transform.getInverse(), worldBound);
			const PxReal contactDist = contactDistance[npWorkItem.transformCacheRef0] + contactDistance[npWorkItem.transformCacheRef1];
			localBound.fattenFast(contactDist);
			// Move the bounds into unscaled heightfield grid units.
			localBound.minimum.x *= oneOverRowScale;
			localBound.minimum.y *= oneOverHeightScale;
			localBound.minimum.z *= oneOverlColScale;
			localBound.maximum.x *= oneOverRowScale;
			localBound.maximum.y *= oneOverHeightScale;
			localBound.maximum.z *= oneOverlColScale;
			//row scale: a negative scale mirrors the axis, so min/max swap
			if (heightFieldShape.scale.scale.x < 0.f)
			{
				//swap min and max row scale
				const PxReal temp = localBound.minimum.x;
				localBound.minimum.x = localBound.maximum.x;
				localBound.maximum.x = temp;
			}
			//col scale
			if (heightFieldShape.scale.scale.z < 0.f)
			{
				PxReal swap = localBound.minimum.z;
				localBound.minimum.z = localBound.maximum.z;
				localBound.maximum.z = swap;
			}
			bool boundsDontOverlap = false;
			// this tests if the complete shape is outside of the bounds of the HF in the XZ plane.
			if ((localBound.minimum.x > nbRows - 1) || (localBound.minimum.z > nbCols - 1)
				|| (localBound.maximum.x < 0) || (localBound.maximum.z < 0))
			{
				boundsDontOverlap = true;
			}
			if (!boundsDontOverlap)
			{
				PxU32 minRow = getMinRow(localBound.minimum.x, nbRows);
				PxU32 maxRow = getMaxRow(localBound.maximum.x, nbRows);
				PxU32 minColumn = getMinColumn(localBound.minimum.z, nbCols);
				PxU32 maxColumn = getMaxColumn(localBound.maximum.z, nbCols);
				bool noTriangles = false;
				// AD: This test whether we have any triangles at all.
				// Given the clamping above I can only see this happening if we have a
				// flat shape that lies exactly on one of the grid lines for sampling.
				// Also, the 2x looks unnecessary here if my basic math isn't failing me.
				// This is the same code as CPU and has been there since 2009, probably before,
				// so I'm not going to change it now.
				if ((2 * (maxColumn - minColumn) * (maxRow - minRow)) == 0)
				{
					noTriangles = true;
				}
				if (!noTriangles)
				{
					const PxReal miny = localBound.minimum.y;
					const PxReal maxy = localBound.maximum.y;
					const PxU32 columnSpan = maxColumn - minColumn;
					//we have two materials corresponding to one vertexIndex, so each thread will deal with one of the materials
					const PxU32 totalNumProcessed = (maxRow - minRow) * columnSpan * 2;
					// Warp-strided loop: each lane tests one (cell, material) slot.
					for (PxU32 i = 0; i < totalNumProcessed; i += WARP_SIZE)
					{
						bool result = false;
						PxU32 triangleIndex = 0xFFffFFff;
						const PxU32 workIndex = idxInWarp + i;
						if (workIndex < totalNumProcessed)
						{
							const PxU32 index = workIndex / 2;
							const PxU32 vertexIndex = (minRow + index / columnSpan) * nbCols + (minColumn + index % columnSpan);
							assert(isValidVertex(vertexIndex, nbRows, nbCols));
							// Heights of the cell's four corner samples.
							PxReal h0 = getHeight(vertexIndex, samples);
							PxReal h1 = getHeight(vertexIndex + 1, samples);
							PxReal h2 = getHeight(vertexIndex + nbCols, samples);
							PxReal h3 = getHeight(vertexIndex + nbCols + 1, samples);
							// Reject the cell if the bounds lie entirely above or below it.
							const bool con0 = maxy < h0 && maxy < h1 && maxy < h2 && maxy < h3;
							const bool con1 = miny > h0 && miny > h1 && miny > h2 && miny > h3;
							if (!(con0 || con1))
							{
								const PxHeightFieldSample& sample = getSample(vertexIndex, samples);
								const bool isMaterial1 = (workIndex & 1) ? 1 : 0;
								PxU32 material = isMaterial1 ? sample.materialIndex1 : sample.materialIndex0;
								if (material != PxHeightFieldMaterial::eHOLE)
								{
									triangleIndex = isMaterial1 ? ((vertexIndex << 1) + 1) : (vertexIndex << 1);
									result = true;
								}
							}
						}
						// Warp-wide compaction of this iteration's hits.
						PxU32 resultWarp = __ballot_sync(FULL_MASK, result);
						PxU32 offset = warpScanExclusive(resultWarp, idxInWarp);
						PxU32 validCount = __popc(resultWarp);
						// Allocate only amount of memory, needed for single warp-wide write
						PxU32 prevNbPairs = 0xFFffFFff;
						if (idxInWarp == 0 && validCount > 0)
						{
							prevNbPairs = atomicAdd(nbPairsFound, validCount);
						}
						prevNbPairs = __shfl_sync(FULL_MASK, prevNbPairs, 0);
						// Drop pairs that would exceed the stack capacity.
						if (result && (prevNbPairs + offset) < maxPairs)
						{
							stackBasePtr[prevNbPairs + offset] = make_uint4(cmIdx, triangleIndex, nbPairsPerCM + offset, npWorkItem.shapeRef1);
						}
						if ((validCount > 0) && ((validCount + prevNbPairs) >= maxPairs))
						{
							validCount = PxMax(maxPairs, prevNbPairs) - prevNbPairs;
						}
						assert(((validCount + prevNbPairs) <= maxPairs) || (validCount == 0));
						nbPairsPerCM += validCount;
					}
				} // noTriangles
			} // boundsDontOverlap
		}
		// Reserve this pair's slice in the midphase pair arrays; the padded
		// counter rounds up to a multiple of 4 and doubles the space.
		PxU32 prevIntermArraysOffset = 0xFFffFFff;
		PxU32 prevIntermArraysPaddedOffset = 0xFFffFFff;
		if (idxInWarp == 0 && nbPairsPerCM > 0)
		{
			prevIntermArraysOffset = atomicAdd(midphasePairsNum, nbPairsPerCM);
			prevIntermArraysPaddedOffset = atomicAdd(midphasePairsNumPadded, ((nbPairsPerCM + 3)&(~3)) * 2); // AD: we need 2x space for the radix sort.
		}
		prevIntermArraysOffset = __shfl_sync(FULL_MASK, prevIntermArraysOffset, 0);
		prevIntermArraysPaddedOffset = __shfl_sync(FULL_MASK, prevIntermArraysPaddedOffset, 0);
		// Write the pair descriptor; a cached manifold is flagged via the count
		// field (CONVEX_TRIMESH_CACHED) instead of a triangle count.
		ConvexMeshPair pairInfo;
		pairInfo.aToB = s_warpScratch->convexToHeightfieldNoScale;
		pairInfo.cmIndex = cmIdx;
		pairInfo.startIndex = prevIntermArraysOffset;
		pairInfo.count = fullContactGen ? nbPairsPerCM : CONVEX_TRIMESH_CACHED;
		pairInfo.roundedStartIndex = prevIntermArraysPaddedOffset;
		pairInfo.materialIndices = make_uint2(s_warpScratch->convexShape_materialIndex, s_warpScratch->heightfieldShape_materialIndex);
		ConvexMeshPair_WriteWarp(cvxTrimeshPair + cmIdx, pairInfo);
		assert(*midphasePairsNum <= maxPairs);
	}
}
// Kernel entry point for the convex-vs-heightfield midphase: sets up per-warp
// shared scratch, derives the pair capacity from the stack size, and forwards
// everything to heightfieldMidphaseCore.
extern "C" __global__ void convexHeightFieldMidphase(
	PxU32 numContactManagers,
	const PxReal toleranceLength,
	const PxgContactManagerInput* PX_RESTRICT cmInputs,
	const PxsCachedTransform* PX_RESTRICT transformCache,
	const PxBounds3* PX_RESTRICT bounds,
	const PxReal* PX_RESTRICT contactDistance,
	const PxgShape* PX_RESTRICT gpuShapes,
	ConvexMeshPair* PX_RESTRICT cvxTrimeshPair,
	PxgPersistentContactMultiManifold* PX_RESTRICT multiManifolds,
	PxsContactManagerOutput* PX_RESTRICT cmOutputs,
	PxU8* PX_RESTRICT stackPtr,
	PxU32* PX_RESTRICT stackOffset,
	PxU32* PX_RESTRICT midphasePairsNum,
	PxU32* PX_RESTRICT midphasePairsNumPadded,
	const PxU32 stackSizeBytes
)
{
	// One scratch row per warp, reinterpreted as HeigtFieldDataScratch by the core.
	__shared__ PxU32 sScratch[MIDPHASE_WARPS_PER_BLOCK][WARP_SIZE * 2];
	const PxU32 pairCapacity = calculateMaxPairs(stackSizeBytes, numContactManagers);
	heightfieldMidphaseCore<MIDPHASE_WARPS_PER_BLOCK>(
		numContactManagers, toleranceLength, cmInputs, transformCache,
		bounds, contactDistance, gpuShapes, cvxTrimeshPair,
		multiManifolds, cmOutputs,
		reinterpret_cast<uint4*>(stackPtr), stackOffset,
		midphasePairsNum, midphasePairsNumPadded,
		reinterpret_cast<HeigtFieldDataScratch*>(sScratch[threadIdx.y]),
		pairCapacity);
}

View File

@@ -0,0 +1,997 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include <cuda.h>
#include <cuda_runtime.h>
#include <assert.h>
#include "foundation/PxVec3.h"
#include "foundation/PxTransform.h"
#include "geometry/PxMeshScale.h"
#include "convexNpCommon.h"
#include "cudaNpCommon.h"
#include "PxgPersistentContactManifold.h"
#include "PxgContactManager.h"
#include "PxgConvexConvexShape.h"
#include "PxgNpKernelIndices.h"
#include "PxsTransformCache.h"
#include "PxgCommonDefines.h"
#include "reduction.cuh"
#include "nputils.cuh"
#include "midphaseAllocate.cuh"
#include "dataReadWriteHelper.cuh"
#include "convexTriangle.cuh"
#include "heightfieldUtil.cuh"
#include "sphereTriangle.cuh"
#include "sphereCollision.cuh"
#include "capsuleTriangle.cuh"
#include "geometry/PxHeightFieldFlag.h"
#include "geometry/PxGeometry.h"
using namespace physx;
// Empty host-side stub; presumably exists so this translation unit's kernels get
// pulled in / registered by the module loader -- TODO confirm against the kernel
// registration mechanism.
extern "C" __host__ void initNarrowphaseKernels1() {}
////ML: for heightfield, the maximum numbers of adjacent verts will be 6
//#define HEIGHTFIELD_MAX_ADJACENCY_VERT_INDICE 6
//#define HEIGHTFIELD_MAX_PROCESS_ADJACENCY_VERT_INDICE 9
#define HEIGHTFIELD_MAX_VERTS 18 //allow duplication
//
//#define HEIGHTFIELD_DEBUG 1
// Minimal triangle representation (three vertices) used by the height-field
// collision routines in this file.
struct PxgTriangle
{
public:
	// Face normal scaled by twice the triangle's area (i.e. NOT normalized);
	// callers normalize it themselves when they need a unit normal.
	__device__ PX_FORCE_INLINE PxVec3 denormalizedNormal() const
	{
		const PxVec3 edge01 = verts[1] - verts[0];
		const PxVec3 edge02 = verts[2] - verts[0];
		return edge01.cross(edge02);
	}

	PxVec3 verts[3];
};
// Fetches one height-field triangle into shape space.
// - triLoc:            out, the three triangle vertices after applying the
//                      height-field scale (hf2shapep).
// - adjacencyIndices:  optional out (may be NULL), the indices of the three
//                      triangles adjacent to this one.
// - triangleIndex:     height-field triangle index to fetch.
// - scale:             height-field scale; x = row scale, y = height scale,
//                      z = column scale.
// - rows/columns/samples: height-field dimensions and sample array.
//
// Note: the previously computed-but-unused `handedness` vector was removed;
// only the winding flip (`wrongHanded`) affects the output.
__device__ static void getTriangle(PxgTriangle& triLoc, PxU32* adjacencyIndices, const PxU32 triangleIndex, const PxMeshScale& scale, const PxU32 rows, const PxU32 columns, const PxHeightFieldSample* samples)
{
	const PxReal rowScale = scale.scale.x;
	const PxReal heightScale = scale.scale.y;
	const PxReal columnScale = scale.scale.z;

	// A negative row or column scale mirrors the height field; each mirroring
	// flips the triangle winding, so track the net flip to keep the output
	// winding consistent.
	bool wrongHanded = false;
	if (columnScale < 0.f)
	{
		wrongHanded = !wrongHanded;
	}
	if (rowScale < 0.f)
	{
		wrongHanded = !wrongHanded;
	}

	// When the winding is flipped, vertices 1 and 2 (and the matching adjacency
	// slots) are swapped via the bool-to-int arithmetic below.
	PxU32 tVertexIndices[3];
	getTriangleVertexIndices(triangleIndex, tVertexIndices[0], tVertexIndices[1 + wrongHanded], tVertexIndices[2 - wrongHanded], columns, samples);

	if (adjacencyIndices)
	{
		getTriangleAdjacencyIndices(triangleIndex, adjacencyIndices[wrongHanded ? 2 : 0], adjacencyIndices[1], adjacencyIndices[wrongHanded ? 0 : 2], rows, columns, samples);
	}

	// Scale each vertex from height-field cell coordinates into shape space.
	for (PxU32 vi = 0; vi < 3; vi++)
	{
		const PxVec3 vertex = getVertex(tVertexIndices[vi], columns, samples);
		triLoc.verts[vi] = hf2shapep(vertex, rowScale, heightScale, columnScale);
	}
}
// Loads one height-field triangle (plus its three adjacent-triangle indices)
// and stores it, transformed into the convex shape's local space, into the
// warp's ConvexMeshScratch. Thread 0 writes the scalar members and threads
// 0..2 write the per-vertex/per-adjacency data; the caller synchronizes the
// warp afterwards (see convexHeightfieldNarrowphaseCore).
__device__ void buildTriangleInformations(const PxU32 triangleIdx, PxgShape& heightfieldShape, ConvexMeshScratch* s_scratch,
	PxTransform& trimeshToConvexTransform)
{
	// Height-field blob layout as read here: rows, columns, then the sample array.
	const PxU32 * heightfieldGeomPtr = reinterpret_cast<const PxU32 *>(heightfieldShape.hullOrMeshPtr);
	const PxU32 rows = *heightfieldGeomPtr++;
	const PxU32 columns = *heightfieldGeomPtr++;
	const PxHeightFieldSample* samples = reinterpret_cast<const PxHeightFieldSample*>(heightfieldGeomPtr);

	//PxU32 vertexIndices[3];
	PxU32 adjacencyIndices[3];
	PxgTriangle triLocal; // height field local space triangle
	getTriangle(triLocal, adjacencyIndices, triangleIdx, heightfieldShape.scale, rows, columns, samples);

	// Triangle normal rotated into the convex shape's local frame.
	const PxVec3 triNormal = triLocal.denormalizedNormal();
	const PxVec3 triangleLocNormal = trimeshToConvexTransform.rotate(triNormal).getNormalized();

	if (threadIdx.x < 1)
	{
		s_scratch->triangleLocNormal = triangleLocNormal;
		s_scratch->trimeshToConvexTransform = trimeshToConvexTransform;
		s_scratch->trimeshVerts = NULL;
		//s_scratch->trimeshVerts = trimeshVerts;
	}
	if (threadIdx.x < 3)
	{
		// Assign the adjacent triangle indices to the scratch memory (one per lane).
		reinterpret_cast<PxU32*>(&s_scratch->triAdjTrisIdx)[threadIdx.x] = adjacencyIndices[threadIdx.x];
		// Assign the triangle vertices, transformed into the convex hull's local
		// space, to the scratch memory (one per lane).
		s_scratch->triLocVerts[threadIdx.x] = trimeshToConvexTransform.transform(triLocal.verts[threadIdx.x]);
	}
}
// Warp-wide narrowphase for a single (convex, height-field triangle) pair
// produced by the midphase: loads the pair record and both cached transforms/
// shapes, stages the triangle data and the convex-hull header into per-warp
// shared scratch, then delegates contact generation to convexTriangleContactGen.
__device__ void convexHeightfieldNarrowphaseCore(
	PxU32 globalWarpIndex,
	const PxgContactManagerInput* PX_RESTRICT cmInputs,
	const PxsCachedTransform* PX_RESTRICT transformCache,
	const PxReal* PX_RESTRICT contactDistance,
	const PxgShape* PX_RESTRICT gpuShapes,
	ConvexMeshPair* PX_RESTRICT cvxTrimeshPair,					// per CM
	ConvexTriNormalAndIndex** PX_RESTRICT cvxTriNIPtr,			// per cvx-tri
	ConvexTriContacts** PX_RESTRICT cvxTriContactsPtr,			// per cvx-tri
	PxReal** PX_RESTRICT cvxTriMaxDepthPtr,						// per cvx-tri
	ConvexTriIntermediateData** PX_RESTRICT cvxTriIntermPtr,	// per cvx-tri
	PxU32** PX_RESTRICT orderedCvxTriIntermPtr,					// per cvx-tri
	PxU32** PX_RESTRICT cvxTriSecondPassPairPtr,				// per cvx-tri
	const uint4 * PX_RESTRICT pairsGPU,
	PxU32* PX_RESTRICT nbPairsGlobal,
	PxU32* PX_RESTRICT nbSecondPassPairs,
	ConvexTriContact* tempConvexTriContacts,
	const PxU32 tempContactSizeBytes,
	PxU32* pTempContactIndex
)
{
	const PxU32 warpsPerBlock = NP_TRIMESH_WARPS_PER_BLOCK;
	const PxU32 wrapIndex = threadIdx.y;	// index of this warp within the block (sic: "warp")

	// For a heightfield, each vertex in a triangle has a maximum of 6 adjacent
	// vertices, which involves 12 vertices in total.
	//__shared__ PxU32 sTrimeshAdjVerts[warpsPerBlock * HEIGHTFIELD_MAX_VERTS];

	// Per-warp scratch (WARP_SIZE * 16 words), viewed as ConvexMeshScratch below.
	__shared__ volatile PxU32 sharedMem[warpsPerBlock][ WARP_SIZE * 16];
	volatile PxU32* s_WarpSharedMemory = sharedMem[wrapIndex];
	ConvexMeshScratch* s_scratch = (ConvexMeshScratch*)s_WarpSharedMemory;

	// Pair record: x = contact manager index, y = triangle index, z = offset of
	// this test within the contact manager's pair range (see the midphase writer).
	uint4 curPair = pairsGPU[globalWarpIndex];
	PxU32 cmIdx = curPair.x;
	PxU32 triangleIdx = curPair.y;
	PxU32 testOffset = curPair.z;

	PxgContactManagerInput npWorkItem;
	PxgContactManagerInput_ReadWarp(npWorkItem, cmInputs, cmIdx);

	PxsCachedTransform convexTransformCached, trimeshTransformCached;
	PxsCachedTransform_ReadWarp(convexTransformCached, transformCache + npWorkItem.transformCacheRef0);
	PxsCachedTransform_ReadWarp(trimeshTransformCached, transformCache + npWorkItem.transformCacheRef1);

	// Height-field (shape 1) local space -> convex (shape 0) local space.
	PxTransform trimeshToConvexTransform(convexTransformCached.transform.transformInv(trimeshTransformCached.transform));

	PxgShape convexShape;
	PxgShape_ReadWarp(convexShape, gpuShapes + npWorkItem.shapeRef0);

	PxgShape heightfieldShape;
	PxgShape_ReadWarp(heightfieldShape, gpuShapes + npWorkItem.shapeRef1);

	/*const PxU32 startIndex = wrapIndex * HEIGHTFIELD_MAX_VERTS;
	PxU32* startTrimeshAdjVerts = &sTrimeshAdjVerts[startIndex];
	float4* startTrimeshVerts = &sTrimeshVerts[startIndex];
	buildAdjacencyInformations(triangleIdx, heightfieldShape, startTrimeshAdjVerts, startTrimeshVerts, s_scratch, trimeshToConvexTransform);*/

	buildTriangleInformations(triangleIdx, heightfieldShape, s_scratch, trimeshToConvexTransform);

	// Output slots for this test: startIndex addresses the tightly-packed
	// per-pair arrays, roundedStartIndex the padded (sortable) array.
	ConvexMeshPair& pair = cvxTrimeshPair[cmIdx];
	PxU32 convexTriPairOffset = pair.startIndex + testOffset;
	PxU32 convexTriPairOffsetPadded = pair.roundedStartIndex + testOffset;

	// Geometries : Convex
	// Lanes 0..6 cooperatively copy the 7 words of the convex mesh scale
	// (scale vector + rotation quaternion) into scratch.
	if (threadIdx.x < 7)
	{
		reinterpret_cast<PxU32*>(&s_scratch->convexScale)[threadIdx.x] = reinterpret_cast<PxU32*>(&convexShape.scale)[threadIdx.x];
	}
	if (threadIdx.x == 0)
	{
		// Shapes
		//s_scratch->contactDist = convexShape.contactOffset + heightfieldShape.contactOffset;
		s_scratch->contactDist = contactDistance[npWorkItem.transformCacheRef0] + contactDistance[npWorkItem.transformCacheRef1];
		s_scratch->trimeshToConvexTransform = trimeshToConvexTransform;

		const PxU8* convexPtrA = reinterpret_cast<const PxU8 *>(convexShape.hullOrMeshPtr);
		s_scratch->convexPtrA = convexPtrA;

		// Convex hull header as read here: a float4 center of mass, then a uint4
		// whose .x packs nbEdges/nbHullVertices/nbPolygons.
		const float4 hull0_centerOfMass_f4 = *reinterpret_cast<const float4 *>(convexPtrA);
		const PxVec3 hull0_centerOfMass(hull0_centerOfMass_f4.x, hull0_centerOfMass_f4.y, hull0_centerOfMass_f4.z);
		// Transform CoM into shape space
		PxVec3 shapeSpaceCenterOfMass0 = vertex2Shape(hull0_centerOfMass, convexShape.scale.scale, convexShape.scale.rotation);
		s_scratch->convexCenterOfMass = shapeSpaceCenterOfMass0;

		convexPtrA += sizeof(float4);

		const uint4 tmp = *((uint4*)convexPtrA);
		const PxU32 polyData0_NbEdgesNbHullVerticesNbPolygons = tmp.x;
		s_scratch->nbEdgesNbHullVerticesNbPolygons = polyData0_NbEdgesNbHullVerticesNbPolygons;

		convexPtrA += sizeof(uint4);

		/*const float4 polyData0_Extents = *((float4*)(convexPtrA));
		PxVec3 convex_extents = toVec3(polyData0_Extents);
		s_scratch->extents = convex_extents;*/
	}
	// Make the lane-0/lane-0..6 scratch writes visible to the whole warp.
	__syncwarp();

	// Height field doesn't have a CPU remap triangle index, so the GPU triangle
	// index is passed for both index arguments.
	convexTriangleContactGen(
		s_WarpSharedMemory, s_scratch, convexTriPairOffset, convexTriPairOffsetPadded, triangleIdx, triangleIdx, globalWarpIndex,
		cvxTriNIPtr, cvxTriContactsPtr,cvxTriMaxDepthPtr, cvxTriIntermPtr, orderedCvxTriIntermPtr, cvxTriSecondPassPairPtr, nbSecondPassPairs,
		tempConvexTriContacts, tempContactSizeBytes / sizeof(ConvexTriContact), pTempContactIndex);
}
// Kernel entry: narrowphase over the convex-vs-heightfield pair list built by
// the midphase. Each block carves its sub-buffer pointers out of the stack
// memory once (midphaseAllocate); block 0 additionally publishes them to the
// global out-pointers and records the worst-case memory requirement. Warps
// then process pairs in a grid-stride loop (one warp per pair).
extern "C" __global__
__launch_bounds__(NP_TRIMESH_WARPS_PER_BLOCK * WARP_SIZE, 32 / NP_TRIMESH_WARPS_PER_BLOCK)
void convexHeightfieldNarrowphase(
	const PxgContactManagerInput* PX_RESTRICT cmInputs,
	const PxsCachedTransform* PX_RESTRICT transformCache,
	const PxReal* PX_RESTRICT contactDistance,
	const PxgShape* PX_RESTRICT gpuShapes,
	ConvexMeshPair* PX_RESTRICT cvxTrimeshPair,					// per CM
	ConvexTriNormalAndIndex** PX_RESTRICT cvxTriNIPtr,			// per cvx-tri
	ConvexTriContacts** PX_RESTRICT cvxTriContactsPtr,			// per cvx-tri
	PxReal** PX_RESTRICT cvxTriMaxDepthPtr,						// per cvx-tri
	ConvexTriIntermediateData** PX_RESTRICT cvxTriIntermPtr,	// per cvx-tri
	PxU32** PX_RESTRICT orderedCvxTriIntermPtr,					// per cvx-tri
	PxU32** PX_RESTRICT cvxTriSecondPassPairsPtr,				// per cvx-tri
	PxU8* PX_RESTRICT stackPtr,
	PxU32* PX_RESTRICT nbPairsGlobal,
	PxU32* PX_RESTRICT nbPaddedPairsGlobal,
	PxU32* PX_RESTRICT nbSecondPassPairs,
	const PxU32 stackSizeBytes,
	ConvexTriContact* tempConvexTriContacts,
	PxU32* pTempContactIndex,
	PxU32* maxTempMemRequirement,
	PxU32* midphasePairsNeeded,
	const PxU32 nbContactManagers
)
{
	// Block-shared pointers into the stack buffer, set up once per block below.
	__shared__ ConvexTriNormalAndIndex* sCvxTriNIPtr;
	__shared__ ConvexTriContacts* sCvxTriContactsPtr;
	__shared__ PxReal* sCvxTriMaxDepthPtr;
	__shared__ ConvexTriIntermediateData* sCvxTriIntermPtr;
	__shared__ PxU32* sOrderedCvxTriIntermPtr;
	__shared__ uint4* sPairsGPU;
	__shared__ PxU32* sCvxTriSecondPassPairsPtr;

	const PxU32 nbPairs = *nbPairsGlobal;
	const PxU32 nbPaddedPairs = *nbPaddedPairsGlobal;

	// Each block assigns the corresponding pointers from the stack memory to
	// sCvxTriNIPtr, sCvxTriContactsPtr, sCvxTriMaxDepthPtr etc.
	if (threadIdx.x == 0 && threadIdx.y == 0)
	{
		midphaseAllocate(&sCvxTriNIPtr, &sCvxTriContactsPtr, &sCvxTriMaxDepthPtr, &sCvxTriIntermPtr, &sOrderedCvxTriIntermPtr, &sCvxTriSecondPassPairsPtr, &sPairsGPU, stackPtr, nbPairs, nbPaddedPairs);

		if (blockIdx.x == 0)
		{
			// Record the memory a full pair set would need -- presumably consumed
			// by overflow handling/resizing logic; confirm against the caller.
			atomicMax(maxTempMemRequirement, calculateConvexMeshPairMemRequirement() * (*midphasePairsNeeded) + calculateAdditionalPadding(nbContactManagers));
		}
	}

	__syncthreads();

	// Block 0 publishes the carved pointers through the global out-parameters.
	if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0)
	{
		*cvxTriNIPtr = sCvxTriNIPtr;
		*cvxTriContactsPtr = sCvxTriContactsPtr;
		*cvxTriMaxDepthPtr = sCvxTriMaxDepthPtr;
		*cvxTriIntermPtr = sCvxTriIntermPtr;
		*orderedCvxTriIntermPtr = sOrderedCvxTriIntermPtr;
		*cvxTriSecondPassPairsPtr = sCvxTriSecondPassPairsPtr;
	}

	// Grid-stride loop over pairs, one warp per pair.
	for (PxU32 globalWarpIndex = blockIdx.x * blockDim.y + threadIdx.y; globalWarpIndex < nbPairs; globalWarpIndex += gridDim.x * blockDim.y)
	{
		convexHeightfieldNarrowphaseCore(
			globalWarpIndex,
			cmInputs,
			transformCache,
			contactDistance,
			gpuShapes,
			cvxTrimeshPair,
			&sCvxTriNIPtr,
			&sCvxTriContactsPtr,
			&sCvxTriMaxDepthPtr,
			&sCvxTriIntermPtr,
			&sOrderedCvxTriIntermPtr,
			&sCvxTriSecondPassPairsPtr,
			sPairsGPU,
			nbPairsGlobal,
			nbSecondPassPairs,
			tempConvexTriContacts,
			stackSizeBytes,
			pTempContactIndex
		);
	}
}
// Per-thread narrowphase for one (sphere, height-field triangle) midphase pair.
// Finds the closest point from the sphere center to the triangle, rejects
// edge/vertex contacts that the adjacent triangle should own (plane-side test
// per edge mask), and writes the per-pair outputs: max depth, intermediate
// data, ordered sort key, and normal + packed index. Returns the number of
// contacts produced (0 or 1). Contacts whose normal deviates from the face
// normal by more than ~5 degrees are marked deferred; outMask then carries
// globalThreadIndex | edge-mask for the second pass.
__device__ PxU32 sphereHeightfieldNarrowphaseCore(
	PxU32 globalThreadIndex,
	const PxgContactManagerInput* PX_RESTRICT cmInputs,
	const PxsCachedTransform* PX_RESTRICT transformCache,
	const PxReal* PX_RESTRICT contactDistance,
	const PxgShape* PX_RESTRICT gpuShapes,
	SphereMeshPair* PX_RESTRICT sphereTrimeshPair,				// per CM
	SphereTriNormalAndIndex** PX_RESTRICT sphereTriNIPtr,		// per sphere-tri
	SphereTriContacts** PX_RESTRICT sphereTriContactsPtr,		// per sphere-tri
	PxReal** PX_RESTRICT sphereTriMaxDepthPtr,					// per sphere-tri
	SphereTriIntermediateData** PX_RESTRICT sphereTriIntermPtr,	// per sphere-tri
	PxU32** PX_RESTRICT orderedSphereTriIntermPtr,				// per sphere-tri
	PxU32** PX_RESTRICT sphereTriSecondPassPairPtr,				// per sphere-tri
	const uint4& curPair,
	const PxgContactManagerInput& npWorkItem,
	PxgShape& sphereShape,
	PxU32* PX_RESTRICT nbPairsGlobal,
	PxU32* PX_RESTRICT nbSecondPassPairs,
	PxVec3& outContact,
	PxReal& outSep,
	PxU32& outMask
)
{
	// Pair record: x = contact manager index, y = triangle index, z = offset of
	// this test within the contact manager's pair range.
	PxU32 cmIdx = curPair.x;
	PxU32 triangleIdx = curPair.y;
	PxU32 testOffset = curPair.z;

	PxsCachedTransform sphereTransformCached = transformCache[npWorkItem.transformCacheRef0];
	PxsCachedTransform trimeshTransformCached = transformCache[npWorkItem.transformCacheRef1];

	//PxTransform trimeshToConvexTransform(sphereTransformCached.transform.transformInv(trimeshTransformCached.transform));

	PxgShape heightfieldShape = gpuShapes[npWorkItem.shapeRef1];

	//const PxU32 startIndex = threadIdx.x * HEIGHTFIELD_MAX_VERTS;
	//float4* startTrimeshVerts = &sTrimeshVerts[startIndex];

	//buildTriangleInformations
	// Height-field blob layout as read here: rows, columns, then the sample array.
	const PxU32 * heightfieldGeomPtr = reinterpret_cast<const PxU32 *>(heightfieldShape.hullOrMeshPtr);
	const PxU32 rows = *heightfieldGeomPtr++;
	const PxU32 columns = *heightfieldGeomPtr++;
	const PxHeightFieldSample* samples = reinterpret_cast<const PxHeightFieldSample*>(heightfieldGeomPtr);

	//PxU32 vertexIndices[3];
	uint4 triAdjTriIndices;
	//PxgTriangle triLocal; // height field local space triangle
	PxVec3 triLocV0, triLocV1, triLocV2;	// triangle in height-field local space
	getTriangle(triLocV0, triLocV1, triLocV2, &triAdjTriIndices, triangleIdx, heightfieldShape.scale, rows, columns, samples);

	//printf("threadIdx.x %i triangleIdx %i\n", threadIdx.x, triangleIdx);

	const PxVec3 triNormal = ((triLocV1 - triLocV0).cross(triLocV2 - triLocV0)).getNormalized();
	//const PxVec3 triNormal = triLocal.denormalizedNormal().getNormalized();
	//const PxVec3 triangleLocNormal = trimeshToConvexTransform.rotate(triNormal).getNormalized();

	// Output slots for this test within the per-CM pair range.
	ConvexMeshPair& pair = sphereTrimeshPair[cmIdx];
	PxU32 sphereTriPairOffset = pair.startIndex + testOffset;
	PxU32 sphereTriPairOffsetPadded = pair.roundedStartIndex + testOffset;

	// Shapes
	const PxReal contactDist = contactDistance[npWorkItem.transformCacheRef0] + contactDistance[npWorkItem.transformCacheRef1];
	const PxReal sphereRadius = sphereShape.scale.scale.x;	// sphere radius is stored in scale.x

	//Geometries : Sphere
	const PxVec3 sphereCenterInTriMesh = trimeshTransformCached.transform.transformInv(sphereTransformCached.transform.p);
	const PxReal inflatedRadius = sphereRadius + contactDist;

	//const PxReal d = triLocal.verts[0].dot(triNormal);
	//const PxReal dist0 = sphereCenterInTriMesh.dot(triNormal) - d;

	// Signed distance of the sphere center from the triangle plane; negative
	// means the center is below the surface.
	const PxReal dist = (sphereCenterInTriMesh - triLocV0).dot(triNormal);

	PxReal separation = PX_MAX_F32;
	PxU32 nbOutputContacts = 0;
	PxVec3 patchNormal(0.f);
	PxVec3 closestP;
	//mSphereCenter will be in the local space of the triangle mesh
	bool generateContact = false;
	bool faceContact = false;
	bool delayContacts = false;
	PxU32 mask;
	PxReal sqDist = distancePointTriangleSquared(sphereCenterInTriMesh, triLocV0, triLocV1, triLocV2, triAdjTriIndices, closestP, generateContact, faceContact, &mask);

	const PxReal sqInflatedSphereRadius = inflatedRadius * inflatedRadius;

	//sphere overlaps with the triangle
	if (dist >= 0.f && sqInflatedSphereRadius > sqDist && generateContact)
	{
		//printf("triangleIdx %i sqDist %f\n", triangleIdx, sqDist);
		//printf("triangleIdx %i triNormal(%f, %f, %f)\n", triangleIdx, triNormal.x, triNormal.y, triNormal.z);
		//printf("triangleIdx % i closestP(%f, %f, %f)\n", triangleIdx, closestP.x, closestP.y, closestP.z);
		//printf("triangleIdx %i v0(%f, %f, %f)\n", triangleIdx, triLocal.verts[0].x, triLocal.verts[0].y, triLocal.verts[0].z);
		//printf("triangleIdx %i v1(%f, %f, %f)\n", triangleIdx, triLocal.verts[1].x, triLocal.verts[1].y, triLocal.verts[1].z);
		//printf("triangleIdx %i v2(%f, %f, %f)\n", triangleIdx, triLocal.verts[2].x, triLocal.verts[2].y, triLocal.verts[2].z);
		//printf("remapCpuTriangleIdx %i sqInflatedSphereRadius %f sqDist %f faceContact %i\n", remapCpuTriangleIdx, sqInflatedSphereRadius, sqDist, PxU32(faceContact));

		// For edge contacts, keep the contact only if the vertex opposite the
		// shared edge lies below the adjacent triangle's plane (i.e. the edge is
		// convex); otherwise the neighbouring triangle owns the contact.
		switch (mask)
		{
		case ConvexTriIntermediateData::eE01:
		{
			PxVec3 triLocV0Adj, triLocV1Adj, triLocV2Adj;
			getTriangle(triLocV0Adj, triLocV1Adj, triLocV2Adj, NULL, triAdjTriIndices.x, heightfieldShape.scale, rows, columns, samples);
			const PxVec3 triNormalAdj = ((triLocV1Adj - triLocV0Adj).cross(triLocV2Adj - triLocV0Adj)).getNormalized();
			generateContact = ((triNormalAdj.dot(triLocV2 - triLocV0Adj)) < 0.f);
			break;
		}
		case ConvexTriIntermediateData::eE12:
		{
			PxVec3 triLocV0Adj, triLocV1Adj, triLocV2Adj;
			getTriangle(triLocV0Adj, triLocV1Adj, triLocV2Adj, NULL, triAdjTriIndices.y, heightfieldShape.scale, rows, columns, samples);
			const PxVec3 triNormalAdj = ((triLocV1Adj - triLocV0Adj).cross(triLocV2Adj - triLocV0Adj)).getNormalized();
			/*printf("%x: triNormalAdj = (%f, %f, %f), triNormal = (%f, %f, %f)\n", triangleIdx, triNormalAdj.x, triNormalAdj.y, triNormalAdj.z,
				triNormal.x, triNormal.y, triNormal.z);*/
			generateContact = (triNormalAdj.dot(triLocV0 - triLocV0Adj) < 0.f);
			break;
		}
		case ConvexTriIntermediateData::eE02:
		{
			PxVec3 triLocV0Adj, triLocV1Adj, triLocV2Adj;
			getTriangle(triLocV0Adj, triLocV1Adj, triLocV2Adj, NULL, triAdjTriIndices.z, heightfieldShape.scale, rows, columns, samples);
			const PxVec3 triNormalAdj = ((triLocV1Adj - triLocV0Adj).cross(triLocV2Adj - triLocV0Adj)).getNormalized();
			generateContact = ((triNormalAdj.dot(triLocV1 - triLocV0Adj)) < 0.f);
			break;
		}
		}

		if (generateContact)
		{
			// Face contacts use the face normal; edge/vertex contacts use the
			// direction from the closest point to the sphere center.
			patchNormal = triNormal;
			if (!faceContact)
				patchNormal = (sphereCenterInTriMesh - closestP).getNormalized();

			//const PxU32 value = PxU32(faceContact);
			//printf("triangleIdx %i faceContact %i\n", triangleIdx, value);
			//printf("remapCpuTriangleIdx %i closestP(%f, %f, %f)\n", remapCpuTriangleIdx, closestP.x, closestP.y, closestP.z);

			const PxReal cosTheta = patchNormal.dot(triNormal);
			const PxReal tolerance = 0.996f;//around 5 degree
			// If the two normals differ by more than ~5 degrees, defer the
			// contact to a second pass instead of emitting it directly.
			delayContacts = cosTheta <= tolerance;
			if (delayContacts)
			{
				//delay contacts
				/*PxU32* PX_RESTRICT sphereTriSecondPairPass = *sphereTriSecondPassPairPtr;
				const PxU32 startIndex = atomicAdd(nbSecondPassPairs, 1);
				sphereTriSecondPairPass[startIndex] = globalThreadIndex;*/
				outMask = globalThreadIndex | mask;
				//printf("remapCpuTriangleIdx %i gpuTriangleIdx %i delayContacts\n", remapCpuTriangleIdx, triangleIdx);
			}

			//printf("triangleIdx % i delayContacts %i\n", triangleIdx, PxU32(delayContacts));
			nbOutputContacts = 1;
			separation = PxSqrt(sqDist);
			outContact = PxVec3(0.f);
			outSep = separation;
		}
	}

	// Per-pair outputs (written even when no contact was generated).
	PxReal* PX_RESTRICT sphereTriMaxDepth = *sphereTriMaxDepthPtr;
	sphereTriMaxDepth[sphereTriPairOffset] = separation;

	SphereTriIntermediateData* PX_RESTRICT sphereTriInterm = *sphereTriIntermPtr;
	PxU32* PX_RESTRICT orderedSphereTriInterm = *orderedSphereTriIntermPtr;
	sphereTriInterm[sphereTriPairOffset].gpuTriIndex = triangleIdx;
	// Top bit marks "has a non-deferred contact" for the later sort/accumulate.
	orderedSphereTriInterm[sphereTriPairOffsetPadded] = (nbOutputContacts && !delayContacts) ? (0x80000000 | triangleIdx) : triangleIdx;

	SphereTriNormalAndIndex* PX_RESTRICT sphereTriNI = *sphereTriNIPtr;

	PxU32 delayContactMask = delayContacts ? SphereTriNormalAndIndex::DeferredContactMask : 0;

	// Rotate the normal into shape A (sphere) space.
	PxVec3 worldNormal = trimeshTransformCached.transform.rotate(-patchNormal);
	sphereTriNI[sphereTriPairOffset].normal = sphereTransformCached.transform.rotateInv(worldNormal);
	sphereTriNI[sphereTriPairOffset].index = delayContactMask + (nbOutputContacts << SphereTriNormalAndIndex::NbContactsShift) + triangleIdx;

	//printf("triangleIdx %i sphereTriPairOffset %i\n", triangleIdx, sphereTriPairOffset);

	return nbOutputContacts;
}
// Builds the per-triangle edge flags consumed by capsule contact generation.
// For each of the triangle's three edges, the adjacent height-field triangle
// (when present) determines the flag:
//  - convex edge with sufficiently different face normal: active edge (bits 3..5)
//  - non-convex edge, but boundary collisions enabled:    active edge (bits 3..5)
//  - otherwise:                                           silhouette edge (bits 0..2)
__device__ PxU8 computeTriangleFlags(
	const PxgTriangle& currentTriangle,
	const PxVec3& triNormal,
	const PxU32* adjTriIndices,
	const PxU16 flags,
	const PxMeshScale& scale,
	const PxU32 rows,
	const PxU32 columns,
	const PxHeightFieldSample* samples
)
{
	const bool boundaryCollisions = !(flags & PxHeightFieldFlag::eNO_BOUNDARY_EDGES);

	// Index of the vertex opposite each edge slot.
	const PxU8 nextInd[] = { 2,0,1 };

	PxU8 triFlags = 0;
	for (PxU32 edge = 0; edge < 3; ++edge)
	{
		const PxU32 neighbourIdx = adjTriIndices[edge];
		if (neighbourIdx == 0xFFFFFFFF)
			continue;	// no neighbouring triangle across this edge

		PxgTriangle neighbour;
		getTriangle(neighbour, NULL, neighbourIdx, scale, rows, columns, samples);
		PxVec3 neighbourNormal = neighbour.denormalizedNormal();

		// Signed distance of our opposing vertex from the neighbour's plane:
		// negative means the shared edge is convex.
		const PxVec3 opposingVert = currentTriangle.verts[nextInd[edge]];
		const PxReal planeDist = neighbourNormal.dot(opposingVert - neighbour.verts[0]);
		if (planeDist < 0.f)
		{
			neighbourNormal.normalize();
			// Only activate the edge when the faces are not near-coplanar
			// (angle between normals above roughly acos(0.997)).
			if (neighbourNormal.dot(triNormal) < 0.997f)
				triFlags |= (1 << (edge + 3));
		}
		else if (boundaryCollisions)
		{
			triFlags |= (1 << (edge + 3));	// mark boundary edge active
		}
		else
		{
			triFlags |= (1 << edge);		// mark as silhouette edge
		}
	}
	return triFlags;
}
// Per-thread narrowphase for one (capsule, height-field triangle) midphase
// pair. Computes the segment-vs-triangle distance, selects a patch normal
// (face or edge), generates contacts via generateContacts/generateEEContacts,
// transforms them into capsule space, and writes the per-pair outputs.
// Returns the number of contacts generated.
__device__ PxU32 capsuleHeightfieldNarrowphaseCore(
	PxU32 globalThreadIndex,
	const PxgContactManagerInput* PX_RESTRICT cmInputs,
	const PxsCachedTransform* PX_RESTRICT transformCache,
	const PxReal* PX_RESTRICT contactDistance,
	const PxgShape* PX_RESTRICT gpuShapes,
	SphereMeshPair* PX_RESTRICT sphereTrimeshPair,				// per CM
	SphereTriNormalAndIndex** PX_RESTRICT sphereTriNIPtr,		// per sphere-tri
	SphereTriContacts** PX_RESTRICT sphereTriContactsPtr,		// per sphere-tri
	PxReal** PX_RESTRICT sphereTriMaxDepthPtr,					// per sphere-tri
	SphereTriIntermediateData** PX_RESTRICT sphereTriIntermPtr,	// per sphere-tri
	PxU32** PX_RESTRICT orderedSphereTriIntermPtr,				// per sphere-tri
	PxU32** PX_RESTRICT sphereTriSecondPassPairPtr,				// per sphere-tri
	const uint4& curPair,
	const PxgContactManagerInput& npWorkItem,
	PxgShape& capsuleShape,
	PxVec3* contacts,
	PxReal* separations
)
{
	// Pair record: x = contact manager index, y = triangle index, z = offset of
	// this test within the contact manager's pair range.
	const PxU32 cmIdx = curPair.x;
	PxU32 triangleIdx = curPair.y;
	PxU32 testOffset = curPair.z;

	PxsCachedTransform capsuleTransformCached = transformCache[npWorkItem.transformCacheRef0];
	PxsCachedTransform trimeshTransformCached = transformCache[npWorkItem.transformCacheRef1];

	PxgShape heightfieldShape = gpuShapes[npWorkItem.shapeRef1];

	//buildTriangleInformations
	// Height-field blob layout as read here: rows, columns, then the sample array.
	const PxU32 * heightfieldGeomPtr = reinterpret_cast<const PxU32 *>(heightfieldShape.hullOrMeshPtr);
	const PxU32 rows = *heightfieldGeomPtr++;
	const PxU32 columns = *heightfieldGeomPtr++;
	const PxHeightFieldSample* samples = reinterpret_cast<const PxHeightFieldSample*>(heightfieldGeomPtr);
	// NOTE(review): pointer arithmetic on a PxU32* already scales by the element
	// size, so this advances sizeof(PxU32) * rows * columns *elements* (4x the
	// byte count the expression suggests) -- confirm against the GPU
	// height-field memory layout.
	heightfieldGeomPtr += sizeof(PxU32) * rows * columns;
	// NOTE(review): this reinterprets the pointer *variable* itself, producing
	// the low 16 bits of an address rather than the flags stored at the
	// pointed-to location; likely intended
	// *reinterpret_cast<const PxU16*>(heightfieldGeomPtr) -- confirm.
	const PxU16 flags = reinterpret_cast<PxU16&>(heightfieldGeomPtr);

	PxU32 adjTriIndices[3];
	PxgTriangle currentTriangle; // height field local space triangle
	getTriangle(currentTriangle, adjTriIndices, triangleIdx, heightfieldShape.scale, rows, columns, samples);

	// Output slots for this test within the per-CM pair range.
	ConvexMeshPair& pair = sphereTrimeshPair[cmIdx];
	PxU32 sphereTriPairOffset = pair.startIndex + testOffset;
	PxU32 sphereTriPairOffsetPadded = pair.roundedStartIndex + testOffset;

	// Shapes
	const PxReal contactDist = contactDistance[npWorkItem.transformCacheRef0] + contactDistance[npWorkItem.transformCacheRef1];
	// Capsule radius/half-height are packed into scale.y/scale.x respectively.
	const PxReal capsuleRadius = capsuleShape.scale.scale.y;
	const PxReal capsuleHalfHeight = capsuleShape.scale.scale.x;

	const PxTransform heighfieldToCapsule = capsuleTransformCached.transform.transformInv(trimeshTransformCached.transform);

	// Capsule half-axis (world space) scaled by the half height.
	const PxVec3 tmp = capsuleTransformCached.transform.q.getBasisVector0() * capsuleHalfHeight;
	const PxVec3 capsuleCenterInMesh = trimeshTransformCached.transform.transformInv(capsuleTransformCached.transform.p);
	const PxVec3 capsuleDirInMesh = trimeshTransformCached.transform.rotateInv(tmp);
	//Geometries : Capsule in height field local space
	// NOTE(review): p0/p1 add the *world-space* half-axis (tmp) to the
	// mesh-space center, while capsuleDirInMesh (the mesh-space half-axis) is
	// computed above but never used. Only correct as-is when the height field's
	// rotation is identity; looks like p0/p1 should use capsuleDirInMesh --
	// confirm.
	const PxVec3 p0 = capsuleCenterInMesh + tmp;
	const PxVec3 p1 = capsuleCenterInMesh - tmp;

	const PxReal inflatedRadius = capsuleRadius + contactDist;

	const PxVec3 triNormal = currentTriangle.denormalizedNormal().getNormalized();// ((triLocV1 - triLocV0).cross(triLocV2 - triLocV0)).getNormalized();

	//const PxReal d = triLocal.verts[0].dot(triNormal);
	//const PxReal dist0 = sphereCenterInTriMesh.dot(triNormal) - d;
	// Signed distance of the capsule center from the triangle plane.
	const PxReal dist = (capsuleCenterInMesh - currentTriangle.verts[0]).dot(triNormal);

	PxU32 nbOutputContacts = 0;
	PxVec3 patchNormal(0.f);
	PxVec3 closestP;
	//mSphereCenter will be in the local space of the triangle mesh

	PxReal t, u, v;
	PxReal sqDist = distanceSegmentTriangleSquared(p0, p1, currentTriangle.verts[0], currentTriangle.verts[1], currentTriangle.verts[2], t, u, v);

	const PxReal sqInflatedRadius = inflatedRadius * inflatedRadius;
	PxReal separation = PX_MAX_F32;
	bool deferred = false;

	//capsule overlaps with the triangle
	if (dist >= 0.f && sqInflatedRadius > sqDist)
	{
		const PxU8 triFlags = computeTriangleFlags(currentTriangle, triNormal, adjTriIndices,
			flags, heightfieldShape.scale, rows, columns, samples);

		//printf("triangleIdx %i sqDist %f\n", triangleIdx, sqDist);
		//printf("triangleIdx %i triNormal(%f, %f, %f)\n", triangleIdx, triNormal.x, triNormal.y, triNormal.z);
		//printf("triangleIdx % i closestP(%f, %f, %f)\n", triangleIdx, closestP.x, closestP.y, closestP.z);
		//printf("triangleIdx %i v0(%f, %f, %f)\n", triangleIdx, triLocal.verts[0].x, triLocal.verts[0].y, triLocal.verts[0].z);
		//printf("triangleIdx %i v1(%f, %f, %f)\n", triangleIdx, triLocal.verts[1].x, triLocal.verts[1].y, triLocal.verts[1].z);
		//printf("triangleIdx %i v2(%f, %f, %f)\n", triangleIdx, triLocal.verts[2].x, triLocal.verts[2].y, triLocal.verts[2].z);
		//printf("remapCpuTriangleIdx %i sqInflatedSphereRadius %f sqDist %f faceContact %i\n", remapCpuTriangleIdx, sqInflatedSphereRadius, sqDist, PxU32(faceContact));

		//patchNormal = triNormal;
		// Patch normal selection: face normal when the closest feature allows
		// it (selectNormal) or when the segment intersects the triangle
		// (sqDist == 0); otherwise the edge-to-segment direction, marking the
		// contact as deferred.
		if (selectNormal(u, v, triFlags))
		{
			patchNormal = triNormal;
		}
		else
		{
			if (sqDist > 0.f)
			{
				//segment intersects the triangle
				patchNormal = triNormal;
			}
			else
			{
				//edge normal
				const PxVec3 pq = p1 - p0;
				const PxVec3 pointOnSegment = p0 + pq * t; // V3ScaleAdd(pq, t, mCapsule.p0);
				const PxReal w = 1.f - (u + v);// FSub(FOne(), FAdd(u, v));
				const PxVec3 pointOnTriangle = currentTriangle.verts[0] * w + currentTriangle.verts[1] * u + currentTriangle.verts[2] * v;// V3ScaleAdd(p0, w, V3ScaleAdd(p1, u, V3Scale(p2, v)));
				patchNormal = (pointOnSegment - pointOnTriangle).getNormalized();// V3Normalize(V3Sub(pointOnSegment, pointOnTriangle));
				deferred = true;
			}
		}

		//const PxU32 value = PxU32(faceContact);
		//printf("triangleIdx %i faceContact %i\n", triangleIdx, value);
		//printf("remapCpuTriangleIdx %i closestP(%f, %f, %f)\n", remapCpuTriangleIdx, closestP.x, closestP.y, closestP.z)

		nbOutputContacts = generateContacts(currentTriangle.verts[0], currentTriangle.verts[1],
			currentTriangle.verts[2], triNormal, patchNormal, p0, p1, inflatedRadius, contacts, separations);

		generateEEContacts(currentTriangle.verts[0], currentTriangle.verts[1],
			currentTriangle.verts[2], patchNormal, p0, p1, sqInflatedRadius, contacts, separations, nbOutputContacts);

		for (PxU32 i = 0; i < nbOutputContacts; ++i)
		{
			//transform contact back to the capsule space
			contacts[i] = heighfieldToCapsule.transform(contacts[i]);
			const PxReal pen = separations[i];
			separation = PxMin(separation, pen);
		}
	}

	//printf("%i: triangleIndex %i separation %f\n", threadIdx.x, triangleIdx, separation);

	// Per-pair outputs (written even when no contact was generated).
	SphereTriIntermediateData* PX_RESTRICT sphereTriInterm = *sphereTriIntermPtr;
	PxU32* PX_RESTRICT orderedSphereTriInterm = *orderedSphereTriIntermPtr;
	sphereTriInterm[sphereTriPairOffset].gpuTriIndex = triangleIdx;
	// Top bit marks "has a non-deferred contact" for the later sort/accumulate.
	const PxU32 deferMask = (nbOutputContacts && !deferred) ? 1 << 31 : 0;
	orderedSphereTriInterm[sphereTriPairOffsetPadded] = deferMask | triangleIdx;

	//PxReal* PX_RESTRICT sphereTriMaxDepth = sphereTriMaxDepthPtr[sphereTriPairOffset];
	PxReal* PX_RESTRICT capsuleTriMaxDepth = *sphereTriMaxDepthPtr;
	capsuleTriMaxDepth[sphereTriPairOffset] = separation;

	SphereTriNormalAndIndex* PX_RESTRICT sphereTriNI = *sphereTriNIPtr;

	// Rotate the normal into shape A (capsule) space.
	sphereTriNI[sphereTriPairOffset].normal = heighfieldToCapsule.rotate(-patchNormal);
	sphereTriNI[sphereTriPairOffset].index = (nbOutputContacts << SphereTriNormalAndIndex::NbContactsShift) + triangleIdx;

	//printf("triangleIdx %i sphereTriPairOffset %i\n", triangleIdx, sphereTriPairOffset);

	return nbOutputContacts;
}
// Narrowphase kernel for sphere- and capsule-vs-heightfield pairs produced by
// the midphase. One CUDA thread processes one shape/triangle pair: it runs the
// matching per-pair core routine, then the warp cooperatively compacts any
// deferred ("second pass") pairs and flushes the generated contacts into the
// temporary contact buffer.
extern "C" __global__
void sphereHeightfieldNarrowphase(
	const PxgContactManagerInput* PX_RESTRICT cmInputs,
	const PxsCachedTransform* PX_RESTRICT transformCache,
	const PxReal* PX_RESTRICT contactDistance,
	const PxgShape* PX_RESTRICT gpuShapes,
	ConvexMeshPair* PX_RESTRICT cvxTrimeshPair, // per CM
	ConvexTriNormalAndIndex** PX_RESTRICT cvxTriNIPtr, // per cvx-tri
	ConvexTriContacts** PX_RESTRICT cvxTriContactsPtr, // per cvx-tri
	PxReal** PX_RESTRICT cvxTriMaxDepthPtr, // per cvx-tri
	SphereTriIntermediateData** PX_RESTRICT cvxTriIntermPtr, // per cvx-tri
	PxU32** PX_RESTRICT orderedCvxTriIntermPtr, // per cvx-tri
	PxU32** PX_RESTRICT cvxTriSecondPassPairsPtr, // per cvx-tri
	PxU8* PX_RESTRICT stackPtr, // stack memory the per-pair output arrays are carved from
	PxU32* PX_RESTRICT nbPairsGlobal, // number of midphase pairs to process
	PxU32* PX_RESTRICT nbPaddedPairsGlobal,
	PxU32* PX_RESTRICT nbSecondPassPairs, // running count of deferred pairs (atomically incremented)
	const PxU32 stackSizeBytes,
	ConvexTriContact* tempConvexTriContacts,
	PxU32* pTempContactIndex, // allocation cursor into tempConvexTriContacts (atomically incremented)
	PxU32* maxTempMemRequirement,
	PxU32* midphasePairsNeeded,
	const PxU32 nbContactManagers
)
{
	// Block-shared copies of the pointers carved out of the stack allocator by
	// midphaseAllocate(); computed by one thread, read by all.
	__shared__ ConvexTriNormalAndIndex* sCvxTriNIPtr;
	__shared__ ConvexTriContacts* sCvxTriContactsPtr;
	__shared__ PxReal* sCvxTriMaxDepthPtr;
	// NOTE(review): declared as ConvexTriIntermediateData* but passed to core
	// functions taking SphereTriIntermediateData** and written back through the
	// SphereTriIntermediateData** kernel parameter — assumes the two types are
	// aliased/layout-compatible; confirm.
	__shared__ ConvexTriIntermediateData* sCvxTriIntermPtr;
	__shared__ PxU32* sOrderedTriIntermPtr;
	__shared__ uint4* sPairsGPU;
	__shared__ PxU32* sCvxTriSecondPassPairsPtr;
	// Per-thread contact scratch: 2 contact slots per thread; sized for 128
	// threads per block — TODO confirm launch configuration matches.
	__shared__ char sShContacts[sizeof(PxVec3) * 128];
	PxVec3* shContacts = reinterpret_cast<PxVec3*>(sShContacts);
	__shared__ PxReal shSeparations[128];
	const PxU32 nbPairs = *nbPairsGlobal;
	const PxU32 nbPaddedPairs = *nbPaddedPairsGlobal;
	//each block assign the corresponding ptr from the stack memory to sCvxTriNIPtr, sCvxTriContactPtr and sCvxTriMaxDepthPtr
	if (threadIdx.x == 0 && threadIdx.y == 0)
	{
		midphaseAllocate(
			&sCvxTriNIPtr,
			&sCvxTriContactsPtr,
			&sCvxTriMaxDepthPtr,
			&sCvxTriIntermPtr,
			&sOrderedTriIntermPtr,
			&sCvxTriSecondPassPairsPtr,
			&sPairsGPU,
			stackPtr,
			nbPairs,
			nbPaddedPairs);
		if (blockIdx.x == 0)
		{
			// Report worst-case temp memory so the host can grow the buffer if needed.
			atomicMax(maxTempMemRequirement, calculateConvexMeshPairMemRequirement() * (*midphasePairsNeeded) + calculateAdditionalPadding(nbContactManagers));
		}
	}
	__syncthreads();
	// The first thread of the first block publishes the pointers to global memory.
	if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0)
	{
		*cvxTriNIPtr = sCvxTriNIPtr;
		*cvxTriContactsPtr = sCvxTriContactsPtr;
		*cvxTriMaxDepthPtr = sCvxTriMaxDepthPtr;
		*cvxTriIntermPtr = sCvxTriIntermPtr;
		*orderedCvxTriIntermPtr = sOrderedTriIntermPtr;
		*cvxTriSecondPassPairsPtr = sCvxTriSecondPassPairsPtr;
	}
	//for (PxU32 globalThreadIndex = blockIdx.x * blockDim.x + threadIdx.x; globalThreadIndex < nbPairs; globalThreadIndex += gridDim.x * blockDim.x)
	// Grid-stride outer loop; every lane of a warp stays in the loop even when
	// out of range so the warp-wide ballots/scans below see full participation.
	for (PxU32 i = 0; i < nbPairs; i += gridDim.x * blockDim.x)
	{
		PxU32 globalThreadIndex = blockIdx.x * blockDim.x + threadIdx.x + i;
		PxU32 nbContacts = 0;
		PxU32 outMask = 0; // non-zero => this pair was deferred to the second pass
		uint4 curPair;
		if (globalThreadIndex < nbPairs)
		{
			// Pair
			curPair = sPairsGPU[globalThreadIndex];
			PxgContactManagerInput npWorkItem = cmInputs[curPair.x];
			PxgShape shape0 = gpuShapes[npWorkItem.shapeRef0];
			PxGeometryType::Enum type0 = PxGeometryType::Enum(shape0.type);
			if (type0 == PxGeometryType::eSPHERE)
			{
				nbContacts = sphereHeightfieldNarrowphaseCore(
					globalThreadIndex,
					cmInputs,
					transformCache,
					contactDistance,
					gpuShapes,
					cvxTrimeshPair,
					&sCvxTriNIPtr,
					&sCvxTriContactsPtr,
					&sCvxTriMaxDepthPtr,
					&sCvxTriIntermPtr,
					&sOrderedTriIntermPtr,
					&sCvxTriSecondPassPairsPtr,
					curPair,
					npWorkItem,
					shape0,
					nbPairsGlobal,
					nbSecondPassPairs,
					shContacts[threadIdx.x * 2],
					shSeparations[threadIdx.x * 2],
					outMask
				);
			}
			else if (type0 == PxGeometryType::eCAPSULE)
			{
				nbContacts = capsuleHeightfieldNarrowphaseCore(
					globalThreadIndex,
					cmInputs,
					transformCache,
					contactDistance,
					gpuShapes,
					cvxTrimeshPair,
					&sCvxTriNIPtr,
					&sCvxTriContactsPtr,
					&sCvxTriMaxDepthPtr,
					&sCvxTriIntermPtr,
					&sOrderedTriIntermPtr,
					&sCvxTriSecondPassPairsPtr,
					curPair,
					npWorkItem,
					shape0,
					&shContacts[threadIdx.x * 2],
					&shSeparations[threadIdx.x * 2]
				);
			}
		}
		// Warp-wide compaction of deferred pairs: each set bit marks a lane
		// whose pair must be revisited in the second pass.
		PxU32 outMaskWriteMask = __ballot_sync(FULL_MASK, outMask);
		PxU32 count = __popc(outMaskWriteMask);
		PxU32 startIndex = 0;
		const PxU32 threadIndexInWarp = threadIdx.x & 31;
		//Now output contacts!!!!
		// Inclusive prefix sum of per-lane contact counts; lane 31 then holds
		// the warp total.
		PxU32 inclusiveSum = warpScan<AddOpPxU32, PxU32>(FULL_MASK, nbContacts);
		PxU32 totalContacts = __shfl_sync(FULL_MASK, inclusiveSum, 31);
		if (totalContacts)
		{
			if (count)
			{
				// Lane 0 reserves space for the warp's deferred pairs in one atomic,
				// then each flagged lane writes its mask at its compacted offset.
				if (threadIndexInWarp == 0)
					startIndex = atomicAdd(nbSecondPassPairs, count);
				startIndex = __shfl_sync(FULL_MASK, startIndex, 0);
				if (outMask)
				{
					PxU32 offset = warpScanExclusive(outMaskWriteMask, threadIndexInWarp);
					sCvxTriSecondPassPairsPtr[startIndex + offset] = outMask;
				}
			}
			// Lane 31 owns the warp total, so it reserves the warp's contact range;
			// subtracting nbContacts converts the inclusive sum to this lane's
			// exclusive offset within that range.
			if (threadIndexInWarp == 31)
				startIndex = atomicAdd(pTempContactIndex, inclusiveSum);
			startIndex = __shfl_sync(FULL_MASK, startIndex, 31);
			startIndex += inclusiveSum - nbContacts;
			// Copy this thread's contacts (2 shared slots per thread) out to the
			// global temp buffer. Note: this inner 'i' shadows the outer loop 'i'.
			for (PxU32 i = 0, idx = 2 * threadIdx.x; i < nbContacts; ++i, idx++)
			{
				tempConvexTriContacts[startIndex + i].contact_sepW = make_float4(shContacts[idx].x, shContacts[idx].y,
					shContacts[idx].z, shSeparations[idx]);
			}
			if (nbContacts)
			{
				// Record where this pair's contacts begin in the temp buffer.
				PxU32 cmIdx = curPair.x;
				PxU32 testOffset = curPair.z;
				ConvexMeshPair& pair = cvxTrimeshPair[cmIdx];
				PxU32 sphereTriPairOffset = pair.startIndex + testOffset;
				sCvxTriContactsPtr[sphereTriPairOffset].index = startIndex;
			}
		}
	}
}

View File

@@ -0,0 +1,975 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include <cuda.h>
#include <cuda_runtime.h>
#include <assert.h>
#include "foundation/PxVec3.h"
#include "foundation/PxPlane.h"
#include "foundation/PxTransform.h"
#include "geometry/PxMeshScale.h"
#include "geometry/PxGeometry.h"
#include "PxgCommonDefines.h"
#include "utils.cuh"
#include "reduction.cuh"
#include "sphereTriangle.cuh"
#include "convexTriangle.cuh"
#include "capsuleTriangle.cuh"
#include "midphaseAllocate.cuh"
#include "PxgNpKernelIndices.h"
#include "triangleMesh.cuh"
using namespace physx;
extern "C" __host__ void initNarrowphaseKernels2() {}
// Triangle scaling, triangle verts in shape space
__device__ void getShapeSpaceVerts(PxVec3& v0, PxVec3& v1, PxVec3& v2, const MeshScaling& meshScale, PxU32 triangleIdx, const uint4* PX_RESTRICT trimeshTriIndices, const float4* PX_RESTRICT trimeshVerts)
{
const uint4 triIndices = trimeshTriIndices[triangleIdx];
const PxU32 triVertIdx0 = triIndices.x;
const PxU32 triVertIdx1 = triIndices.y;
const PxU32 triVertIdx2 = triIndices.z;
const float4 triV0_f4 = trimeshVerts[triVertIdx0];
const float4 triV1_f4 = trimeshVerts[triVertIdx1];
const float4 triV2_f4 = trimeshVerts[triVertIdx2];
PxVec3 triV0 = PxLoad3(triV0_f4);
PxVec3 triV1 = PxLoad3(triV1_f4);
PxVec3 triV2 = PxLoad3(triV2_f4);
meshScale.getShapeSpaceVert(triV0, triV1, triV2, v0, v1, v2);
}
// Per-warp convex-vs-mesh-triangle narrowphase. One warp handles one
// convex/triangle pair: it stages the pair transforms, convex hull header and
// the (scaled, convex-space) triangle into warp shared memory, then runs the
// warp-cooperative contact generation in convexTriangleContactGen().
__device__ void convexTrimeshNarrowphaseCore(
	PxU32 globalWarpIndex,
	PxgShape& convexShape,
	PxgShape& trimeshShape,
	PxU32 transformCacheRef0,
	PxU32 transformCacheRef1,
	const uint4 curPair, // x = contact manager index, y = triangle index, z = offset within the CM's pair range
	const PxsCachedTransform* PX_RESTRICT transformCache,
	const PxReal* PX_RESTRICT contactDistance,
	const PxgShape* PX_RESTRICT gpuShapes,
	ConvexMeshPair* PX_RESTRICT cvxTrimeshPair, // per CM
	ConvexTriNormalAndIndex** PX_RESTRICT cvxTriNIPtr, // per cvx-tri
	ConvexTriContacts** PX_RESTRICT cvxTriContactsPtr, // per cvx-tri
	PxReal** PX_RESTRICT cvxTriMaxDepthPtr, // per cvx-tri
	ConvexTriIntermediateData** PX_RESTRICT cvxTriIntermPtr, // per cvx-tri
	PxU32** PX_RESTRICT orderedCvxTriIntermPtr, // per cvx-tri
	PxU32** PX_RESTRICT cvxTriSecondPassPairPtr, // per cvx-tri
	const uint4* PX_RESTRICT pairsGPU,
	PxU32* nbPairsGlobal,
	PxU32* nbSecondPassPairs,
	volatile PxU32* s_WarpSharedMemory, // this warp's slice of block shared memory, reused as ConvexMeshScratch
	ConvexTriContact* tempConvexTriContacts,
	const PxU32 tempContactSizeBytes,
	PxU32* pTempContactIndex
)
{
	// 1 warp is doing 1 triangle - convex pair here.
	ConvexMeshScratch* s_scratch = (ConvexMeshScratch*)s_WarpSharedMemory;
	PxU32 cmIdx = curPair.x;
	PxU32 triangleIdx = curPair.y;
	PxU32 testOffset = curPair.z;
	// Per-warp (indexed by threadIdx.y) staging of the two cached transforms,
	// loaded cooperatively one 32-bit word per lane.
	__shared__ __align__(16) char sConvexTransformCached[sizeof(PxsCachedTransform) * NP_TRIMESH_WARPS_PER_BLOCK];
	__shared__ __align__(16) char sTrimeshTransformCached[sizeof(PxsCachedTransform) * NP_TRIMESH_WARPS_PER_BLOCK];
	PxsCachedTransform* convexTransformCached = reinterpret_cast<PxsCachedTransform*>(sConvexTransformCached);
	PxsCachedTransform* trimeshTransformCached = reinterpret_cast<PxsCachedTransform*>(sTrimeshTransformCached);
	if (threadIdx.x < sizeof(PxsCachedTransform) / sizeof(PxU32))
	{
		reinterpret_cast<PxU32*>(&convexTransformCached[threadIdx.y])[threadIdx.x] =
			reinterpret_cast<const PxU32*>(transformCache + transformCacheRef0)[threadIdx.x];
		reinterpret_cast<PxU32*>(&trimeshTransformCached[threadIdx.y])[threadIdx.x] =
			reinterpret_cast<const PxU32*>(transformCache + transformCacheRef1)[threadIdx.x];
	}
	ConvexMeshPair& pair = cvxTrimeshPair[cmIdx];
	// Offsets of this cvx-tri test inside the per-pair output arrays; the
	// "padded" variant indexes the rounded/aligned ordered-intermediate array.
	PxU32 convexTriPairOffset = pair.startIndex + testOffset;
	PxU32 convexTriPairOffsetPadded = pair.roundedStartIndex + testOffset;
	__syncwarp();
	// First 7 words of the convex scale (copied cooperatively by lanes 0-6).
	if (threadIdx.x < 7)
	{
		reinterpret_cast<PxU32*>(&s_scratch->convexScale)[threadIdx.x] = reinterpret_cast<PxU32*>(&convexShape.scale)[threadIdx.x];
	}
	// Lane 0 fills the remaining scalar scratch fields: relative transform,
	// contact distance, convex hull header and mesh data pointers.
	if (threadIdx.x == 0)
	{
		// convexTransform/trimeshTransform already include bodyTM/relTM
		s_scratch->trimeshToConvexTransform = convexTransformCached[threadIdx.y].transform.transformInv(trimeshTransformCached[threadIdx.y].transform);
		// Shapes
		//s_scratch->contactDist = convexShape.contactOffset + trimeshShape.contactOffset;
		s_scratch->contactDist = contactDistance[transformCacheRef0] + contactDistance[transformCacheRef1];
		const PxU8* convexPtrA = reinterpret_cast<const PxU8 *>(convexShape.hullOrMeshPtr);
		s_scratch->convexPtrA = convexPtrA;
		// Hull blob layout: float4 center of mass, then uint4 header (x packs
		// edge/vertex/polygon counts), then the polygon data.
		const float4 hull0_centerOfMass_f4 = *reinterpret_cast<const float4 *>(convexPtrA);
		const PxVec3 hull0_centerOfMass(hull0_centerOfMass_f4.x, hull0_centerOfMass_f4.y, hull0_centerOfMass_f4.z);
		// Transform CoM into shape space
		PxVec3 shapeSpaceCenterOfMass0 = vertex2Shape(hull0_centerOfMass, convexShape.scale.scale, convexShape.scale.rotation);
		s_scratch->convexCenterOfMass = shapeSpaceCenterOfMass0;
		convexPtrA += sizeof(float4);
		const uint4 tmp = *((uint4*)convexPtrA);
		const PxU32 polyData0_NbEdgesNbHullVerticesNbPolygons = tmp.x;
		s_scratch->nbEdgesNbHullVerticesNbPolygons = polyData0_NbEdgesNbHullVerticesNbPolygons;
		convexPtrA += sizeof(uint4);
		// Geometries : Triangle Mesh
		const PxU8 * trimeshGeomPtr = reinterpret_cast<const PxU8 *>(trimeshShape.hullOrMeshPtr);
		const uint4* PX_RESTRICT trimeshTriAdjacencies;
		readTriangleMesh(trimeshGeomPtr, s_scratch->trimeshVerts, s_scratch->trimeshTriIndices, trimeshTriAdjacencies, s_scratch->trimeshFaceRemap);
		// TODO: shuffle this across threads (reading uint4 seq)
		// later shuffle adjacency work and shuffle back adjNormals
		uint4 triAdjTriIndices = trimeshTriAdjacencies[triangleIdx];
		s_scratch->triAdjTrisIdx[0] = triAdjTriIndices.x;
		s_scratch->triAdjTrisIdx[1] = triAdjTriIndices.y;
		s_scratch->triAdjTrisIdx[2] = triAdjTriIndices.z;
	}
	__syncwarp();
	// Lanes 0-2 each load one triangle vertex into convex shape space. When the
	// mesh scale mirrors (negative determinant) the vertex order is swapped
	// (writeIndex = threadIdx.x ^ 3 maps 1<->2) to keep the winding consistent.
	if (threadIdx.x < 3)
	{
		const PxMeshScale& scale = trimeshShape.scale;
		bool flipNormal = ((scale.scale.x * scale.scale.y * scale.scale.z) < 0.0f);
		PxU32 writeIndex = threadIdx.x;
		if (threadIdx.x > 0 && flipNormal)
			writeIndex = threadIdx.x ^ 3;
		const PxU32 triVertIdx = reinterpret_cast<const PxU32*>(&s_scratch->trimeshTriIndices[triangleIdx])[threadIdx.x];
		const float4 triV_f4 = s_scratch->trimeshVerts[triVertIdx];
		PxVec3 triV = PxLoad3(triV_f4);
		triV = vertex2Shape(triV, trimeshShape.scale.scale, trimeshShape.scale.rotation);
		PxVec3 triLocVerts = s_scratch->trimeshToConvexTransform.transform(triV);
		s_scratch->triLocVerts[writeIndex] = triLocVerts;
	}
	__syncwarp();
	// Lane 0 derives the triangle normal (in convex space) from the staged verts.
	if (threadIdx.x == 0)
	{
		PxVec3 triangleLocNormal;
		triangleLocNormal = (s_scratch->triLocVerts[1] - s_scratch->triLocVerts[0]).cross(s_scratch->triLocVerts[2] - s_scratch->triLocVerts[0]).getNormalized();
		s_scratch->triangleLocNormal = triangleLocNormal;
	}
	__syncwarp();
	// Original (CPU-side) triangle index, used for contact reporting.
	const PxU32 remapCpuTriangleIdx = s_scratch->trimeshFaceRemap[triangleIdx];
	convexTriangleContactGen(
		s_WarpSharedMemory, s_scratch, convexTriPairOffset, convexTriPairOffsetPadded, remapCpuTriangleIdx, triangleIdx, globalWarpIndex,
		cvxTriNIPtr, cvxTriContactsPtr, cvxTriMaxDepthPtr, cvxTriIntermPtr, orderedCvxTriIntermPtr, cvxTriSecondPassPairPtr, nbSecondPassPairs,
		tempConvexTriContacts, tempContactSizeBytes/sizeof(ConvexTriContact), pTempContactIndex);
}
// Per-thread sphere-vs-mesh-triangle narrowphase. Produces at most one contact
// (the sphere center, expressed in sphere space) and returns the contact count
// (0 or 1) via nbOutputContacts. Always records max depth, intermediate data
// and normal/index for the pair. Contacts whose normal deviates from the face
// normal by more than ~5 degrees are flagged as deferred via outMask and
// revisited in a second pass.
__device__ PxU32 sphereTrimeshNarrowphaseCore(
	PxU32 globalThreadIndex,
	const PxgContactManagerInput& npWorkItem,
	const PxgShape& sphereShape,
	const PxgShape& trimeshShape,
	const uint4 curPair, // x = CM index, y = triangle index, z = offset within the CM's pair range
	const PxsCachedTransform* PX_RESTRICT transformCache,
	const PxReal* PX_RESTRICT contactDistance,
	SphereMeshPair* PX_RESTRICT sphereTrimeshPair, // per CM
	SphereTriNormalAndIndex** PX_RESTRICT sphereTriNIPtr, // per sphere-tri
	SphereTriContacts** PX_RESTRICT sphereTriContactsPtr, // per sphere-tri
	PxReal** PX_RESTRICT sphereTriMaxDepthPtr, // per sphere-tri
	SphereTriIntermediateData** PX_RESTRICT sphereTriIntermPtr, // per sphere-tri
	PxU32** PX_RESTRICT orderedSphereTriIntermPtr, // per sphere-tri
	PxU32** PX_RESTRICT sphereTriSecondPassPairPtr, // per sphere-tri
	PxU32* nbPairsGlobal,
	PxU32* nbSecondPassPairs,
	PxVec3& outContact, // contact point, sphere local space
	PxReal& outSep, // contact separation
	PxU32& outMask // deferred-pair mask (edge/vertex bits | thread index), 0 if not deferred
)
{
	// Pair, stackPtr in convexMeshMidphase kernel
	PxU32 cmIdx = curPair.x;
	PxU32 triangleIdx = curPair.y;
	PxU32 testOffset = curPair.z;
	PxsCachedTransform sphereTransformCached = transformCache[npWorkItem.transformCacheRef0];
	PxsCachedTransform trimeshTransformCached = transformCache[npWorkItem.transformCacheRef1];
	SphereMeshPair& pair = sphereTrimeshPair[cmIdx];
	// Output offsets; the padded variant indexes the rounded/aligned
	// ordered-intermediate array.
	PxU32 sphereTriPairOffset = pair.startIndex + testOffset;
	PxU32 sphereTriPairOffsetPadded = pair.roundedStartIndex + testOffset;
	// Shapes
	const PxReal contactDist = contactDistance[npWorkItem.transformCacheRef0] + contactDistance[npWorkItem.transformCacheRef1];
	const PxQuat trimeshScaleRot = trimeshShape.scale.rotation;
	// Sphere radius is packed into scale.scale.x for sphere shapes.
	const PxReal sphereRadius = sphereShape.scale.scale.x;
	const PxVec3 trimeshScale = trimeshShape.scale.scale;
	//Geometries : Sphere
	const PxVec3 sphereCenterInTriMesh = trimeshTransformCached.transform.transformInv(sphereTransformCached.transform.p);
	// Geometries : Triangle Mesh
	const PxU8 * trimeshGeomPtr = reinterpret_cast<const PxU8 *>(trimeshShape.hullOrMeshPtr);
	const float4 * trimeshVerts;
	const uint4 * trimeshTriIndices;
	const uint4* PX_RESTRICT trimeshTriAdjacencies;
	const PxU32* PX_RESTRICT trimeshFaceRemap;
	readTriangleMesh(trimeshGeomPtr, trimeshVerts, trimeshTriIndices, trimeshTriAdjacencies, trimeshFaceRemap);
	const PxMeshScale& scale = trimeshShape.scale;
	MeshScaling meshScale(scale.scale, scale.rotation);
	PxVec3 v0, v1, v2;
	getShapeSpaceVerts(v0, v1, v2, meshScale, triangleIdx, trimeshTriIndices, trimeshVerts);
	const PxVec3 n = (v1 - v0).cross(v2 - v0).getNormalized();
	uint4 triAdjTriIndices = trimeshTriAdjacencies[triangleIdx];
	// A mirroring scale flips the winding, so the edge adjacency order must be
	// swapped to stay in sync with the swapped vertices.
	if (meshScale.flipNormal)
		PxSwap(triAdjTriIndices.x, triAdjTriIndices.z);
	const PxU32 remapCpuTriangleIdx = trimeshFaceRemap[triangleIdx];
	const PxReal inflatedRadius = sphereRadius + contactDist;
	const PxReal d = v0.dot(n);
	// Signed distance of the sphere center to the triangle plane; only
	// front-facing triangles (center on the normal side) generate contacts.
	const PxReal dist0 = sphereCenterInTriMesh.dot(n) - d;
	PxReal separation = PX_MAX_F32;
	PxU32 nbOutputContacts = 0;
	bool delayContacts = false;
	bool edgeContact = false;
	PxVec3 patchNormal(0.f);
	if (dist0 >= 0)
	{
		PxVec3 closestP;
		//mSphereCenter will be in the local space of the triangle mesh
		bool generateContact = false;
		bool faceContact = false;
		PxU32 mask;
		const PxReal sqDist = distancePointTriangleSquared(sphereCenterInTriMesh, v0, v1, v2, triAdjTriIndices, closestP, generateContact, faceContact, &mask);
		const PxReal sqInflatedSphereRadius = inflatedRadius * inflatedRadius;
		//sphere overlap with triangles
		if (sqInflatedSphereRadius > sqDist && generateContact)
		{
			//printf("remapCpuTriangleIdx %i sqInflatedSphereRadius %f sqDist %f faceContact %i\n", remapCpuTriangleIdx, sqInflatedSphereRadius, sqDist, PxU32(faceContact));
			// Face contact uses the triangle normal; edge/vertex contact uses
			// the direction from the closest point to the sphere center.
			patchNormal = n;
			if (!faceContact)
				patchNormal = (sphereCenterInTriMesh - closestP).getNormalized();
			//printf("remapCpuTriangleIdx %i closestP(%f, %f, %f)\n", remapCpuTriangleIdx, closestP.x, closestP.y, closestP.z);
			const PxReal cosTheta = patchNormal.dot(n);
			const PxReal tolerance = 0.996f;//around 5 degree
			//two normal's projection less than 5 degree, generate contacts
			delayContacts = cosTheta <= tolerance;
			if (delayContacts)
			{
				//delay contacts
				//PxU32* PX_RESTRICT sphereTriSecondPairPass = *sphereTriSecondPassPairPtr;
				//const PxU32 startIndex = atomicAdd(nbSecondPassPairs, 1);
				// PT: no need to compute the mask, we got it for free from distancePointTriangleSquared
				//PxU32 mask = getTriangleActiveMask(closestP, v0, v1, v2);
				//sphereTriSecondPairPass[startIndex] = mask | globalThreadIndex;
				// Caller compacts deferred pairs warp-wide using this mask.
				outMask = mask | globalThreadIndex;
				// VR: is an edge contact?
				edgeContact = ConvexTriIntermediateData::isEdgeContact(mask);
			}
			nbOutputContacts = 1;
			separation = PxSqrt(sqDist);
			// Generate contacts
			//SphereTriContacts* PX_RESTRICT sphereTriContacts = *sphereTriContactsPtr;
			//float4* contactBuf = reinterpret_cast<float4 *>(sphereTriContacts + sphereTriPairOffset);
			//sphere center in sphere local space
			//contactBuf[0] = make_float4(0.f, 0.f, 0.f, separation);
			//normal = patchNormal;
			outSep = separation;
			outContact = PxVec3(0.f);
		}
	}
	// The outputs below are written unconditionally, even when no contact was
	// generated (separation stays PX_MAX_F32, nbOutputContacts stays 0).
	PxReal* PX_RESTRICT sphereTriMaxDepth = *sphereTriMaxDepthPtr;
	sphereTriMaxDepth[sphereTriPairOffset] = separation;
	SphereTriIntermediateData* PX_RESTRICT sphereTriInterm = *sphereTriIntermPtr;
	PxU32* PX_RESTRICT orderedSphereTriInterm = *orderedSphereTriIntermPtr;
	sphereTriInterm[sphereTriPairOffset].gpuTriIndex = triangleIdx;
	// VR: bits 30 and 31 are used for the search flags
	assert(triangleIdx < (1 << 30));
	// Tag the ordered index with the contact kind (the 'else' binds to the
	// inner 'if'): immediate face contact or deferred edge contact.
	PxU32 orderedTriangleIdx = triangleIdx;
	if (nbOutputContacts)
		if (!delayContacts) // not dalayed -> face contact
			orderedTriangleIdx |= TriangleSearchFlags::eFACE_CONTACT;
		else if (edgeContact)
			orderedTriangleIdx |= TriangleSearchFlags::eEDGE_CONTACT;
	orderedSphereTriInterm[sphereTriPairOffsetPadded] = orderedTriangleIdx;
	SphereTriNormalAndIndex* PX_RESTRICT sphereTriNI = *sphereTriNIPtr;
	PxU32 delayContactMask = delayContacts ? SphereTriNormalAndIndex::DeferredContactMask : 0;
	//rotate the normal into A space
	PxVec3 worldNormal = trimeshTransformCached.transform.rotate(-patchNormal);
	sphereTriNI[sphereTriPairOffset].normal = sphereTransformCached.transform.rotateInv(worldNormal);
	sphereTriNI[sphereTriPairOffset].index = delayContactMask + (nbOutputContacts << SphereTriNormalAndIndex::NbContactsShift) + remapCpuTriangleIdx;
	//printf("sphereTriPairOffset %i nbOutputContacts%i \n", sphereTriPairOffset, nbOutputContacts);
	return nbOutputContacts;
}
// Per-thread capsule-vs-mesh-triangle narrowphase. Generates up to several
// contacts (written to 'contacts'/'separations' in capsule local space) and
// returns the contact count. Like the sphere variant, it always records max
// depth, intermediate data and normal/index for the pair, and defers contacts
// whose normal deviates from the face normal by more than ~5 degrees via
// outMask.
__device__ PxU32 capsuleTrimeshNarrowphaseCore(
	PxU32 globalThreadIndex,
	const PxgContactManagerInput& npWorkItem,
	const PxgShape& capsuleShape,
	const PxgShape& trimeshShape,
	const uint4 curPair, // x = CM index, y = triangle index, z = offset within the CM's pair range
	const PxsCachedTransform* PX_RESTRICT transformCache,
	const PxReal* PX_RESTRICT contactDistance,
	SphereMeshPair* PX_RESTRICT sphereTrimeshPair, // per CM
	SphereTriNormalAndIndex** PX_RESTRICT sphereTriNIPtr, // per sphere-tri
	SphereTriContacts** PX_RESTRICT sphereTriContactsPtr, // per sphere-tri
	PxReal** PX_RESTRICT sphereTriMaxDepthPtr, // per sphere-tri
	SphereTriIntermediateData** PX_RESTRICT sphereTriIntermPtr, // per sphere-tri
	PxU32** PX_RESTRICT orderedSphereTriIntermPtr, // per sphere-tri
	PxU32** PX_RESTRICT sphereTriSecondPassPairPtr, // per sphere-tri
	PxU32* nbSecondPassPairs,
	PxVec3* contacts, // out: contact points, capsule local space
	PxReal* separations, // out: per-contact separations
	PxU32& outMask // deferred-pair mask (edge/vertex bits | thread index), 0 if not deferred
)
{
	// Pair, stackPtr in convexMeshMidphase kernel
	PxU32 cmIdx = curPair.x;
	PxU32 triangleIdx = curPair.y;
	PxU32 testOffset = curPair.z;
	PxsCachedTransform capsuleTransformCached = transformCache[npWorkItem.transformCacheRef0];
	PxsCachedTransform trimeshTransformCached = transformCache[npWorkItem.transformCacheRef1];
	SphereMeshPair& pair = sphereTrimeshPair[cmIdx];
	// Output offsets; the padded variant indexes the rounded/aligned
	// ordered-intermediate array.
	PxU32 sphereTriPairOffset = pair.startIndex + testOffset;
	PxU32 sphereTriPairOffsetPadded = pair.roundedStartIndex + testOffset;
	// Shapes
	const PxReal contactDist = contactDistance[npWorkItem.transformCacheRef0] + contactDistance[npWorkItem.transformCacheRef1];
	const PxQuat trimeshScaleRot = trimeshShape.scale.rotation;
	// Capsule shapes pack half-height in scale.scale.x and radius in scale.scale.y.
	const PxReal capsuleRadius = capsuleShape.scale.scale.y;
	const PxReal capsuleHalfHeight = capsuleShape.scale.scale.x;
	const PxVec3 trimeshScale = trimeshShape.scale.scale;
	//Geometries : Capsule
	const PxTransform meshToCapsule = capsuleTransformCached.transform.transformInv(trimeshTransformCached.transform);
	//Capsule in triangle mesh space
	const PxVec3 tmp = capsuleTransformCached.transform.q.getBasisVector0() * capsuleHalfHeight;
	const PxVec3 capsuleCenterInMesh = trimeshTransformCached.transform.transformInv(capsuleTransformCached.transform.p);
	const PxVec3 capsuleDirInMesh = trimeshTransformCached.transform.rotateInv(tmp);
	//Geometries : Capsule in mesh local space
	// Segment endpoints of the capsule axis, mesh local space.
	const PxVec3 p0 = capsuleCenterInMesh + capsuleDirInMesh;
	const PxVec3 p1 = capsuleCenterInMesh - capsuleDirInMesh;
	// Geometries : Triangle Mesh
	const PxU8 * trimeshGeomPtr = reinterpret_cast<const PxU8 *>(trimeshShape.hullOrMeshPtr);
	const float4 * trimeshVerts;
	const uint4 * trimeshTriIndices;
	const uint4* PX_RESTRICT trimeshTriAdjacencies;
	readTriangleMesh(trimeshGeomPtr, trimeshVerts, trimeshTriIndices, trimeshTriAdjacencies);
	const PxMeshScale& scale = trimeshShape.scale;
	MeshScaling meshScale(scale.scale, scale.rotation);
	PxVec3 v0, v1, v2;
	getShapeSpaceVerts(v0, v1, v2, meshScale, triangleIdx, trimeshTriIndices, trimeshVerts);
	const PxVec3 triNormal = (v1 - v0).cross(v2 - v0).getNormalized();
	uint4 triAdjTrisIdx = trimeshTriAdjacencies[triangleIdx];
	// A mirroring scale flips the winding, so the edge adjacency order must be
	// swapped to stay in sync with the swapped vertices.
	if (meshScale.flipNormal)
		PxSwap(triAdjTrisIdx.x, triAdjTrisIdx.z);
	const PxReal inflatedRadius = capsuleRadius + contactDist;
	//const PxReal d = v0.dot(n);
	//const PxReal dist0 = sphereCenterInTriMesh.dot(n) - d;
	// Signed distance of the capsule center to the triangle plane; only
	// front-facing triangles generate contacts.
	const PxReal dist = (capsuleCenterInMesh - v0).dot(triNormal);
	PxReal separation = PX_MAX_F32;
	PxU32 nbOutputContacts = 0;
	PxVec3 patchNormal(0.f);
	bool delayContacts = false;
	bool edgeContact = false;
	if (dist >= 0)
	{
		//PxVec3 closestP;
		// t = parameter along the capsule segment, (u, v) = triangle barycentrics
		// of the closest-point pair.
		PxReal t, u, v;
		PxReal sqDist = distanceSegmentTriangleSquared(p0, p1, v0, v1, v2, t, u, v);
		const PxReal sqInflatedRadius = inflatedRadius * inflatedRadius;
		//capsule overlap with triangles
		if (sqInflatedRadius > sqDist)
		{
			// VR: I made this similar to sphere-trimesh
			// and I left the old variant for the reference
#if 1
			if (sqDist < FLT_EPSILON)
			{
				//segment intersect with the triangle
				patchNormal = triNormal;
			}
			else
			{
				// Patch normal from the closest-point pair; defer the contact if
				// it deviates from the face normal by more than ~5 degrees.
				const PxVec3 pointOnSegment = p0 + (p1 - p0) * t;
				const PxVec3 pointOnTriangle = v0 * (1.f - u - v) + v1 * u + v2 * v;
				patchNormal = (pointOnSegment - pointOnTriangle).getNormalized();
				const PxReal cosTheta = patchNormal.dot(triNormal);
				const PxReal tolerance = 0.996f;//around 5 degree
				//printf("pointOnSegment: (%g %g %g); pointOnTriangle: (%g %g %g); cosTheta: %g\n", pointOnSegment.x, pointOnSegment.y, pointOnSegment.z, pointOnTriangle.x, pointOnTriangle.y, pointOnTriangle.z, cosTheta);
				//two normal's projection less than 5 degree, generate contacts
				delayContacts = cosTheta <= tolerance;
			}
			if (delayContacts)
			{
				// Caller compacts deferred pairs warp-wide using this mask.
				PxU32 mask = getTriangleActiveMaskFromNormal(v0, v1, v2, patchNormal);
				outMask = mask | globalThreadIndex;
				//const PxVec3 pn = patchNormal;
				//printf("pn: (%g %g %g); mask: %x\n", pn.x, pn.y, pn.z, mask);
				// is an edge contact?
				edgeContact = ConvexTriIntermediateData::isEdgeContact(mask);
			}
#else
			PxU32 triFlags = 0;
			triFlags |= (triAdjTrisIdx.x != BOUNDARY && (triAdjTrisIdx.x & NONCONVEX_FLAG) ? 0 : 1 << 3);
			triFlags |= (triAdjTrisIdx.y != BOUNDARY && (triAdjTrisIdx.y & NONCONVEX_FLAG) ? 0 : 1 << 4);
			triFlags |= (triAdjTrisIdx.z != BOUNDARY && (triAdjTrisIdx.z & NONCONVEX_FLAG) ? 0 : 1 << 5);
			if (selectNormal(u, v, triFlags))
			{
				patchNormal = triNormal;
			}
			else
			{
				if (sqDist < 1e-6f)
				{
					//segment intersect with the triangle
					patchNormal = triNormal;
				}
				else
				{
					//edge normal
					const PxVec3 pq = p1 - p0;
					const PxVec3 pointOnSegment = p0 + pq * t; // V3ScaleAdd(pq, t, mCapsule.p0);
					const PxReal w = 1.f - (u + v);// FSub(FOne(), FAdd(u, v));
					const PxVec3 pointOnTriangle = v0 * w + v1 * u + v2 * v;// V3ScaleAdd(p0, w, V3ScaleAdd(p1, u, V3Scale(p2, v)));
					patchNormal = (pointOnSegment - pointOnTriangle).getNormalized();// V3Normalize(V3Sub(pointOnSegment, pointOnTriangle));
					delayContacts = true;
				}
			}
#endif
			// Face contacts first, then edge-edge contacts appended; track the
			// deepest separation across all generated contacts.
			nbOutputContacts = generateContacts(v0, v1, v2, triNormal, patchNormal, p0, p1, inflatedRadius, contacts, separations);
			generateEEContacts(v0, v1, v2, patchNormal, p0, p1, sqInflatedRadius, contacts, separations, nbOutputContacts);
			for (PxU32 i = 0; i < nbOutputContacts; ++i)
			{
				//transform contact back to the capsule space
				contacts[i] = meshToCapsule.transform(contacts[i]);
				const PxReal pen = separations[i];
				separation = PxMin(separation, pen);
			}
		}
	}
	// The outputs below are written unconditionally, even when no contact was
	// generated (separation stays PX_MAX_F32, nbOutputContacts stays 0).
	SphereTriIntermediateData* PX_RESTRICT sphereTriInterm = *sphereTriIntermPtr;
	PxU32* PX_RESTRICT orderedSphereTriInterm = *orderedSphereTriIntermPtr;
	sphereTriInterm[sphereTriPairOffset].gpuTriIndex = triangleIdx;
	//if (nbOutputContacts)
	//	printf("triangleIdx: %d; nbOutputContacts: %d; delayContacts %d; edgeContact: %d\n", triangleIdx, nbOutputContacts, delayContacts, edgeContact);
	// VR: bits 30 and 31 are used for the search flags
	assert(triangleIdx < (1 << 30));
	// Tag the ordered index with the contact kind (the 'else' binds to the
	// inner 'if'): immediate face contact or deferred edge contact.
	PxU32 orderedTriangleIdx = triangleIdx;
	if (nbOutputContacts)
		if (!delayContacts) // not dalayed -> face contact
			orderedTriangleIdx |= TriangleSearchFlags::eFACE_CONTACT;
		else if (edgeContact)
			orderedTriangleIdx |= TriangleSearchFlags::eEDGE_CONTACT;
	orderedSphereTriInterm[sphereTriPairOffsetPadded] = orderedTriangleIdx;
	/*sphereTriInterm[sphereTriPairOffset].triAdjTrisIds[0] = triAdjTrisIdx.x;
	sphereTriInterm[sphereTriPairOffset].triAdjTrisIds[1] = triAdjTrisIdx.y;
	sphereTriInterm[sphereTriPairOffset].triAdjTrisIds[2] = triAdjTrisIdx.z;*/
	//PxReal* PX_RESTRICT sphereTriMaxDepth = sphereTriMaxDepthPtr[sphereTriPairOffset];
	PxReal* PX_RESTRICT capsuleTriMaxDepth = *sphereTriMaxDepthPtr;
	capsuleTriMaxDepth[sphereTriPairOffset] = separation;
	SphereTriNormalAndIndex* PX_RESTRICT sphereTriNI = *sphereTriNIPtr;
	PxU32 delayContactMask = delayContacts ? SphereTriNormalAndIndex::DeferredContactMask : 0;
	//rotate the normal into A space
	sphereTriNI[sphereTriPairOffset].normal = meshToCapsule.rotate(-patchNormal);
	// NOTE(review): stores the GPU triangle index here, whereas
	// sphereTrimeshNarrowphaseCore stores the remapped CPU index
	// (this overload of readTriangleMesh does not fetch the face remap) —
	// confirm this difference is intentional.
	sphereTriNI[sphereTriPairOffset].index = delayContactMask + (nbOutputContacts << SphereTriNormalAndIndex::NbContactsShift) + triangleIdx;
	////printf("sphereTriPairOffset %i nbOutputContacts%i \n", sphereTriPairOffset, nbOutputContacts);
	return nbOutputContacts;
}
// Narrowphase kernel for convex-vs-triangle-mesh pairs produced by the
// midphase. One warp processes one convex/triangle pair: the shapes are staged
// cooperatively into shared memory, a potential shape-order flip is resolved
// (the mesh must be shape B), then the per-warp core routine runs.
extern "C" __global__
//__launch_bounds__(NP_TRIMESH_WARPS_PER_BLOCK * WARP_SIZE, 32 / NP_TRIMESH_WARPS_PER_BLOCK)
void convexTrimeshNarrowphase(
	const PxgContactManagerInput* PX_RESTRICT cmInputs,
	const PxsCachedTransform* PX_RESTRICT transformCache,
	const PxReal* PX_RESTRICT contactDistance,
	const PxgShape* PX_RESTRICT gpuShapes,
	ConvexMeshPair* PX_RESTRICT cvxTrimeshPair, // per CM
	ConvexTriNormalAndIndex** PX_RESTRICT cvxTriNIPtr, // per cvx-tri
	ConvexTriContacts** PX_RESTRICT cvxTriContactsPtr, // per cvx-tri
	PxReal** PX_RESTRICT cvxTriMaxDepthPtr, // per cvx-tri
	ConvexTriIntermediateData** PX_RESTRICT cvxTriIntermPtr, // per cvx-tri
	PxU32** PX_RESTRICT orderedCvxTriIntermPtr, // per cvx-tri
	PxU32** PX_RESTRICT cvxTriSecondPassPairsPtr, // per cvx-tri
	PxU8* PX_RESTRICT stackPtr, // stack memory the per-pair output arrays are carved from
	PxU32* PX_RESTRICT nbPairsGlobal, // number of midphase pairs to process
	PxU32* PX_RESTRICT nbPaddedPairsGlobal,
	PxU32* PX_RESTRICT nbSecondPassPairs, // running count of deferred pairs (atomic)
	const PxU32 stackSizeBytes,
	ConvexTriContact* tempConvexTriContacts,
	PxU32* pTempContactIndex,
	PxU32* maxTempMemRequirement,
	PxU32* midPhasePairsNeeded,
	const PxU32 nbContactManagers
)
{
	// Block-shared copies of the pointers carved out of the stack allocator by
	// midphaseAllocate(); computed by one thread, read by all.
	__shared__ ConvexTriNormalAndIndex* sCvxTriNIPtr;
	__shared__ ConvexTriContacts* sCvxTriContactsPtr;
	__shared__ PxReal* sCvxTriMaxDepthPtr;
	__shared__ ConvexTriIntermediateData* sCvxTriIntermPtr;
	__shared__ PxU32* sOrderedCvxTriIntermPtr;
	__shared__ uint4* sPairsGPU;
	__shared__ PxU32* sCvxTriSecondPassPairsPtr;
	// Per-warp scratch slice handed to the core routine (reused as ConvexMeshScratch).
	const PxU32 warpsPerBlock = NP_TRIMESH_WARPS_PER_BLOCK;
	__shared__ volatile PxU32 sharedMem[warpsPerBlock][WARP_SIZE * 16];
	volatile PxU32* s_WarpSharedMemory = sharedMem[threadIdx.y];
	// we don't need to check overflow here anymore, the midphase checks it.
	const PxU32 nbPairs = *nbPairsGlobal;
	const PxU32 nbPaddedPairs = *nbPaddedPairsGlobal;
	//each block assign the corresponding ptr from the stack memory to sCvxTriNIPtr, sCvxTriContactPtr and sCvxTriMaxDepthPtr
	if (threadIdx.x == 0 && threadIdx.y == 0)
	{
		midphaseAllocate(&sCvxTriNIPtr, &sCvxTriContactsPtr, &sCvxTriMaxDepthPtr, &sCvxTriIntermPtr, &sOrderedCvxTriIntermPtr, &sCvxTriSecondPassPairsPtr, &sPairsGPU, stackPtr, nbPairs, nbPaddedPairs);
		if (blockIdx.x == 0)
		{
			// Report worst-case temp memory so the host can grow the buffer if needed.
			atomicMax(maxTempMemRequirement, calculateConvexMeshPairMemRequirement() * (*midPhasePairsNeeded) + calculateAdditionalPadding(nbContactManagers));
		}
	}
	__syncthreads();
	// the first block writes back the pointers to global memory.
	if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0)
	{
		*cvxTriNIPtr = sCvxTriNIPtr;
		*cvxTriContactsPtr = sCvxTriContactsPtr;
		*cvxTriMaxDepthPtr = sCvxTriMaxDepthPtr;
		*cvxTriIntermPtr = sCvxTriIntermPtr;
		*orderedCvxTriIntermPtr = sOrderedCvxTriIntermPtr;
		*cvxTriSecondPassPairsPtr = sCvxTriSecondPassPairsPtr;
	}
	// 1 warp deals with 1 convexMeshPair - so 1 convex against 1 mesh triangle.
	// Grid-stride over warps: warp id = blockIdx.x * blockDim.y + threadIdx.y.
	for (PxU32 globalWarpIndex = blockIdx.x * blockDim.y + threadIdx.y; globalWarpIndex < nbPairs; globalWarpIndex += gridDim.x * blockDim.y)
	{
		uint4 curPair = sPairsGPU[globalWarpIndex];
		const PxU32 cmIdx = curPair.x;
		PxgContactManagerInput npWorkItem;
		PxgContactManagerInput_ReadWarp(npWorkItem, cmInputs, cmIdx);
		PxU32 transformCacheRef0 = npWorkItem.transformCacheRef0;
		PxU32 transformCacheRef1 = npWorkItem.transformCacheRef1;
		PxU32 shapeRef0 = npWorkItem.shapeRef0;
		PxU32 shapeRef1 = npWorkItem.shapeRef1;
		// Per-warp staging of both shapes, loaded one 32-bit word per lane.
		__shared__ __align__(16) char sShape0[sizeof(PxgShape) * NP_TRIMESH_WARPS_PER_BLOCK];
		__shared__ __align__(16) char sShape1[sizeof(PxgShape) * NP_TRIMESH_WARPS_PER_BLOCK];
		PxgShape* shape0 = reinterpret_cast<PxgShape*>(sShape0);
		PxgShape* shape1 = reinterpret_cast<PxgShape*>(sShape1);
		if (threadIdx.x < sizeof(PxgShape) / sizeof(PxU32))
		{
			reinterpret_cast<PxU32*>(&shape0[threadIdx.y])[threadIdx.x] = reinterpret_cast<const PxU32*>(gpuShapes + shapeRef0)[threadIdx.x];
			reinterpret_cast<PxU32*>(&shape1[threadIdx.y])[threadIdx.x] = reinterpret_cast<const PxU32*>(gpuShapes + shapeRef1)[threadIdx.x];
		}
		PxgShape* shape = &shape0[threadIdx.y];
		PxgShape* trimeshShape = &shape1[threadIdx.y];
		__syncwarp();
		// If the pair arrived mesh-first, swap so the convex is always shape A
		// and the mesh shape B.
		bool flip = shape0[threadIdx.y].type == PxGeometryType::eTRIANGLEMESH;
		if (flip)
		{
			PxSwap(transformCacheRef0, transformCacheRef1);
			PxSwap(shapeRef0, shapeRef1);
			shape = &shape1[threadIdx.y];
			trimeshShape = &shape0[threadIdx.y];
		}
		convexTrimeshNarrowphaseCore(
			globalWarpIndex,
			*shape,
			*trimeshShape,
			transformCacheRef0,
			transformCacheRef1,
			curPair,
			transformCache,
			contactDistance,
			gpuShapes,
			cvxTrimeshPair,
			&sCvxTriNIPtr,
			&sCvxTriContactsPtr,
			&sCvxTriMaxDepthPtr,
			&sCvxTriIntermPtr,
			&sOrderedCvxTriIntermPtr,
			&sCvxTriSecondPassPairsPtr,
			sPairsGPU,
			nbPairsGlobal,
			nbSecondPassPairs,
			s_WarpSharedMemory,
			tempConvexTriContacts,
			stackSizeBytes,
			pTempContactIndex
		);
	}
}
// Narrowphase kernel for sphere-vs-trimesh and capsule-vs-trimesh pairs.
// One thread processes one midphase-generated (shape, triangle) pair; warp-wide
// ballots/scans are then used to compact the "second pass" pair list and to
// reserve slots for the generated contacts in the temp contact buffer.
extern "C" __global__
void sphereTrimeshNarrowphase(
	const PxgContactManagerInput* PX_RESTRICT cmInputs,
	const PxsCachedTransform* PX_RESTRICT transformCache,
	const PxReal* PX_RESTRICT contactDistance,
	const PxgShape* PX_RESTRICT gpuShapes,
	SphereMeshPair* PX_RESTRICT sphereTrimeshPair, // per CM
	SphereTriNormalAndIndex** PX_RESTRICT sphereTriNIPtr, // per sphere-tri
	SphereTriContacts** PX_RESTRICT sphereTriContactsPtr, // per sphere-tri
	PxReal** PX_RESTRICT sphereTriMaxDepthPtr, // per sphere-tri
	SphereTriIntermediateData** PX_RESTRICT sphereTriIntermPtr, // per sphere-tri
	PxU32** PX_RESTRICT orderedSphereTriIntermPtr, // per sphere-tri
	PxU32** PX_RESTRICT sphereTriSecondPassPairsPtr, // per sphere-tri
	PxU8* PX_RESTRICT stackPtr,
	PxU32* PX_RESTRICT nbPairsGlobal,
	PxU32* PX_RESTRICT nbPaddedPairsGlobal,
	PxU32* PX_RESTRICT nbSecondPassPairs,
	const PxU32 stackSizeBytes,
	ConvexTriContact* tempConvexTriContacts,
	PxU32* pTempContactIndex,
	PxU32* maxTempMemRequirement,
	PxU32* midphasePairsNeeded,
	const PxU32 nbContactManagers
)
{
	//the pointers live in shared memory. However, the memory they point to is in global memory
	__shared__ SphereTriNormalAndIndex* sSphereTriNIPtr;
	__shared__ SphereTriContacts* sSphereTriContactsPtr;
	__shared__ PxReal* sSphereTriMaxDepthPtr;
	__shared__ SphereTriIntermediateData* sSphereTriIntermPtr;
	__shared__ PxU32* sOrderedSphereTriIntermPtr;
	__shared__ uint4* sPairsGPU;
	__shared__ PxU32* sSphereTriSecondPassPairsPtr;
	// Per-thread contact staging area: 2 slots per thread (see idx = 2*threadIdx.x
	// below), so the 128-entry arrays assume blockDim.x <= 64 — TODO confirm.
	__shared__ char sContacts[sizeof(PxVec3) * 128];
	PxVec3* shContacts = reinterpret_cast<PxVec3*>(sContacts);
	__shared__ PxReal shSeparations[128];
	const PxU32 nbPairs = *nbPairsGlobal;
	const PxU32 nbPaddedPairs = *nbPaddedPairsGlobal;
	if (threadIdx.x == 0 && threadIdx.y == 0)
	{
		// Carve the per-pair output arrays out of the collision stack (stackPtr).
		midphaseAllocate(&sSphereTriNIPtr, &sSphereTriContactsPtr, &sSphereTriMaxDepthPtr, &sSphereTriIntermPtr, &sOrderedSphereTriIntermPtr, &sSphereTriSecondPassPairsPtr, &sPairsGPU, stackPtr, nbPairs, nbPaddedPairs);
		if (blockIdx.x == 0)
		{
			// Record the worst-case temp-memory requirement for this batch.
			atomicMax(maxTempMemRequirement, calculateConvexMeshPairMemRequirement() * (*midphasePairsNeeded) + calculateAdditionalPadding(nbContactManagers));
		}
	}
	__syncthreads();
	// Publish the stack-allocated pointers to global memory so that downstream
	// kernels (sorting/correlation) can find the per-pair arrays.
	if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0)
	{
		*sphereTriNIPtr = sSphereTriNIPtr;
		*sphereTriContactsPtr = sSphereTriContactsPtr;
		*sphereTriMaxDepthPtr = sSphereTriMaxDepthPtr;
		*sphereTriIntermPtr = sSphereTriIntermPtr;
		*orderedSphereTriIntermPtr = sOrderedSphereTriIntermPtr;
		*sphereTriSecondPassPairsPtr = sSphereTriSecondPassPairsPtr;
	}
	//each thread deals with one pair per iteration; grid-stride over all pairs.
	//The loop bound is uniform across the warp, so every lane reaches the
	//__ballot_sync/__shfl_sync calls below even when its own index is out of range.
	//for (PxU32 globalThreadIndex = blockIdx.x * blockDim.x + threadIdx.x; globalThreadIndex < nbPairs; globalThreadIndex += gridDim.x * blockDim.x)
	for (PxU32 i = 0; i < nbPairs; i += gridDim.x * blockDim.x)
	{
		PxU32 globalThreadIndex = blockIdx.x * blockDim.x + threadIdx.x + i;
		PxU32 nbContacts = 0;
		PxU32 outMask = 0;
		uint4 curPair;
		if (globalThreadIndex < nbPairs)
		{
			//printf("globalThreadIndex %i nbPairs %i\n", globalThreadIndex, nbPairs);
			// Pair, stackPtr in convexMeshMidphase kernel
			curPair = sPairsGPU[globalThreadIndex];
			const PxU32 cmIdx = curPair.x;
			PxgContactManagerInput npWorkItem = cmInputs[cmIdx];
			const PxgShape* shape0 = &gpuShapes[npWorkItem.shapeRef0];
			const PxgShape* trimeshShape = &gpuShapes[npWorkItem.shapeRef1];
			PxGeometryType::Enum type0 = PxGeometryType::Enum(shape0->type);
			// Canonicalize the pair so that shape0 is the sphere/capsule and
			// shape1 is the triangle mesh.
			bool flip = type0 == PxGeometryType::eTRIANGLEMESH;
			if (flip)
			{
				PxSwap(npWorkItem.shapeRef0, npWorkItem.shapeRef1);
				PxSwap(npWorkItem.transformCacheRef0, npWorkItem.transformCacheRef1);
				PxSwap(shape0, trimeshShape);
				type0 = PxGeometryType::Enum(shape0->type);
			}
			// NOTE(review): the sphere path passes the staging slots by
			// reference while the capsule path passes pointers — presumably
			// matching different core signatures; verify against the headers.
			if (type0 == PxGeometryType::eSPHERE)
			{
				nbContacts = sphereTrimeshNarrowphaseCore(
					globalThreadIndex,
					npWorkItem,
					*shape0,
					*trimeshShape,
					curPair,
					transformCache,
					contactDistance,
					sphereTrimeshPair,
					&sSphereTriNIPtr,
					&sSphereTriContactsPtr,
					&sSphereTriMaxDepthPtr,
					&sSphereTriIntermPtr,
					&sOrderedSphereTriIntermPtr,
					&sSphereTriSecondPassPairsPtr,
					nbPairsGlobal,
					nbSecondPassPairs,
					shContacts[threadIdx.x*2],
					shSeparations[threadIdx.x*2],
					outMask
				);
			}
			else
			{
				nbContacts = capsuleTrimeshNarrowphaseCore(
					globalThreadIndex,
					npWorkItem,
					*shape0,
					*trimeshShape,
					curPair,
					transformCache,
					contactDistance,
					sphereTrimeshPair,
					&sSphereTriNIPtr,
					&sSphereTriContactsPtr,
					&sSphereTriMaxDepthPtr,
					&sSphereTriIntermPtr,
					&sOrderedSphereTriIntermPtr,
					&sSphereTriSecondPassPairsPtr,
					nbSecondPassPairs,
					&shContacts[threadIdx.x * 2],
					&shSeparations[threadIdx.x * 2],
					outMask
				);
			}
		}
		// Compact the per-lane second-pass masks across the warp.
		PxU32 outMaskWriteMask = __ballot_sync(FULL_MASK, outMask);
		PxU32 count = __popc(outMaskWriteMask);
		PxU32 startIndex = 0;
		const PxU32 threadIndexInWarp = threadIdx.x & 31;
		//Now output contacts!!!!
		// Inclusive prefix sum of per-lane contact counts; lane 31 holds the total.
		PxU32 inclusiveSum = warpScan<AddOpPxU32, PxU32>(FULL_MASK, nbContacts);
		PxU32 totalContacts = __shfl_sync(FULL_MASK, inclusiveSum, 31);
		if (totalContacts)
		{
			if (count)
			{
				// Lane 0 reserves space for the warp's second-pass pairs, then
				// broadcasts the base index to all lanes.
				if (threadIndexInWarp == 0)
					startIndex = atomicAdd(nbSecondPassPairs, count);
				startIndex = __shfl_sync(FULL_MASK, startIndex, 0);
				if (outMask)
				{
					// Offset = number of lanes below this one that also have a mask.
					PxU32 offset = warpScanExclusive(outMaskWriteMask, threadIndexInWarp);
					sSphereTriSecondPassPairsPtr[startIndex + offset] = outMask;
				}
			}
			// Lane 31 reserves totalContacts slots in the temp contact buffer;
			// each lane's own range starts at (exclusive scan) = inclusiveSum - nbContacts.
			if (threadIndexInWarp == 31)
				startIndex = atomicAdd(pTempContactIndex, inclusiveSum);
			startIndex = __shfl_sync(FULL_MASK, startIndex, 31);
			startIndex += inclusiveSum - nbContacts;
			// Copy this lane's staged contacts (2 shared slots per thread) out
			// to the global temp buffer.
			for (PxU32 i = 0, idx = 2*threadIdx.x; i < nbContacts; ++i, idx++)
			{
				tempConvexTriContacts[startIndex + i].contact_sepW = make_float4(shContacts[idx].x, shContacts[idx].y,
					shContacts[idx].z, shSeparations[idx]);
			}
			// nbContacts > 0 implies globalThreadIndex < nbPairs, so curPair
			// was initialized above.
			if (nbContacts)
			{
				PxU32 cmIdx = curPair.x;
				PxU32 testOffset = curPair.z;
				SphereMeshPair& pair = sphereTrimeshPair[cmIdx];
				PxU32 sphereTriPairOffset = pair.startIndex + testOffset;
				// Record where this pair's contacts start in the temp buffer.
				sSphereTriContactsPtr[sphereTriPairOffset].index = startIndex;
			}
		}
	}
}

View File

@@ -0,0 +1,379 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "foundation/PxMath.h"
#include "foundation/PxVec3.h"
#include "foundation/PxTransform.h"
#include "AlignedTransform.h"
#include "PxgContactManager.h"
#include "PxgNpKernelIndices.h"
#include "PxgPersistentContactManifold.h"
#include "PxsContactManagerState.h"
#include "PxgCommonDefines.h"
#include "reduction.cuh"
#include "contactReduction.cuh"
using namespace physx;
// Empty host-side entry point — presumably referenced from CPU code so this
// translation unit's kernels get linked/registered; TODO confirm.
extern "C" __host__ void initNarrowphaseKernels3() {}
#include "manifold.cuh"
#include "nputils.cuh"
#include "cudaNpCommon.h"
#include "convexNpCommon.h"
//This method compacts the depthBuffer + nicBuffer to only include the set of triangles that actually have contacts.
//These then must be sorted based on depth values to construct patches, then appended and then output.
//Warp-cooperative: all lanes in syncMask must call this together. Returns the
//number of surviving (contact-bearing) triangles; for each survivor, tempCounts
//receives the original slot index (<<4) packed with its contact count.
__device__ PX_FORCE_INLINE
int purgeEmptyTriangles(const PxU32 syncMask, PxReal* depthBuffer, ConvexTriNormalAndIndex* nicBuffer, int count, PxU32* tempCounts)
{
	const int tI = threadIdx.x;
	PxU32 nbTriangles = 0;	// triangles compacted so far
	int loaded = 0;
	// Process the pair's triangle range in warp-sized chunks.
	for (; loaded < count; loaded += 32)
	{
		int nbToLoad = PxMin(count - loaded, 32);
		PxReal d = tI < nbToLoad ? depthBuffer[loaded + tI] : FLT_MAX;
		// Guard the nicBuffer read with the same in-range test as the depth read.
		// (Previously the condition was "nbToLoad ?", which is always true inside
		// the loop, so lanes with tI >= nbToLoad read past the pair's range.)
		PxU32 nbContacts = tI < nbToLoad ? ConvexTriNormalAndIndex::getNbContacts(nicBuffer[loaded + tI].index) : 0;
		int valid = d != FLT_MAX && nbContacts > 0;
		// Stream-compact: each valid lane's destination is the number of valid
		// lanes below it plus the triangles written by previous chunks.
		int validBits = __ballot_sync(syncMask, valid);
		int dest = __popc(validBits & ((1 << tI) - 1)) + nbTriangles;
		if (valid)
		{
			depthBuffer[dest] = d;
			tempCounts[dest] = ((loaded + tI) << 4) | nbContacts;
			nicBuffer[dest] = nicBuffer[loaded + tI];
		}
		nbTriangles += __popc(validBits);
	}
	return nbTriangles;
}
// Per-warp shared-memory scratch for correlate(): a cached copy of the pair
// record (volatile — written by a warp-cooperative int copy) plus the pair's index.
struct Scratch
{
	volatile ConvexMeshPair pair;
	int pairIndex;
};
// Warp-cooperative contact correlation for one convex-vs-trimesh pair: clusters
// the per-triangle narrowphase contacts into up to MULTIMANIFOLD_MAX_MANIFOLDS
// patches (one patch per representative normal, deepest-first) and writes the
// resulting persistent multi-manifold. Each warp (threadIdx.y) handles the pair
// at index threadIdx.y + CORRELATE_WARPS_PER_BLOCK * blockIdx.x.
__device__
void correlate(
	const ConvexMeshPair* PX_RESTRICT meshPairBuffer,
	ConvexTriNormalAndIndex ** PX_RESTRICT nicBufferPtr,
	const ConvexTriContacts ** PX_RESTRICT contactBufferPtr,
	PxReal ** PX_RESTRICT depthBufferPtr,
	PxU32* PX_RESTRICT tempCounts,
	Scratch* PX_RESTRICT s,
	PxgPersistentContactMultiManifold* PX_RESTRICT outBuffer,
	PxsContactManagerOutput* PX_RESTRICT cmOutputs,
	const PxU32 numPairs,
	const PxReal clusterBias,
	ConvexTriContact* PX_RESTRICT tempConvexTriContacts,
	PxU32* PX_RESTRICT pTempContactIndex
)
{
	// Per-warp staging for the gather phase (indexed by threadIdx.y = warp slot).
	__shared__ PxU32 triangleRunSums[CORRELATE_WARPS_PER_BLOCK][WARP_SIZE];
	__shared__ PxU32 startIndices[CORRELATE_WARPS_PER_BLOCK][WARP_SIZE];
	__shared__ PxU32 triIndices[CORRELATE_WARPS_PER_BLOCK][WARP_SIZE];
	PxU32* triRunSums = triangleRunSums[threadIdx.y];
	PxU32* triStartIndex = startIndices[threadIdx.y];
	PxU32* triInds = triIndices[threadIdx.y];
	const int pairIndex = threadIdx.y + CORRELATE_WARPS_PER_BLOCK * blockIdx.x;
	if (pairIndex >= numPairs)
		return;
	s->pairIndex = pairIndex;
	const int tI = threadIdx.x;
	// Warp-cooperative copy of the pair record into shared memory, one int per
	// lane — assumes sizeof(ConvexMeshPair) <= 32 * 4 bytes; TODO confirm.
	if (tI < sizeof(ConvexMeshPair) / 4)
		((int*)(&s->pair))[tI] = ((int*)(meshPairBuffer + pairIndex))[tI];
	__syncwarp();
	int start = s->pair.startIndex, count = s->pair.count;
	if (count == CONVEX_TRIMESH_CACHED)
	{
		// We've hit cache, nothing to do here
		return;
	}
	ConvexTriNormalAndIndex* PX_RESTRICT nicBuffer = *nicBufferPtr;
	const ConvexTriContacts* PX_RESTRICT contactBuffer = *contactBufferPtr;
	PxReal* PX_RESTRICT depthBuffer = *depthBufferPtr;
	// Compact away triangles without contacts; nbTriangles survivors remain at
	// the front of the pair's range.
	PxU32 nbTriangles = purgeEmptyTriangles(FULL_MASK, depthBuffer + start, nicBuffer + start, count, tempCounts + start);
	__syncwarp();
	if (nbTriangles == 0)
	{
		// NOTE(review): this early-out indexes outBuffer by pairIndex, while the
		// writes at the end of this function index it by s->pair.cmIndex —
		// confirm the two always coincide.
		outBuffer[pairIndex].mNbManifolds = 0;
		return;
	}
	//Now, we loop creating patches and contacts from the triangles that actually have contacts!
	PxU32 nbPatches = 0;
	PxU32 numContactsTotal = 0;
	PxU32 remainingTriangles = nbTriangles;
	for (; nbPatches < MULTIMANIFOLD_MAX_MANIFOLDS && remainingTriangles != 0; nbPatches++)
	{
		//We loop through all triangles, load contacts, compress etc...
		//(1) Find deepest triangle... (per-lane min, then warp-wide min reduction)
		int bestIndex = 0;
		PxReal bestDepth = PX_MAX_F32;
		for (PxU32 i = threadIdx.x; i < nbTriangles; i += 32)
		{
			if (i < nbTriangles)
			{
				if (tempCounts[start + i]) //Still has contacts!
				{
					PxReal d = depthBuffer[start + i];
					if (d < bestDepth)
					{
						bestDepth = d;
						bestIndex = start + i;
					}
				}
			}
		}
		// Reduce to the warp-wide deepest triangle and broadcast its index.
		int i0 = minIndex(bestDepth, FULL_MASK, bestDepth);
		bestIndex = __shfl_sync(FULL_MASK, bestIndex, i0);
		//Now we know which triangle defines this plane...
		PxVec3 bestNormal = nicBuffer[bestIndex].normal;
		PxU32 currentContactMask = 0;	// ballot of lanes currently holding a patch contact
		PxVec3 pA(0.f);
		PxReal separation = 0.f;
		PxU32 contactTriangleIndex = 0;
		bool hasContact = false;
		// (2) Sweep all triangles in warp-sized chunks, accepting those whose
		// normal is close enough to the patch normal, and gather their contacts.
		for (PxU32 i = 0; i < nbTriangles; i += 32)
		{
			PxVec3 normal(0.f);
			PxU32 triIndex = 0;
			PxU32 nbContacts = 0;
			PxU32 startIndex = 0;
			if ((i + threadIdx.x) < nbTriangles)
			{
				PxU32 index = tempCounts[start + i + threadIdx.x];
				//We read contacts from the temp contacts buffer because this will get cleared when the contacts are considered
				//for a patch...
				nbContacts = index & 7;
				startIndex = index >> 4;
				if (nbContacts)
				{
					ConvexTriNormalAndIndex nic = nicBuffer[start + i + threadIdx.x];
					normal = nic.normal;
					triIndex = ConvexTriNormalAndIndex::getTriangleIndex(nic.index);
				}
			}
			// Accept triangles whose normal is within the patch tolerance of the
			// representative normal (cleared triangles have a zero normal, so
			// dot == 0 and they are never re-accepted).
			bool accept = normal.dot(bestNormal) > PATCH_ACCEPTANCE_EPS;
			if(!accept)
				nbContacts = 0;
			PxU32 acceptMask = __ballot_sync(FULL_MASK, accept);
			remainingTriangles -= __popc(acceptMask);
			// Exclusive run-sums let the gather below map a flat contact index
			// back to its owning lane/triangle via binary search.
			PxU32 summedContacts = warpScan<AddOpPxU32, PxU32>(FULL_MASK, nbContacts);
			triRunSums[threadIdx.x] = summedContacts - nbContacts;
			triStartIndex[threadIdx.x] = startIndex;
			triInds[threadIdx.x] = triIndex;
			PxU32 totalContacts = __shfl_sync(FULL_MASK, summedContacts, 31);
			__syncwarp();
			// (3) Fill free lanes with contacts, reducing whenever we exceed the
			// submanifold capacity.
			for (PxU32 c = 0; c < totalContacts;)
			{
				//calculate my read index (where I'm offset from...)
				PxU32 readMask = ~currentContactMask;	// lanes free to take a new contact
				bool needsContact = readMask & (1 << threadIdx.x);
				PxU32 readIndex = c + warpScanExclusive(readMask, threadIdx.x);
				if (needsContact && readIndex < totalContacts)
				{
					// Map flat readIndex -> (lane, offset) via the run sums.
					PxU32 idx = binarySearch<PxU32>(triRunSums, 32u, readIndex);
					PxU32 offset = readIndex - triRunSums[idx];
					PxU32 startIdx = triStartIndex[idx] + start;
					const PxU32 contactStartIdx = contactBuffer[startIdx].index;
					float4 v = tempConvexTriContacts[contactStartIdx + offset].contact_sepW;
					pA = PxVec3(v.x, v.y, v.z);
					separation = v.w;
					contactTriangleIndex = triInds[idx];
					hasContact = true;
				}
				currentContactMask = __ballot_sync(FULL_MASK, hasContact); //Mask every thread that has a contact!
				c += __popc(readMask);
				PxU32 nbContacts = __popc(currentContactMask);
				if (nbContacts > SUBMANIFOLD_MAX_CONTACTS)
				{
					// Cull down to the best SUBMANIFOLD_MAX_CONTACTS contacts.
					currentContactMask = contactReduce<true, false, SUBMANIFOLD_MAX_CONTACTS, false>(pA, separation, bestNormal, currentContactMask, clusterBias);
				}
				hasContact = currentContactMask & (1 << threadIdx.x);
			}
			__syncwarp(); //triRunSums is read and written in the same loop - separate read and write with syncs
			if (accept)
			{
				tempCounts[start + i + threadIdx.x] = 0; //Clear this triangle - it is done!
			}
		}
		__syncwarp();
		PxU32 nbContacts = __popc(currentContactMask);
		//When we reach here, we have handled all the triangles, so we should output this patch...
		PxgContact * cp = outBuffer[s->pair.cmIndex].mContacts[nbPatches] + warpScanExclusive(currentContactMask, tI);
		if (threadIdx.x == 0)
			outBuffer[s->pair.cmIndex].mNbContacts[nbPatches] = nbContacts;
		numContactsTotal += nbContacts;
		if (currentContactMask & (1 << tI))
		{
			PxVec3 nor = ldS(s->pair.aToB).rotate(bestNormal);
			cp->normal = -nor;
			cp->pointA = pA;
			// TODO: PxTransform::transform is incredibly profligate with registers (i.e. the write-out phase uses
			// more regs than the contact culling phase). Better would be to stash the transform in matrix form
			// when not under register pressure (i.e. when initially loading it) then use a custom shmem
			// matrix multiply, which requires essentially no tmps
			PxVec3 pointB = ldS(s->pair.aToB).transform(pA) + nor * separation;
			cp->pointB = pointB;
			cp->penetration = separation;
			cp->triIndex = contactTriangleIndex;
			/*printf("cp normal(%f, %f, %f)\n", cp->normal.x, cp->normal.y, cp->normal.z);
			printf("cp pointA(%f, %f, %f)\n", pA.x, pA.y, pA.z);
			printf("cp pointB(%f, %f, %f)\n", pointB.x, pointB.y, pointB.z);*/
		}
	}
	if (nbPatches > 0)
	{
		// TODO avoroshilov: decide what to do with aligned transforms
		PxAlignedTransform aToB_aligned;// = s->pair.aToB;
		aToB_aligned.p.x = s->pair.aToB.p.x;
		aToB_aligned.p.y = s->pair.aToB.p.y;
		aToB_aligned.p.z = s->pair.aToB.p.z;
		aToB_aligned.q.q.x = s->pair.aToB.q.x;
		aToB_aligned.q.q.y = s->pair.aToB.q.y;
		aToB_aligned.q.q.z = s->pair.aToB.q.z;
		aToB_aligned.q.q.w = s->pair.aToB.q.w;
		PxAlignedTransform_WriteWarp(&outBuffer[s->pair.cmIndex].mRelativeTransform, aToB_aligned);
	}
	outBuffer[s->pair.cmIndex].mNbManifolds = nbPatches;
}
// Kernel wrapper around correlate(): one warp (threadIdx.y) per convex-trimesh
// pair, each with a private Scratch slot carved out of raw shared memory.
extern "C" __global__
void convexTrimeshCorrelate(
	const ConvexMeshPair * PX_RESTRICT pairs,
	PxReal ** PX_RESTRICT depthsPtr,
	ConvexTriNormalAndIndex ** PX_RESTRICT normalAndIndexPtr,
	const ConvexTriContacts ** PX_RESTRICT contactsPtr,
	PxgPersistentContactMultiManifold * PX_RESTRICT output,
	PxsContactManagerOutput * PX_RESTRICT cmOutputs,
	PxU32* PX_RESTRICT tempCountAndOffsetBuffer,
	const PxU32 numPairs,
	const PxReal clusterBias,
	ConvexTriContact* PX_RESTRICT tempConvexTriContacts,
	PxU32* pTempContactIndex
)
{
	__shared__ char scratchStorage[sizeof(Scratch) * CORRELATE_WARPS_PER_BLOCK];
	Scratch* warpScratch = reinterpret_cast<Scratch*>(scratchStorage) + threadIdx.y;
	correlate(pairs, normalAndIndexPtr, contactsPtr, depthsPtr,
		tempCountAndOffsetBuffer, warpScratch, output, cmOutputs,
		numPairs, clusterBias, tempConvexTriContacts, pTempContactIndex);
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,347 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "foundation/PxAssert.h"
#include "foundation/PxVec3.h"
#include "PxContact.h"
#include "PxsMaterialCore.h"
#include "PxsContactManagerState.h"
#include "PxgPersistentContactManifold.h"
#include "PxgConvexConvexShape.h"
#include "PxgContactManager.h"
#include "cudaNpCommon.h"
#include "PxsTransformCache.h"
#include "dataReadWriteHelper.cuh"
#include "materialCombiner.cuh"
#include "geometry/PxGeometry.h"
#include <assert.h>
using namespace physx;
// Empty host-side entry point — presumably referenced from CPU code so this
// translation unit's kernels get linked/registered; TODO confirm.
extern "C" __host__ void initNarrowphaseKernels5() {}
// Fills out one compressed contact-patch header: combines the two shapes'
// materials into friction/restitution/damping values and stores the patch
// normal, contact range and material bookkeeping. No contact points are
// written here — only the patch record at patchWriteIndex.
__device__ void writeCompressedContact(
	PxContactPatch* PX_RESTRICT patches, PxU32 patchWriteIndex, PxU32 contactWriteIndex,
	const PxU32 nbContacts,
	const PxVec3 normal,
	const PxU32 materialIndex0, const PxU32 materialIndex1,
	const PxsMaterialData* PX_RESTRICT materials
)
{
	// Combine the pair's materials into the values stored on the patch.
	PxU32 combinedFlags;
	PxReal combinedStaticFriction, combinedDynamicFriction, combinedRestitution, combinedDamping;
	combineMaterials(materials, static_cast<PxU16>(materialIndex0), static_cast<PxU16>(materialIndex1),
		combinedFlags, combinedStaticFriction, combinedDynamicFriction,
		combinedRestitution, combinedDamping);

	PxContactPatch& dst = patches[patchWriteIndex];

	// Geometry: patch normal plus the [startContactIndex, +nbContacts) range
	// into the contact stream.
	dst.normal = normal;
	dst.nbContacts = nbContacts;
	dst.startContactIndex = contactWriteIndex;

	//KS - we could probably compress this further into the header but the complexity might not be worth it
	dst.staticFriction = combinedStaticFriction;
	dst.dynamicFriction = combinedDynamicFriction;
	dst.restitution = combinedRestitution;
	dst.damping = combinedDamping;
	dst.materialIndex0 = materialIndex0;
	dst.materialIndex1 = materialIndex1;
	dst.materialFlags = static_cast<PxU8>(combinedFlags);

	// No mass modification for mesh contacts: all scales default to 1.
	dst.mMassModification.linear0 = 1.f;
	dst.mMassModification.linear1 = 1.f;
	dst.mMassModification.angular0 = 1.f;
	dst.mMassModification.angular1 = 1.f;

	dst.internalFlags = PxContactPatch::eHAS_FACE_INDICES;
}
// Averages a per-lane value over the first nbContacts lanes of a contact group
// via a __shfl_down_sync tree (radii 4, 2, 1, i.e. groups of up to 8 lanes).
// Assumes the group's contacts occupy consecutive lanes and nbContacts <= 8 —
// TODO confirm against PXG_SUBMANIFOLD_MAX_CONTACTS. Only the group's first
// lane receives the correct average; other lanes hold partial sums.
__device__ inline float getAvgContactValue(float val, PxU32 nbContacts)
{
	PxReal avgValue = val;
#pragma unroll 3
	for (PxU32 reductionRadius = 4; reductionRadius > 0; reductionRadius >>= 1)
	{
		PxReal tmp = __shfl_down_sync(FULL_MASK, avgValue, reductionRadius);
		// Only fold in lanes that actually hold a contact for this group.
		if (reductionRadius < nbContacts)
			avgValue += tmp;
	}
	return avgValue / PxReal(nbContacts);
}
// Returns the manifold's patch normal in world space. Despite the historical
// "average each normal" description, the current implementation takes the
// FIRST contact's normal (contacts[0]), rotates it by meshTransform and
// normalizes. Lanes beyond numContacts use a zero vector (getNormalized of a
// zero vector — result for those lanes is unused by the caller, presumably).
// manifoldIndex and threadIndexInWarp are currently unused.
__device__ static inline PxVec3 getWorldNormal(
	const PxgContact* contacts, const PxTransform& meshTransform,
	const PxU32 numContacts, const PxU32 manifoldIndex, const PxU32 threadIndexInWarp, const PxU32 threadIndexInManifold
)
{
	PxVec3 normal = (threadIndexInManifold >= numContacts) ? PxVec3(0, 0, 0) : contacts[0].normal;
	//transform into world space
	PxVec3 unNormalizedNormal = meshTransform.rotate(normal);
	return unNormalizedNormal.getNormalized();
}
//Each block has 8 warps and each warp deals with one pair.
//Max number of contacts in each PxgPersistentContactMultiManifold is 24 (maximum 4 single manifolds and maximum 6 contacts in each manifold).
//Lanes are partitioned into groups of PXG_SUBMANIFOLD_MAX_CONTACTS, one group
//per manifold; each manifold becomes one PxContactPatch in the patch stream.
extern "C" __global__
void convexTrimeshFinishContacts(
	const ConvexMeshPair * pairs,
	const PxsCachedTransform * transformCache,
	const PxgShape* PX_RESTRICT gpuShapes,
	const PxgContactManagerInput * cmInputs,
	PxsContactManagerOutput * cmOutputs,
	const PxgPersistentContactMultiManifold* cmMultiManifold,
	const PxU32 numPairs,
	const PxsMaterialData* PX_RESTRICT materials,
	PxU8* PX_RESTRICT contactStream,//contact and pen
	PxU8* PX_RESTRICT patchStream, // normal and friction
	PxU8* PX_RESTRICT forceAndIndiceStream, //triangle(face) index is appended after the force stream
	bool insertAveragePoint,
	PxgPatchAndContactCounters* PX_RESTRICT patchAndContactCounters,
	PxU32* touchChangeFlags,
	PxU32* patchChangeFlags,
	PxU8* startContactPatches,
	PxU8* startContactPoints,
	PxU8* startContactForces,
	PxU32 patchBytesLimit,
	PxU32 contactBytesLimit,
	PxU32 forceBytesLimit
)
{
	const PxU32 globalThreadIndex = threadIdx.x + blockIdx.x * blockDim.x;
	const PxU32 globalWarpIndex = globalThreadIndex / WARP_SIZE;
	if(globalWarpIndex >= numPairs)
		return;
	const PxgPersistentContactMultiManifold& multiManifold = cmMultiManifold[globalWarpIndex];
	PxU32 nbManifolds = multiManifold.mNbManifolds;
	const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE - 1);
	// Which manifold this lane serves, and its slot within that manifold.
	const PxU32 singleManifoldIndex = threadIndexInWarp / PXG_SUBMANIFOLD_MAX_CONTACTS;
	const PxU32 threadIndexInManifold = threadIndexInWarp % PXG_SUBMANIFOLD_MAX_CONTACTS;
	const PxU32 numContacts = (singleManifoldIndex >= nbManifolds) ? 0 : multiManifold.mNbContacts[singleManifoldIndex];
	const bool hasContacts = threadIndexInManifold < numContacts;
	PxU32 contactMask = __ballot_sync(FULL_MASK, (PxU32) hasContacts);
	PxU32 totalNumContacts = __popc(contactMask);
	const PxU32 writeIndex = warpScanExclusive(contactMask, threadIndexInWarp); //tells me how many threads preceding me had contacts (i.e. what my write index will be)
	//Each single manifold should be one patch
	PxU32 patchByteOffset = 0xFFFFFFFF;
	PxU32 contactByteOffset = 0xFFFFFFFF;
	PxU32 forceAndIndiceByteOffset = 0xFFFFFFFF;
	PxsContactManagerOutput* output = cmOutputs + globalWarpIndex;
	// allflagsStart packs (prevPatches | statusFlags | nbPatches | pad) — see the
	// merge() write-back below.
	PxU32 allflags = reinterpret_cast<PxU32*>(&output->allflagsStart)[0];
	PxU8 oldStatusFlags = u16Low(u32High(allflags));
	PxU8 statusFlags = oldStatusFlags;
	statusFlags &= (~PxsContactManagerStatusFlag::eTOUCH_KNOWN);
	if (totalNumContacts != 0)
		statusFlags |= PxsContactManagerStatusFlag::eHAS_TOUCH;
	else
		statusFlags |= PxsContactManagerStatusFlag::eHAS_NO_TOUCH;
	PxU8 prevPatches = u16High(u32Low(allflags)); //Get out the current number of patches to store as the previous frame's number of patches
	bool overflow = false;
	// Lane 0 reserves stream space and publishes the CM output record.
	if (threadIndexInWarp == 0 )
	{
		// NOTE(review): only lane 0's totalNumContacts includes the optional
		// average points; other lanes keep the pre-adjustment value, which the
		// faceIndex offset below is computed from — confirm behavior when
		// insertAveragePoint is true.
		PxU32 nbInsertAveragePoint = insertAveragePoint ? nbManifolds : 0;
		totalNumContacts = totalNumContacts + nbInsertAveragePoint;
		if (totalNumContacts)
		{
			patchByteOffset = atomicAdd(&(patchAndContactCounters->patchesBytes), sizeof(PxContactPatch)*nbManifolds);
			contactByteOffset = atomicAdd(&(patchAndContactCounters->contactsBytes), sizeof(PxContact) * totalNumContacts);
			forceAndIndiceByteOffset = atomicAdd(&(patchAndContactCounters->forceAndIndiceBytes), sizeof(PxU32) * totalNumContacts * 2);
			// On any stream overflow: flag the error, poison the offset and
			// report the pair as having no touch.
			if ((patchByteOffset + sizeof(PxContactPatch) * nbManifolds) > patchBytesLimit)
			{
				patchAndContactCounters->setOverflowError(PxgPatchAndContactCounters::PATCH_BUFFER_OVERFLOW);
				patchByteOffset = 0xFFFFFFFF;
				overflow = true;
			}
			else if ((contactByteOffset + sizeof(PxContact) * totalNumContacts) > contactBytesLimit)
			{
				patchAndContactCounters->setOverflowError(PxgPatchAndContactCounters::CONTACT_BUFFER_OVERFLOW);
				contactByteOffset = 0xFFFFFFFF;
				overflow = true;
			}
			else if ((forceAndIndiceByteOffset + sizeof(PxU32) * totalNumContacts * 2) > forceBytesLimit)
			{
				patchAndContactCounters->setOverflowError(PxgPatchAndContactCounters::FORCE_BUFFER_OVERFLOW);
				forceAndIndiceByteOffset = 0xFFFFFFFF;
				overflow = true;
			}
			if (overflow)
			{
				nbManifolds = totalNumContacts = 0;
				statusFlags &= (~PxsContactManagerStatusFlag::eTOUCH_KNOWN);
				statusFlags |= PxsContactManagerStatusFlag::eHAS_NO_TOUCH;
			}
			//printf("threadIdex0 nbManifolds %i\n", nbManifolds);
		}
		// Touch/patch change flags drive CPU-side event reporting.
		bool previouslyHadTouch = oldStatusFlags & PxsContactManagerStatusFlag::eHAS_TOUCH;
		bool prevTouchKnown = oldStatusFlags & PxsContactManagerStatusFlag::eTOUCH_KNOWN;
		bool currentlyHasTouch = nbManifolds != 0;
		const bool change = (previouslyHadTouch ^ currentlyHasTouch) || (!prevTouchKnown);
		touchChangeFlags[globalWarpIndex] = change;
		patchChangeFlags[globalWarpIndex] = (prevPatches != nbManifolds);
		assert(totalNumContacts < 100);
		reinterpret_cast<PxU32*>(&output->allflagsStart)[0] = merge(merge(prevPatches, statusFlags),
			merge(nbManifolds, PxU8(0)));
		output->nbContacts = totalNumContacts;
		if (!overflow)
		{
			output->contactForces = reinterpret_cast<PxReal*>(startContactForces + forceAndIndiceByteOffset);
			output->contactPatches = startContactPatches + patchByteOffset;
			output->contactPoints = startContactPoints + contactByteOffset;
		}
		else
		{
			output->contactForces = 0;
			output->contactPatches = 0;
			output->contactPoints = 0;
		}
	}
	// overflow is only set on lane 0; __any_sync broadcasts it to the warp.
	if(__any_sync(FULL_MASK, overflow) || nbManifolds == 0)
		return;
	//const uint4 npWorkItem = *(reinterpret_cast<const uint4*>(cmInputs) + globalWarpIndex);
	PxgContactManagerInput npWorkItem;
	PxgContactManagerInput_ReadWarp(npWorkItem, cmInputs, globalWarpIndex);
	PxU32 shapeRef0 = npWorkItem.shapeRef0;
	PxU32 shapeRef1 = npWorkItem.shapeRef1;
	PxU32 transformCacheRef0 = npWorkItem.transformCacheRef0;
	PxU32 transformCacheRef1 = npWorkItem.transformCacheRef1;
	// Canonicalize so that ref0 is the convex/sphere/capsule and ref1 the mesh.
	bool flip = gpuShapes[shapeRef0].type == PxGeometryType::eTRIANGLEMESH;
	if (flip)
	{
		PxSwap(shapeRef0, shapeRef1);
		PxSwap(transformCacheRef0, transformCacheRef1);
	}
	PxgShape shape0;
	PxgShape_ReadWarp(shape0, gpuShapes + shapeRef0);
	const uint2 materialIndices = pairs[globalWarpIndex].materialIndices;
	// Broadcast lane 0's reserved byte offsets to the whole warp.
	patchByteOffset = __shfl_sync(FULL_MASK, (PxI32) patchByteOffset, 0);
	contactByteOffset = __shfl_sync(FULL_MASK, (PxI32) contactByteOffset, 0);
	forceAndIndiceByteOffset = __shfl_sync(FULL_MASK, (PxI32)forceAndIndiceByteOffset, 0);
	PxContactPatch* patches = reinterpret_cast<PxContactPatch*>(patchStream + patchByteOffset);
	float4* contacts = reinterpret_cast<float4*>(contactStream + contactByteOffset);
	// Face indices are stored after the first totalNumContacts force entries.
	PxU32* faceIndex = reinterpret_cast<PxU32*>(forceAndIndiceStream + forceAndIndiceByteOffset + totalNumContacts*sizeof(PxU32));
	PxsCachedTransform trimeshTransformCached;
	PxsCachedTransform_ReadWarp(trimeshTransformCached, transformCache + transformCacheRef1); //w is transformCacheRef1
	PxsCachedTransform sphereTransformCached;
	PxsCachedTransform_ReadWarp(sphereTransformCached, transformCache + transformCacheRef0); //w is transformCacheRef0
	//calculate patch normal and transform the normal into world space, needs to be optimized
	const PxVec3 worldNormal = getWorldNormal(&multiManifold.mContacts[singleManifoldIndex][0], trimeshTransformCached.transform, numContacts, singleManifoldIndex, threadIndexInWarp, threadIndexInManifold);
	if(hasContacts)
	{
		const PxgContact& point = multiManifold.mContacts[singleManifoldIndex][threadIndexInManifold];
		// The manifold's first lane writes the patch header.
		if(threadIndexInManifold == 0)
			writeCompressedContact(patches, singleManifoldIndex, writeIndex, numContacts, flip ? -worldNormal : worldNormal, materialIndices.x, materialIndices.y, materials);
		assert(point.pointB.isFinite());
		PxReal pen = point.penetration;
		PxVec3 worldPt;
		// For sphere/capsule, the stored contact is on the medial axis/center;
		// push it out along the normal by the radius and adjust penetration.
		if (shape0.type == PxGeometryType::eSPHERE)
		{
			const PxReal radius = shape0.scale.scale.y;
			pen = pen - radius;
			worldPt = sphereTransformCached.transform.p - worldNormal * radius;
		}
		else if (shape0.type == PxGeometryType::eCAPSULE)
		{
			const PxReal radius = shape0.scale.scale.y;
			pen = pen - radius;
			worldPt = sphereTransformCached.transform.transform(point.pointA) - worldNormal * radius;
		}
		else
		{
			worldPt = trimeshTransformCached.transform.transform(point.pointB);
		}
		//contacts[writeIndex] = make_float4(point.pointB.x, point.pointB.y, point.pointB.z, -point.penetration);
		if (contactByteOffset != 0xFFFFFFFF)
			contacts[writeIndex] = make_float4(worldPt.x, worldPt.y, worldPt.z, pen);
		/*if (threadIndexInWarp == 0)
		{
		printf("triIndex %i worldPt(%f, %f, %f)\n", point.triIndex, worldPt.x, worldPt.y, worldPt.z);
		printf("triIndex %i worldNormal(%f, %f, %f)\n", point.triIndex, worldNormal.x, worldNormal.y, worldNormal.z);
		printf("triIndex %i pen %f radius %f\n", point.triIndex, pen, shape0.scale.scale.x);
		PxVec3 p = sphereTransformCached.transform.p;
		PxQuat q = sphereTransformCached.transform.q;
		printf("sphereTransform p(%f, %f, %f), q(%f, %f, %f, %f)\n", p.x, p.y, p.z, q.x, q.y, q.z, q.w);
		}*/
		if(forceAndIndiceByteOffset != 0xFFFFFFFF)
			faceIndex[writeIndex] = point.triIndex;
	}
}

View File

@@ -0,0 +1,649 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "cuda.h"
#include "convexNpCommon.h"
#include "triangle.cuh"
#include "dataReadWriteHelper.cuh"
#include "RadixSort.cuh"
#include "reduction.cuh"
#include "geometry/PxGeometry.h"
#include "heightfieldUtil.cuh"
#include "convexTriangle.cuh"
#include "assert.h"
#include "stdio.h"
#include "PxgNpKernelIndices.h"
using namespace physx;
// Empty host-side entry point — presumably referenced from CPU code so this
// translation unit's kernels get linked/registered; TODO confirm.
extern "C" __host__ void initNarrowphaseKernels8() {}
////Based on edge edge checks
//__device__ void convexTrimeshPostProcess(
// ConvexMeshPair* PX_RESTRICT cvxTrimeshPair, // per CM
// ConvexTriNormalAndIndex** PX_RESTRICT cvxTriNIPtr, // per cvx-tri
// ConvexTriEdgeBuffer** PX_RESTRICT cvxTriEdgeBufferPtr, // per cvx-tri
// PxU32** PX_RESTRICT cvxTriSecondPassPairsPtr, // per cvx-tri
// const uint4* PX_RESTRICT pairsGPU,
// PxU32* PX_RESTRICT nbSecondPassPairs,
// const PxU32 globalWarpIndex)
//{
// PxU32* cvxTriSecondPassPairs = *cvxTriSecondPassPairsPtr;
// ConvexTriNormalAndIndex* cvxTriNI = *cvxTriNIPtr;
//
// const PxU32 pairIdx = cvxTriSecondPassPairs[globalWarpIndex];
//
// uint4 curPair = pairsGPU[pairIdx];
//
// PxU32 cmIdx = curPair.x;
// PxU32 testOffset = curPair.z;
//
// ConvexMeshPair& pair = cvxTrimeshPair[cmIdx];
// const PxU32 nbPairsPerCM = pair.count;
// const PxU32 pairStartIndex = pair.startIndex;
// PxU32 convexTriPairOffset = pairStartIndex + testOffset;
//
// ConvexTriNormalAndIndex& cvxni = cvxTriNI[convexTriPairOffset];
// //get the number of contacts
// const PxU32 nbContacts = cvxni.index >> 28;
//
// if (nbContacts == 0)
// return;
//
// ConvexTriEdgeBuffer* cvxTriEdgeBuffer = *cvxTriEdgeBufferPtr;
//
// ConvexTriEdgeBuffer& cvxTriEdge = cvxTriEdgeBuffer[convexTriPairOffset];
//
// //load the verts into shared memory
// __shared__ int sVertsIds[2][3];
//
// if (threadIdx.x < 3)
// {
// sVertsIds[threadIdx.y][threadIdx.x] = cvxTriEdge.triVertexIds[threadIdx.x];
// }
//
// __threadfence_block();
//
// bool foundTriangle = false;
// //each triangle has 3 edges, there are 9 combinations. Each thread work with one edge vs another edge
// for (PxU32 i = 0; i < nbPairsPerCM * 9; i += WARP_SIZE)
// {
// foundTriangle = false;
// const PxU32 index = i + threadIdx.x;
//
// if (index < nbPairsPerCM * 9)
// {
//
// const PxU32 offset = index / 9;
//
// //same triangle, we don't need to compare the same triangle
// if (offset == testOffset)
// continue;
//
// const PxU32 vStartIndex0 = index % 3;
// const PxU32 vEndIndex0 = (vStartIndex0 + 1) % 3;
//
// const PxU32 vertId00 = sVertsIds[threadIdx.y][vStartIndex0];
// const PxU32 vertId01 = sVertsIds[threadIdx.y][vEndIndex0];
//
// const PxU32 vOffset = index / 3;
//
// const PxU32 vStartIndex1 = (vEndIndex0 + vOffset) % 3;
// const PxU32 vEndIndex1 = (vStartIndex1 + 1) % 3;
//
// const PxU32 neighbourOffset = pairStartIndex + offset;
//
// ConvexTriNormalAndIndex& nCvxni = cvxTriNI[neighbourOffset];
// //get the number of contacts
// const PxU32 nbGeneratedContacts = nCvxni.index >> 28;
//
// if (nbGeneratedContacts > 0)
// {
// const PxU32 vertId10 = cvxTriEdgeBuffer[neighbourOffset].triVertexIds[vStartIndex1];
// const PxU32 vertId11 = cvxTriEdgeBuffer[neighbourOffset].triVertexIds[vEndIndex1];
//
// if ((vertId00 == vertId10 || vertId00 == vertId11) &&
// (vertId01 == vertId10 || vertId01 == vertId11))
// {
// foundTriangle = true;
// }
// }
// }
//
// if (__any(foundTriangle))
// {
// break;
// }
// }
//
// if (__any(foundTriangle))
// {
// //we don't need to generate contacts for this pair. However, we have already generated contacts in
// //the convexTriangleContactGen kernel. Therefore, we just need to clear the number of contacts in
// //the corresponding ConvexTriNormalAndIndex
// cvxni.index = 0 << 28;
// }
//
//}
// Returns true if the given triangle index, tagged with the 0x80000000 search
// bit, is present in the sorted per-contact-manager triangle index list.
__device__ bool findTriangle(const PxU32* PX_RESTRICT orderedTriangleIndices, const PxU32 targetIndexWithMask, PxU32 count)
{
	const PxU32 tagged = targetIndexWithMask | 0x80000000;
	const PxU32 pos = binarySearch(orderedTriangleIndices, count, tagged);
	if (pos >= count)
		return false;
	return orderedTriangleIndices[pos] == tagged;
}
// Returns true if the given triangle index, tagged with the face-contact flag,
// is present in the sorted per-contact-manager triangle index list.
__device__ bool findFaceContactTriangle(const PxU32* PX_RESTRICT orderedTriangleIndices, const PxU32 targetIndexWithMask, PxU32 count)
{
	const PxU32 tagged = targetIndexWithMask | TriangleSearchFlags::eFACE_CONTACT;
	const PxU32 pos = binarySearch(orderedTriangleIndices, count, tagged);
	if (pos >= count)
		return false;
	return orderedTriangleIndices[pos] == tagged;
}
// Returns true if the given triangle index, tagged with the edge-contact flag,
// is present in the sorted per-contact-manager triangle index list.
__device__ bool findEdgeContactTriangle(const PxU32* PX_RESTRICT orderedTriangleIndices, const PxU32 targetIndexWithMask, PxU32 count)
{
	const PxU32 tagged = targetIndexWithMask | TriangleSearchFlags::eEDGE_CONTACT;
	const PxU32 pos = binarySearch(orderedTriangleIndices, count, tagged);
	if (pos >= count)
		return false;
	return orderedTriangleIndices[pos] == tagged;
}
//Based on neighbour face checks.
//
// One thread processes one entry of the second-pass convex-vs-triangle pair
// list. Each entry packs a pair index (low bits) plus a feature mask (high
// bits) describing which triangle feature (vertex or edge) produced the
// contact in the first pass. The pass discards edge/vertex contacts that are
// redundant because a neighbouring triangle sharing that feature already
// generated a contact; discarded pairs get their contact count cleared in the
// corresponding ConvexTriNormalAndIndex.
__device__ void convexTrimeshPostProcessCore(
	ConvexMeshPair* PX_RESTRICT cvxTrimeshPair, // per CM
	ConvexTriNormalAndIndex** PX_RESTRICT cvxTriNIPtr, // per cvx-tri
	ConvexTriIntermediateData** PX_RESTRICT cvxTriIntermDataPtr, // per cvx-tri
	const PxU32** PX_RESTRICT orderedCvxTriIntermDataPtr, // per cvx-tri
	PxU32** PX_RESTRICT cvxTriSecondPassPairsPtr, // per cvx-tri
	PxgShape* PX_RESTRICT gpuShapes,
	const uint4* PX_RESTRICT pairsGPU,
	PxU32* PX_RESTRICT nbSecondPassPairs,
	const PxU32 globalThreadIndex)
{
	PxU32* cvxTriSecondPassPairs = *cvxTriSecondPassPairsPtr;
	ConvexTriNormalAndIndex* cvxTriNI = *cvxTriNIPtr;
	// Low 29 bits: index into pairsGPU. High bits: feature mask
	// (ConvexTriIntermediateData::eMASK).
	const PxU32 pairIdxPlusMask = cvxTriSecondPassPairs[globalThreadIndex];
	const PxU32 pairIdx = pairIdxPlusMask & 0x1fffffff;
	uint4 curPair = pairsGPU[pairIdx];
	PxU32 cmIdx = curPair.x;		// contact manager index
	PxU32 triangleIdx = curPair.y;	// triangle index within the mesh
	PxU32 testOffset = curPair.z;	// offset of this cvx-tri test within the CM's range
	PxU32 shapeID = curPair.w;		// triangle-mesh shape index
	ConvexMeshPair& pair = cvxTrimeshPair[cmIdx];
	const PxU32 nbPairsPerCM = pair.count;
	assert(nbPairsPerCM != 0);
	const PxU32 pairStartIndex = pair.startIndex;
	const PxU32 orderedPairStartIndex = pair.roundedStartIndex;
	PxU32 convexTriPairOffset = pairStartIndex + testOffset;
	ConvexTriNormalAndIndex& cvxni = cvxTriNI[convexTriPairOffset];
	//get the number of contacts
	const PxU32 nbContacts = ConvexTriNormalAndIndex::getNbContacts(cvxni.index);
	if (nbContacts == 0)
		return;
	PxgShape& trimeshShape = gpuShapes[shapeID];
	assert(trimeshShape.type == PxGeometryType::eTRIANGLEMESH);
	// Sorted, flag-tagged triangle index list for this CM (produced by the
	// sortTriangleIndices kernel); searched via find*ContactTriangle below.
	const PxU32* orderedTriIndices = (*orderedCvxTriIntermDataPtr) + orderedPairStartIndex;
	// Walk the cooked GPU triangle-mesh blob. Layout (in order): uint4 header,
	// BV32 nodes (header.w of them), vertices, triangle vertex indices,
	// triangle adjacency, face remap, accumulated vertex->triangle reference
	// offsets (numVerts + 1), triangle reference list.
	const PxU8 * trimeshGeomPtr = reinterpret_cast<const PxU8 *>(trimeshShape.hullOrMeshPtr);
	const uint4 trimesh_nbVerts_nbTris_nbAdjVertsTotal = *reinterpret_cast<const uint4 *>(trimeshGeomPtr);
	trimeshGeomPtr += sizeof(uint4);
	trimeshGeomPtr += sizeof(const Gu::BV32DataPacked)* trimesh_nbVerts_nbTris_nbAdjVertsTotal.w;
	const PxU32 numVerts = trimesh_nbVerts_nbTris_nbAdjVertsTotal.x;
	const PxU32 numTris = trimesh_nbVerts_nbTris_nbAdjVertsTotal.y;
	//const float4 * trimeshVerts = reinterpret_cast<const float4 *>(trimeshGeomPtr);
	trimeshGeomPtr += sizeof(float4)* numVerts;
	const uint4 * trimeshTriIndices = reinterpret_cast<const uint4 *>(trimeshGeomPtr);
	trimeshGeomPtr += sizeof(uint4)* numTris;
	const uint4* PX_RESTRICT trimeshTriAdjacencies = reinterpret_cast<const uint4 *>(trimeshGeomPtr);
	trimeshGeomPtr += sizeof(uint4)* numTris;
	trimeshGeomPtr += sizeof(PxU32) * numTris; //mGRB_faceRemap
	const PxU32* accumulatedTriangleRefs = reinterpret_cast<const PxU32*>(trimeshGeomPtr);
	trimeshGeomPtr += sizeof(PxU32) * (numVerts + 1);
	const PxU32* triangleReferences = reinterpret_cast<const PxU32*>(trimeshGeomPtr);
	//ConvexTriIntermediateData* cvxTriIntermData = *cvxTriIntermDataPtr;
	//ConvexTriIntermediateData& cvxTriInterm = cvxTriIntermData[convexTriPairOffset];
	uint4 adjTriangles = trimeshTriAdjacencies[triangleIdx];
	uint4 indices = trimeshTriIndices[triangleIdx];
	// A negative-determinant scale mirrors the triangle winding: swap vertices
	// y/z and the matching adjacency slots so the feature masks still line up.
	const bool flipNormal = trimeshShape.scale.hasNegativeDeterminant();
	if(flipNormal)
	{
		PxSwap(adjTriangles.x, adjTriangles.z);
		PxSwap(indices.y, indices.z);
	}
	PxU32 mask = pairIdxPlusMask & PxU32(ConvexTriIntermediateData::eMASK);
	bool needsProcessing = false;
	PxU32 triangleIndex = 0xFFFFFFFF;	// adjacent triangle across the contact edge (edge masks)
	PxU32 vertexIdx = 0xFFFFFFFF;		// mesh vertex index of the contact vertex (vertex masks)
	uint2 adjTriangles2;				// triangles across the two edges that share the contact vertex
	if (mask == PxU32(ConvexTriIntermediateData::eV0))
	{
		vertexIdx = indices.x;
		adjTriangles2 = make_uint2(adjTriangles.x, adjTriangles.z);
		// printf("eV0\n");
	}
	else if (mask == PxU32(ConvexTriIntermediateData::eV1))
	{
		vertexIdx = indices.y;
		adjTriangles2 = make_uint2(adjTriangles.x, adjTriangles.y);
		// printf("eV1\n");
	}
	else if (mask == PxU32(ConvexTriIntermediateData::eV2))
	{
		vertexIdx = indices.z;
		adjTriangles2 = make_uint2(adjTriangles.y, adjTriangles.z);
		// printf("eV2\n");
	}
	else if (mask == PxU32(ConvexTriIntermediateData::eE01))
	{
		triangleIndex = adjTriangles.x;
		// printf("eE01\n");
		// No neighbour across this edge -> keep the contact.
		if(triangleIndex == 0xFFFFFFFF)
			needsProcessing = true;
	}
	else if (mask == PxU32(ConvexTriIntermediateData::eE12))
	{
		triangleIndex = adjTriangles.y;
		// printf("eE12\n");
		if(triangleIndex == 0xFFFFFFFF)
			needsProcessing = true;
	}
	else if (mask == PxU32(ConvexTriIntermediateData::eE02))
	{
		triangleIndex = adjTriangles.z;
		// printf("eE02\n");
		if(triangleIndex == 0xFFFFFFFF)
			needsProcessing = true;
	}
	//printf("triangleIdx: %d; mask: %x; triangleIndex: %d; vertexIdx: %d\n", triangleIdx, mask, triangleIndex, vertexIdx);
	if (triangleIndex != 0xFFFFFFFF)
	{
		// Edge contact: keep it if the shared edge is a boundary edge, or if the
		// neighbouring triangle did not itself produce a face contact.
		// printf("CHECKPOINT 0\n");
		needsProcessing = isBoundaryEdge(triangleIndex) || !findFaceContactTriangle(orderedTriIndices, triangleIndex, nbPairsPerCM);
		//if (!needsProcessing)
		//	printf("triangleIdx: %d; triangleIndex: %d\n", triangleIdx, triangleIndex);
	}
	else if (vertexIdx != 0xFFFFFFFF)
	{
		// printf("CHECKPOINT 1\n");
		//First, let's see if either of the edges sharing this vertex are active. If they are, we keep the contacts, otherwise
		//we look for a shared triangle...
		needsProcessing = true;
		if(!(isBoundaryEdge(adjTriangles2.x) && isBoundaryEdge(adjTriangles2.y)))
		{
			// Scan every triangle referencing this vertex; if any of them already
			// produced a face or edge contact for this CM, this vertex contact is
			// redundant and gets dropped.
			PxU32 startIdx = accumulatedTriangleRefs[vertexIdx];
			PxU32 endIdx = accumulatedTriangleRefs[vertexIdx + 1];
			for (PxU32 idx = startIdx; idx < endIdx; ++idx)
			{
				PxU32 triIdx = triangleReferences[idx];
				if (findFaceContactTriangle(orderedTriIndices, triIdx, nbPairsPerCM) ||
					findEdgeContactTriangle(orderedTriIndices, triIdx, nbPairsPerCM))
				{
					//printf("triangleIdx: %d; vertexIdx: %d\n", triangleIdx, vertexIdx);
					needsProcessing = false;
					break;
				}
			}
		}
	}
	// printf("triangleIndex %d\n", triangleIndex);
	// printf("vertexIdx %d\n", vertexIdx);
	// printf("needsProcessing %d\n", needsProcessing);
	if(!needsProcessing)
	{
		//we don't need to generate contacts for this pair. However, we have already generated contacts in
		//the convexTriangleContactGen kernel. Therefore, we just need to clear the number of contacts in
		//the corresponding ConvexTriNormalAndIndex
		// NOTE(review): this keeps only the low 24 bits of cvxni.index — confirm
		// the contact count and all flag bits live in bits 24..31.
		cvxni.index = cvxni.index & 0x0ffffff;
	}
}
extern "C" __global__
//__launch_bounds__(NP_TRIMESH_WARPS_PER_BLOCK * WARP_SIZE, 32 / NP_TRIMESH_WARPS_PER_BLOCK)
void convexTrimeshPostProcess(
ConvexMeshPair* PX_RESTRICT cvxTrimeshPair, // per CM
ConvexTriNormalAndIndex** PX_RESTRICT cvxTriNIPtr, // per cvx-tri
ConvexTriIntermediateData** PX_RESTRICT cvxTriIntermPtr, // per cvx-tri
const PxU32** PX_RESTRICT orderedCvxTriIntermPtr, // per cvx-tri
PxU32** PX_RESTRICT cvxTriSecondPassPairsPtr, // per cvx-tri
PxU8* PX_RESTRICT stackPtr,
PxU32* PX_RESTRICT nbSecondPassPairs,
PxgShape* PX_RESTRICT shapes)
{
const PxU32 nbTpProcess = *nbSecondPassPairs;
__shared__ uint4* sPairsGPU;
if (threadIdx.x == 0 && threadIdx.y == 0)
{
sPairsGPU = reinterpret_cast<uint4 *>(stackPtr);
}
__syncthreads();
for (PxU32 globalThreadIdx = blockIdx.x * blockDim.x + threadIdx.x; globalThreadIdx < nbTpProcess; globalThreadIdx += gridDim.x * blockDim.x)
{
convexTrimeshPostProcessCore(
cvxTrimeshPair,
cvxTriNIPtr,
cvxTriIntermPtr,
orderedCvxTriIntermPtr,
cvxTriSecondPassPairsPtr,
shapes,
sPairsGPU,
nbSecondPassPairs,
globalThreadIdx);
}
}
//Based on neighbour face checks.
//
// Heightfield analogue of convexTrimeshPostProcessCore: one thread per
// second-pass convex-vs-heightfield-triangle pair. Edge/vertex contacts that
// are redundant (a neighbouring heightfield triangle sharing the feature is
// already in the contact set) get their contact count cleared.
__device__ void convexHeightfieldPostProcessCore(
	ConvexMeshPair* PX_RESTRICT cvxTrimeshPair, // per CM
	ConvexTriNormalAndIndex** PX_RESTRICT cvxTriNIPtr, // per cvx-tri
	ConvexTriIntermediateData** PX_RESTRICT cvxTriIntermDataPtr, // per cvx-tri
	const PxU32** PX_RESTRICT orderedCvxTriIntermDataPtr, // per cvx-tri
	PxU32** PX_RESTRICT cvxTriSecondPassPairsPtr, // per cvx-tri
	PxgShape* PX_RESTRICT gpuShapes,
	const uint4* PX_RESTRICT pairsGPU,
	PxU32* PX_RESTRICT nbSecondPassPairs,
	const PxU32 globalThreadIndex)
{
	PxU32* cvxTriSecondPassPairs = *cvxTriSecondPassPairsPtr;
	ConvexTriNormalAndIndex* cvxTriNI = *cvxTriNIPtr;
	// Low 29 bits: index into pairsGPU. High bits: feature mask.
	const PxU32 pairIdxPlusMask = cvxTriSecondPassPairs[globalThreadIndex];
	const PxU32 pairIdx = pairIdxPlusMask & 0x1fffffff;
	uint4 curPair = pairsGPU[pairIdx];
	PxU32 cmIdx = curPair.x;		// contact manager index
	PxU32 triangleIdx = curPair.y;	// heightfield triangle index
	PxU32 testOffset = curPair.z;	// offset of this test within the CM's range
	PxU32 shapeID = curPair.w;		// heightfield shape index
	ConvexMeshPair& pair = cvxTrimeshPair[cmIdx];
	const PxU32 nbPairsPerCM = pair.count;
	const PxU32 pairStartIndex = pair.startIndex;
	const PxU32 orderedPairStartIndex = pair.roundedStartIndex;
	PxU32 convexTriPairOffset = pairStartIndex + testOffset;
	ConvexTriNormalAndIndex& cvxni = cvxTriNI[convexTriPairOffset];
	//get the number of contacts
	const PxU32 nbContacts = ConvexTriNormalAndIndex::getNbContacts(cvxni.index);
	if (nbContacts == 0)
		return;
	PxgShape& heightFieldShape = gpuShapes[shapeID];
	PxVec3 scale = heightFieldShape.scale.scale;
	const PxReal rowScale = scale.x;
	const PxReal columnScale = scale.z;
	PxVec3 handedness(1.0f);	// Vector to invert normal coordinates according to the heightfield scales
	bool wrongHanded = false;
	// Each negative axis scale mirrors the heightfield; an odd number of
	// mirrors flips the triangle winding (wrongHanded).
	if (columnScale < 0.f)
	{
		wrongHanded = !wrongHanded;
		handedness.z = -1.0f;
	}
	if (rowScale < 0.f)
	{
		wrongHanded = !wrongHanded;
		handedness.x = -1.0f;
	}
	// Sorted, tagged triangle index list for this CM (see sortTriangleIndices).
	const PxU32* orderedTriIndices = (*orderedCvxTriIntermDataPtr) + orderedPairStartIndex;
	// Heightfield GPU blob layout: [nbRows, nbCols, samples...].
	PxU32* heightfieldData = reinterpret_cast<PxU32*>(heightFieldShape.hullOrMeshPtr);
	const PxU32 nbRows = heightfieldData[0];
	const PxU32 nbCols = heightfieldData[1];
	PxHeightFieldSample* samples = reinterpret_cast<PxHeightFieldSample*>(&heightfieldData[2]);
	//ConvexTriIntermediateData* cvxTriIntermData = *cvxTriIntermDataPtr;
	//ConvexTriIntermediateData& cvxTriInterm = cvxTriIntermData[convexTriPairOffset];
	PxU32 tVertexIndices[3];
	PxU32 adjacencyIndices[3];
	// For wrong-handed scales, swap vertex slots 1/2 and adjacency slots 0/2 so
	// the feature masks computed in the first pass still match.
	getTriangleVertexIndices(triangleIdx, tVertexIndices[0], tVertexIndices[1 + wrongHanded], tVertexIndices[2 - wrongHanded], nbCols, samples);
	getTriangleAdjacencyIndices(triangleIdx, adjacencyIndices[wrongHanded ? 2 : 0], adjacencyIndices[1], adjacencyIndices[wrongHanded ? 0 : 2], nbRows, nbCols, samples);
	bool needsProcessing = false;
	PxU32 triangleIndex = 0xFFFFFFFF;	// adjacent triangle across the contact edge (edge masks)
	PxU32 vertexIdx = 0xFFFFFFFF;		// heightfield vertex index (vertex masks)
	PxU32 mask = pairIdxPlusMask & PxU32(ConvexTriIntermediateData::eMASK);
	if (mask == PxU32(ConvexTriIntermediateData::eV0))
	{
		vertexIdx = tVertexIndices[0];
	}
	else if (mask == PxU32(ConvexTriIntermediateData::eV1))
	{
		vertexIdx = tVertexIndices[1];
	}
	else if (mask == PxU32(ConvexTriIntermediateData::eV2))
	{
		vertexIdx = tVertexIndices[2];
	}
	else if (mask == PxU32(ConvexTriIntermediateData::eE01))
	{
		triangleIndex = PxU32(adjacencyIndices[0]);
	}
	else if (mask == PxU32(ConvexTriIntermediateData::eE12))
	{
		triangleIndex = adjacencyIndices[1];
	}
	else if (mask == PxU32(ConvexTriIntermediateData::eE02))
	{
		triangleIndex = adjacencyIndices[2];
	}
	if (triangleIndex != 0xFFFFFFFF)
	{
		// Edge contact: keep it only if the shared edge is active and the
		// neighbouring triangle is not already in the contact set.
		needsProcessing = isActiveEdge(triangleIndex) && !findTriangle(orderedTriIndices, triangleIndex, nbPairsPerCM);
	}
	else if (vertexIdx != 0xFFFFFFFF)
	{
		// Vertex contact: scan the (clamped) 3x3 cell neighbourhood around this
		// triangle's cell; if any triangle there shares the vertex and is already
		// in the contact set, this vertex contact is redundant.
		PxU32 cellIdx = triangleIdx / 2;	// two triangles per heightfield cell
		PxU32 column = cellIdx % nbCols;
		PxU32 row = cellIdx / nbCols;
		PxU32 startRow = PxMax(row, 1u) - 1;
		PxU32 endRow = PxMin(row, nbRows - 2) + 1;
		PxU32 startCol = PxMax(column, 1u) - 1;
		PxU32 endCol = PxMin(column, nbCols - 2) + 1;
		needsProcessing = true;
		for (PxU32 r = startRow; r <= endRow && needsProcessing; r++)
		{
			for (PxU32 c = startCol; c <= endCol; ++c)
			{
				// NOTE: intentionally shadows the outer cellIdx.
				PxU32 cellIdx = r * nbCols + c;
				PxU32 triIdx = cellIdx * 2;	// first (lower) triangle of the cell
				getTriangleVertexIndices(triIdx, tVertexIndices[0], tVertexIndices[1 + wrongHanded], tVertexIndices[2 - wrongHanded], nbCols, samples);
				if (tVertexIndices[0] == vertexIdx || tVertexIndices[1] == vertexIdx || tVertexIndices[2] == vertexIdx)
				{
					if (findTriangle(orderedTriIndices, triIdx, nbPairsPerCM))
					{
						needsProcessing = false;
						break;
					}
				}
				triIdx = triIdx + 1;	// second (upper) triangle of the cell
				getTriangleVertexIndices(triIdx, tVertexIndices[0], tVertexIndices[1 + wrongHanded], tVertexIndices[2 - wrongHanded], nbCols, samples);
				if (tVertexIndices[0] == vertexIdx || tVertexIndices[1] == vertexIdx || tVertexIndices[2] == vertexIdx)
				{
					if (findTriangle(orderedTriIndices, triIdx, nbPairsPerCM))
					{
						needsProcessing = false;
						break;
					}
				}
			}
		}
	}
	if (!needsProcessing)
	{
		//we don't need to generate contacts for this pair. However, we have already generated contacts in
		//the convexTriangleContactGen kernel. Therefore, we just need to clear the number of contacts in
		//the corresponding ConvexTriNormalAndIndex
		// NOTE(review): keeps only the low 24 bits — confirm the contact count
		// and flag bits all live in bits 24..31.
		cvxni.index = cvxni.index & 0x0ffffff;
	}
}
extern "C" __global__
//__launch_bounds__(NP_TRIMESH_WARPS_PER_BLOCK * WARP_SIZE, 32 / NP_TRIMESH_WARPS_PER_BLOCK)
void convexHeightfieldPostProcess(
ConvexMeshPair* PX_RESTRICT cvxTrimeshPair, // per CM
ConvexTriNormalAndIndex** PX_RESTRICT cvxTriNIPtr, // per cvx-tri
ConvexTriIntermediateData** PX_RESTRICT cvxTriIntermPtr, // per cvx-tri
const PxU32** PX_RESTRICT orderedCvxTriIntermPtr, // per cvx-tri
PxU32** PX_RESTRICT cvxTriSecondPassPairsPtr, // per cvx-tri
PxU8* PX_RESTRICT stackPtr,
PxU32* PX_RESTRICT nbSecondPassPairs,
PxgShape* PX_RESTRICT shapes)
{
__shared__ uint4* sPairsGPU;
if (threadIdx.x == 0 && threadIdx.y == 0)
{
sPairsGPU = reinterpret_cast<uint4 *>(stackPtr);
}
__syncthreads();
const PxU32 nbTpProcess = *nbSecondPassPairs;
for (PxU32 globalThreadIdx = blockIdx.x * blockDim.x + threadIdx.x; globalThreadIdx < nbTpProcess; globalThreadIdx += gridDim.x * blockDim.x)
{
convexHeightfieldPostProcessCore(
cvxTrimeshPair,
cvxTriNIPtr,
cvxTriIntermPtr,
orderedCvxTriIntermPtr,
cvxTriSecondPassPairsPtr,
shapes,
sPairsGPU,
nbSecondPassPairs,
globalThreadIdx);
}
}
extern "C" __global__
//__launch_bounds__(NP_TRIMESH_WARPS_PER_BLOCK * WARP_SIZE, 32 / NP_TRIMESH_WARPS_PER_BLOCK)
void sortTriangleIndices(
ConvexMeshPair* PX_RESTRICT cvxTrimeshPair, // per CM
PxU32** PX_RESTRICT orderedCvxTriIntermPtr, // per cvx-tri
const PxU32 nbPairs
)
{
const PxU32 globalWarpIndex = threadIdx.y + blockIdx.x * blockDim.y;
if (globalWarpIndex >= nbPairs)
return;
PxU32 nbTriangles = cvxTrimeshPair[globalWarpIndex].count;
PxU32 startIndex = cvxTrimeshPair[globalWarpIndex].roundedStartIndex;
//Terminate if we had 0 triangles or we cached them from last frame
if (nbTriangles == 0 || nbTriangles == 0xFFFFFFFF)
return;
assert((startIndex & 3) == 0);
const PxU32 nbUint4s = (nbTriangles + 3) / 4;
const PxU32 nbWarps = NP_TRIMESH_WARPS_PER_BLOCK;
radixSortSingleWarpKeysOnly<nbWarps>(reinterpret_cast<uint4*>(*orderedCvxTriIntermPtr + startIndex),
nbTriangles, nbUint4s, reinterpret_cast<uint4*>(*orderedCvxTriIntermPtr + startIndex) + nbUint4s, 8);
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,219 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "foundation/PxVec3.h"
#include "foundation/PxMat34.h"
#include "foundation/PxTransform.h"
#include "foundation/PxAssert.h"
#include <stdio.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "convexFormat.h"
#include "cudaNpCommon.h"
#include "PxsTransformCache.h"
#include "PxsContactManagerState.h"
#include "PxgContactManager.h"
#include "PxgConvexConvexShape.h"
#include "PxgPersistentContactManifold.h"
#include "PxgNpKernelIndices.h"
#include "PxsMaterialCore.h"
#include "PxContact.h"
#include "geometry/PxGeometry.h"
#include "SparseRemove.cuh"
#include "sphereCollision.cuh"
#include "contactPatchUtils.cuh"
using namespace physx;
extern "C" __host__ void initNarrowphaseKernels9() {}
extern "C" __global__ void sphereNphase_Kernel(
PxU32 numTests,
const PxReal toleranceLength,
const PxgContactManagerInput* PX_RESTRICT cmInputs,
PxsContactManagerOutput* PX_RESTRICT cmOutputs,
PxgShape* PX_RESTRICT shapes,
const PxsCachedTransform* PX_RESTRICT transformCache,
const PxReal* contactDistance,
const PxsMaterialData* PX_RESTRICT materials,
PxU8* PX_RESTRICT contactStream,
PxU8* PX_RESTRICT patchStream,
PxgPatchAndContactCounters* PX_RESTRICT patchAndContactCounters,
PxU32* PX_RESTRICT touchChangeFlags,
PxU32* PX_RESTRICT patchChangeFlags,
PxU8* PX_RESTRICT startContactPatches,
PxU8* PX_RESTRICT startContactPoints,
PxU8* PX_RESTRICT startContactForces,
PxU32 patchBytesLimit,
PxU32 contactBytesLimit,
PxU32 forceBytesLimit)
{
PxU32 globalThreadIndex = blockIdx.x * blockDim.x + threadIdx.x;
if (globalThreadIndex >= numTests)
return;
PxgContactManagerInput contactInput = cmInputs[globalThreadIndex];
PxU32 transformCacheRef0 = contactInput.transformCacheRef0;
PxU32 transformCacheRef1 = contactInput.transformCacheRef1;
PxsCachedTransform transformCache0 = transformCache[transformCacheRef0];
PxsCachedTransform transformCache1 = transformCache[transformCacheRef1];
const PxReal cDistance = contactDistance[transformCacheRef0] + contactDistance[transformCacheRef1];
PxgShape& shape0 = shapes[contactInput.shapeRef0];
PxgShape& shape1 = shapes[contactInput.shapeRef1];
PxGeometryType::Enum type0 = PxGeometryType::Enum(shape0.type);
PxGeometryType::Enum type1 = PxGeometryType::Enum(shape1.type);
PxVec3 scale0 = shape0.scale.scale;
PxVec3 scale1 = shape1.scale.scale;
const bool flip = (type1<type0);
//shape0 should be always sphere
if (flip)
{
PxSwap(type0, type1);
PxSwap(scale0, scale1);
PxSwap(transformCache0, transformCache1);
//printf("type0 %i type1 %i\n", type0, type1);
}
PxTransform& transform0 = transformCache0.transform;
PxTransform& transform1 = transformCache1.transform;
const PxReal sphereRadius = scale0.x;
float4 pointPen[2];
PxVec3 normal;
PxU32 nbContacts = 0;
if (type0 == PxGeometryType::eSPHERE)
{
switch (type1)
{
case PxGeometryType::eSPHERE:
{
//printf("sphere/sphere \n");
const PxReal sphereRadius1 = scale1.x;
nbContacts = spheresphere(transform0, transform1, sphereRadius, sphereRadius1, cDistance, pointPen[0], normal);
break;
}
case PxGeometryType::ePLANE:
{
//printf("sphere/plane \n");
nbContacts = sphereplane(transform0, transform1, sphereRadius, cDistance, pointPen[0], normal);
break;
}
case PxGeometryType::eCAPSULE:
{
//printf("sphere/capsule \n");
const PxReal capsuleRadius = scale1.y;
const PxReal halfHeight = scale1.x;
nbContacts = spherecapsule(transform0, transform1, sphereRadius, capsuleRadius, halfHeight, cDistance, pointPen[0], normal);
break;
}
case PxGeometryType::eBOX:
{
//printf("sphere/box \n");
const PxVec3 boxHalfExtents = scale1;
nbContacts = spherebox(transform0, transform1, sphereRadius, boxHalfExtents, cDistance, pointPen[0], normal);
break;
}
default:
break;
};
}
else
{
assert(type1 == PxGeometryType::eCAPSULE);
const PxReal capsuleRadius1 = scale1.y;
const PxReal capsuleHalfHeight1 = scale1.x;
switch (type0)
{
case PxGeometryType::ePLANE:
{
//printf("sphere/plane \n");
nbContacts = planeCapsule(transform0, transform1, capsuleRadius1, capsuleHalfHeight1, cDistance, pointPen, normal);
break;
}
case PxGeometryType::eCAPSULE:
{
////printf("sphere/capsule \n");
const PxReal capsuleRadius0 = scale0.y;
const PxReal halfHeight0 = scale0.x;
nbContacts = capsuleCapsule(transform0, transform1, capsuleRadius0, halfHeight0,
capsuleRadius1, capsuleHalfHeight1, cDistance, pointPen, normal);
break;
}
default:
break;
};
}
if (nbContacts && flip)
normal = -normal;
PxU32 contactByteOffset = setContactPointAndForcePointers(cmOutputs, patchAndContactCounters,
startContactPoints, startContactForces, contactBytesLimit, forceBytesLimit, globalThreadIndex, nbContacts);
if (contactByteOffset != 0xFFFFFFFF)
{
float4* baseContactStream = (float4*)(contactStream + contactByteOffset);
for (PxU32 i = 0; i < nbContacts; ++i)
{
float4& outPointPen = baseContactStream[i];
outPointPen = pointPen[i];
}
}
PxU32 patchIndex = registerContactPatch(cmOutputs, patchAndContactCounters, touchChangeFlags, patchChangeFlags, startContactPatches, patchBytesLimit, globalThreadIndex, nbContacts);
insertIntoPatchStream(materials, patchStream, shape0, shape1, patchIndex, normal, nbContacts);
}

View File

@@ -0,0 +1,396 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __DATA_RW_HELPER_CUH__
#define __DATA_RW_HELPER_CUH__
#include "foundation/PxSimpleTypes.h"
#include "foundation/PxBounds3.h"
#include "PxgContactManager.h"
#include "AlignedTransform.h"
#include "PxsTransformCache.h"
#include "PxgConvexConvexShape.h"
#include "convexNpCommon.h"
#include "PxgCommonDefines.h"
#include "geometry/PxGeometry.h"
#include "utils.cuh"
#include "PxgSolverCoreDesc.h"
#include "PxgArticulationCoreDesc.h"
using namespace physx;
// Velocity snapshot consumed by the PGS solver path.
struct PxgVelocityPackPGS
{
	PxVec3 linVel;	// linear velocity
	PxVec3 angVel;	// angular velocity
};
// Velocity snapshot consumed by the TGS solver path: velocities plus the
// accumulated motion deltas of the current solve.
struct PxgVelocityPackTGS
{
	PxVec3 linVel;		// linear velocity
	PxVec3 angVel;		// angular velocity
	PxVec3 linDelta;	// accumulated linear motion delta
	PxVec3 angDelta;	// accumulated angular motion delta
};
// Shared storage for either solver path's velocity snapshot.
union PxgVelocityPack
{
	PxgVelocityPackTGS tgs;
	PxgVelocityPackPGS pgs;
	// Empty ctor leaves the contents uninitialized; presumably required to keep
	// the union default-constructible on device since the members are non-trivial.
	PX_FORCE_INLINE __device__ PxgVelocityPack() {}
};
// Reads current velocities (and, for TGS, accumulated motion deltas) for a
// rigid body or an articulation link identified by a PxNodeIndex. Static
// bodies map to solver body 0.
struct PxgVelocityReader
{
	//Shared
	const PxU32* const solverBodyIndices;					// node index -> solver body index
	//Articulation
	const float4* const PX_RESTRICT artiLinkVelocities;		// per-link velocity/delta buffers, artiOffset entries apart
	const PxU32 maxLinks;									// max links per articulation
	const PxU32 nbArticulations;
	const PxU32 artiOffset;									// stride between consecutive per-link buffers
	//Rigid
	const PxU32 numSolverBodies;
	const float4* const PX_RESTRICT solverBodyDeltaVel;		// accumulated delta velocities
	const float4* const PX_RESTRICT initialVel;				// velocities at the start of the solve
	// Mem-initializers are listed in member declaration order (fixes the
	// -Wreorder warning and makes the artiOffset = maxLinks * nbArticulations
	// dependency on already-initialized members explicit).
	template <typename IterativeData>
	PX_FORCE_INLINE __device__ PxgVelocityReader(const PxgPrePrepDesc* preDesc, const PxgSolverCoreDesc* solverCoreDesc, const PxgArticulationCoreDesc* artiCoreDesc,
		const PxgSolverSharedDesc<IterativeData>* sharedDesc, PxU32 numSolverBodies) :
		solverBodyIndices(preDesc->solverBodyIndices),
		artiLinkVelocities(solverCoreDesc->outArtiVelocity),
		maxLinks(artiCoreDesc->mMaxLinksPerArticulation),
		nbArticulations(artiCoreDesc->nbArticulations),
		artiOffset(maxLinks * nbArticulations),
		numSolverBodies(numSolverBodies),
		solverBodyDeltaVel(sharedDesc->iterativeData.solverBodyVelPool + solverCoreDesc->accumulatedBodyDeltaVOffset),
		initialVel(solverCoreDesc->outSolverVelocity)
	{
	}
	// Fills result.linVel/result.angVel for the PGS path and returns the
	// solver body index of rigidId. For articulation links the angular
	// velocities live artiOffset entries after the linear ones; for rigid
	// bodies the current velocity is initial + accumulated delta, with angular
	// data numSolverBodies entries after linear.
	template<typename Pack>
	PX_FORCE_INLINE __device__ PxU32 readVelocitiesPGS(const PxNodeIndex rigidId, Pack& result)
	{
		const PxU32 solverBodyIdx = rigidId.isStaticBody() ? 0 : solverBodyIndices[rigidId.index()];
		if (rigidId.isArticulation())
		{
			const PxU32 index = solverBodyIdx * maxLinks;
			const float4* vels = &artiLinkVelocities[index];
			const PxU32 linkID = rigidId.articulationLinkId();
			result.linVel = PxLoad3(vels[linkID]);
			result.angVel = PxLoad3(vels[linkID + artiOffset]);
		}
		else
		{
			result.linVel = PxLoad3(initialVel[solverBodyIdx]) + PxLoad3(solverBodyDeltaVel[solverBodyIdx]);
			result.angVel = PxLoad3(initialVel[solverBodyIdx + numSolverBodies]) + PxLoad3(solverBodyDeltaVel[solverBodyIdx + numSolverBodies]);
		}
		return solverBodyIdx;
	}
	// Fills result for the TGS path and returns the solver body index of
	// rigidId. Articulation links store linVel/angVel/linDelta/angDelta in four
	// buffers artiOffset entries apart; rigid bodies pack the same 12 floats
	// across three consecutive float4 slots (numSolverBodies entries apart).
	PX_FORCE_INLINE __device__ PxU32 readVelocitiesTGS(const PxNodeIndex rigidId, PxgVelocityPackTGS& result)
	{
		const PxU32 solverBodyIdx = rigidId.isStaticBody() ? 0 : solverBodyIndices[rigidId.index()];
		if (rigidId.isArticulation())
		{
			const PxU32 index = solverBodyIdx * maxLinks;
			const float4* vels = &artiLinkVelocities[index];
			const PxU32 linkID = rigidId.articulationLinkId();
			float4 linearVelocity = vels[linkID];
			float4 angularVelocity = vels[linkID + artiOffset];
			float4 linDelta = vels[linkID + artiOffset + artiOffset];
			float4 angDelta = vels[linkID + artiOffset + artiOffset + artiOffset];
			result.linVel = PxVec3(linearVelocity.x, linearVelocity.y, linearVelocity.z);
			result.angVel = PxVec3(angularVelocity.x, angularVelocity.y, angularVelocity.z);
			result.linDelta = PxVec3(linDelta.x, linDelta.y, linDelta.z);
			result.angDelta = PxVec3(angDelta.x, angDelta.y, angDelta.z);
		}
		else
		{
			float4 vel0 = solverBodyDeltaVel[solverBodyIdx];
			float4 vel1 = solverBodyDeltaVel[solverBodyIdx + numSolverBodies];
			float4 vel2 = solverBodyDeltaVel[solverBodyIdx + numSolverBodies + numSolverBodies];
			result.linVel = PxVec3(vel0.x, vel0.y, vel0.z);
			result.angVel = PxVec3(vel0.w, vel1.x, vel1.y);
			result.linDelta = PxVec3(vel1.z, vel1.w, vel2.x);
			result.angDelta = PxVec3(vel2.y, vel2.z, vel2.w);
		}
		return solverBodyIdx;
	}
};
// Warp-cooperative load of one PxgContactManagerInput (4 x 32-bit words):
// lanes 0-3 each read one word, then broadcast to all lanes via __shfl_sync.
// All 32 lanes must call this together (FULL_MASK). `input` is left undefined
// on lanes >= 4, which is fine because only lanes 0-3 are shuffle sources.
__device__ inline static
void PxgContactManagerInput_ReadWarp(PxgContactManagerInput & res, const PxgContactManagerInput * ptr, PxU32 offset)
{
	const PxU32 laneIdx = threadIdx.x & (WARP_SIZE - 1);
	PxU32 input;
	if (laneIdx < 4)
		input = reinterpret_cast<const PxU32 *>(ptr + offset)[laneIdx];
	res.shapeRef0 = __shfl_sync(FULL_MASK, (input), 0);
	res.shapeRef1 = __shfl_sync(FULL_MASK, (input), 1);
	res.transformCacheRef0 = __shfl_sync(FULL_MASK, (input), 2);
	res.transformCacheRef1 = __shfl_sync(FULL_MASK, (input), 3);
}
// Warp-cooperative load of one PxAlignedTransform (8 floats: quaternion then
// position): lanes 0-7 each read one float, then broadcast via __shfl_sync.
// All 32 lanes must call this together (FULL_MASK); `input` is left undefined
// on lanes >= 8, which is fine because only lanes 0-7 are shuffle sources.
__device__ inline static
void PxAlignedTransform_ReadWarp(PxAlignedTransform& res, const PxAlignedTransform* ptr)
{
	const PxU32 laneIdx = threadIdx.x & (WARP_SIZE - 1);
	PxReal input;
	if (laneIdx < 8)
		input = reinterpret_cast<const PxReal *>(ptr)[laneIdx];
	res.q.q.x = __shfl_sync(FULL_MASK, input, 0);
	res.q.q.y = __shfl_sync(FULL_MASK, input, 1);
	res.q.q.z = __shfl_sync(FULL_MASK, input, 2);
	res.q.q.w = __shfl_sync(FULL_MASK, input, 3);
	res.p.x = __shfl_sync(FULL_MASK, input, 4);
	res.p.y = __shfl_sync(FULL_MASK, input, 5);
	res.p.z = __shfl_sync(FULL_MASK, input, 6);
	res.p.w = __shfl_sync(FULL_MASK, input, 7);
}
// Warp-cooperative load of one PxsCachedTransform (quaternion, position, then
// integer flags): the first sizeof/4 lanes each read one 32-bit word, then
// broadcast via __shfl_sync. All 32 lanes must call this together (FULL_MASK).
// The flags word is reinterpreted back to int with __float_as_int.
__device__ inline static
void PxsCachedTransform_ReadWarp(PxsCachedTransform& res, const PxsCachedTransform* ptr)
{
	const PxU32 laneIdx = threadIdx.x & (WARP_SIZE - 1);
	PxReal input;
	if (laneIdx < sizeof(PxsCachedTransform)/sizeof(PxReal))
		input = reinterpret_cast<const PxReal *>(ptr)[laneIdx];
	res.transform.q.x = __shfl_sync(FULL_MASK, input, 0);
	res.transform.q.y = __shfl_sync(FULL_MASK, input, 1);
	res.transform.q.z = __shfl_sync(FULL_MASK, input, 2);
	res.transform.q.w = __shfl_sync(FULL_MASK, input, 3);
	res.transform.p.x = __shfl_sync(FULL_MASK, input, 4);
	res.transform.p.y = __shfl_sync(FULL_MASK, input, 5);
	res.transform.p.z = __shfl_sync(FULL_MASK, input, 6);
	res.flags = __shfl_sync(FULL_MASK, __float_as_int(input), 7);
}
// Warp-cooperative load of one PxBounds3 (6 floats: minimum then maximum):
// lanes 0-5 each read one float, then broadcast via __shfl_sync. All 32 lanes
// must call this together (FULL_MASK); `input` is left undefined on the
// remaining lanes, which are never used as shuffle sources.
__device__ inline static
void PxBounds3_ReadWarp(PxBounds3& res, const PxBounds3* ptr)
{
	const PxU32 laneIdx = threadIdx.x & (WARP_SIZE - 1);
	PxReal input;
	if (laneIdx < sizeof(PxBounds3) /sizeof(PxReal))
		input = reinterpret_cast<const PxReal *>(ptr)[laneIdx];
	res.minimum.x = __shfl_sync(FULL_MASK, input, 0);
	res.minimum.y = __shfl_sync(FULL_MASK, input, 1);
	res.minimum.z = __shfl_sync(FULL_MASK, input, 2);
	res.maximum.x = __shfl_sync(FULL_MASK, input, 3);
	res.maximum.y = __shfl_sync(FULL_MASK, input, 4);
	res.maximum.z = __shfl_sync(FULL_MASK, input, 5);
}
// Warp-cooperative store of a PxsCachedTransform: lanes 0-7 each copy one
// 32-bit word from 'inp' to 'ptr' (assumes the struct is exactly 8 words,
// matching PxsCachedTransform_ReadWarp above). Lanes >= 8 do nothing, so
// partial warp participation is harmless here (no shuffles involved).
__device__ inline static
void PxsCachedTransform_WriteWarp(PxsCachedTransform* ptr, const PxsCachedTransform& inp)
{
const PxU32 laneIdx = threadIdx.x & (WARP_SIZE - 1);
if (laneIdx < 8)
{
uint input = reinterpret_cast<const uint *>(&inp)[laneIdx];
reinterpret_cast<uint *>(ptr)[laneIdx] = input;
}
}
// Warp-cooperative store of a PxAlignedTransform: lanes 0-7 each copy one
// 32-bit word (quaternion x/y/z/w, position x/y/z/w) from 'inp' to 'ptr'.
// Lanes >= 8 do nothing; no shuffles are used, so divergence is harmless.
__device__ inline static
void PxAlignedTransform_WriteWarp(PxAlignedTransform* ptr, const PxAlignedTransform& inp)
{
const PxU32 laneIdx = threadIdx.x & (WARP_SIZE - 1);
if (laneIdx < 8)
{
uint input = reinterpret_cast<const uint *>(&inp)[laneIdx];
reinterpret_cast<uint *>(ptr)[laneIdx] = input;
}
}
// Warp-cooperative load of a PxgShape. The struct is read as consecutive
// 32-bit words: words 0-2 scale.scale, words 3-6 scale.rotation (quaternion),
// word 7 materialIndex, word 8 (plus word 9 on 64-bit pointer builds) the
// hullOrMeshPtr, followed by type and particleOrSoftbodyId.
// All 32 lanes must participate (FULL_MASK shuffles); 'input' is left
// uninitialized on lanes beyond the word count, which are never selected as
// shuffle sources. Integer fields are recovered from the float lane payload
// with __float_as_uint.
__device__ inline static
void PxgShape_ReadWarp(PxgShape& res, const PxgShape* ptr)
{
const PxU32 laneIdx = threadIdx.x & (WARP_SIZE - 1);
PxReal input;
// Word count: 11 on 32-bit pointer builds, 12 on 64-bit builds.
if (laneIdx < 11 + (sizeof(size_t) == 8) * 1)
{
input = reinterpret_cast<const PxReal *>(ptr)[laneIdx];
}
res.scale.scale.x = __shfl_sync(FULL_MASK, input, 0);
res.scale.scale.y = __shfl_sync(FULL_MASK, input, 1);
res.scale.scale.z = __shfl_sync(FULL_MASK, input, 2);
res.scale.rotation.x = __shfl_sync(FULL_MASK, input, 3);
res.scale.rotation.y = __shfl_sync(FULL_MASK, input, 4);
res.scale.rotation.z = __shfl_sync(FULL_MASK, input, 5);
res.scale.rotation.w = __shfl_sync(FULL_MASK, input, 6);
res.materialIndex = __shfl_sync(FULL_MASK, __float_as_uint(input), 7);
// Reassemble the pointer; on 64-bit builds it spans two 32-bit words.
size_t hullOrMeshPtr = __shfl_sync(FULL_MASK, __float_as_uint(input), 8);
if (sizeof(size_t) == 8)
{
const PxU64 hullPtr_hi = __shfl_sync(FULL_MASK, __float_as_uint(input), 9);
hullOrMeshPtr |= hullPtr_hi << 32ll;
}
res.hullOrMeshPtr = hullOrMeshPtr;
// The trailing fields sit after the pointer, so their word index depends
// on the pointer size.
res.type = __shfl_sync(FULL_MASK, __float_as_uint(input), 9 + (sizeof(size_t) == 8) * 1);
res.particleOrSoftbodyId = __shfl_sync(FULL_MASK, __float_as_uint(input), 10 + (sizeof(size_t) == 8) * 1);
}
/*
 * Fetches the two shapes referenced by contact manager cmIdx together with
 * their transform cache indices. The result is ordered so that shape0 has
 * geometry type ExpectedType0 (one of the two shapes is assumed to match).
 * Asserts the resulting types; ExpectedType1 == eINVALID disables the
 * check on shape1.
 */
template <PxGeometryType::Enum ExpectedType0, PxGeometryType::Enum ExpectedType1 = PxGeometryType::eINVALID>
PX_FORCE_INLINE __device__ void LoadShapePair(
	const PxgContactManagerInput* PX_RESTRICT cmInputs,
	PxU32 cmIdx,
	const PxgShape* PX_RESTRICT gpuShapes,
	PxgShape& shape0, //output
	PxU32& transformCacheRef0, //output
	PxgShape& shape1, //output
	PxU32& transformCacheRef1 //output
)
{
	const PxgContactManagerInput input = cmInputs[cmIdx];

	const PxgShape* first = gpuShapes + input.shapeRef0;
	const PxgShape* second = gpuShapes + input.shapeRef1;
	PxU32 cacheA = input.transformCacheRef0;
	PxU32 cacheB = input.transformCacheRef1;

	// If the expected types differ we may have to flip the pair so the
	// ExpectedType0 geometry ends up in shape0. (With identical expected
	// types the order is irrelevant, so the check is skipped entirely.)
	if (ExpectedType0 != ExpectedType1)
	{
		if (PxGeometryType::Enum(first->type) != ExpectedType0)
		{
			PxSwap(first, second);
			PxSwap(cacheA, cacheB);
		}
	}

	shape0 = *first;
	shape1 = *second;
	transformCacheRef0 = cacheA;
	transformCacheRef1 = cacheB;

	assert(shape0.type == ExpectedType0);
	assert(ExpectedType1 == PxGeometryType::eINVALID || shape1.type == ExpectedType1);
}
/*
 * Warp-wide variant of LoadShapePair: the full warp must call this together.
 * Loads the shape pair referenced by contact manager cmIdx cooperatively
 * (warp-wide reads) and orders the result so that shape0 has geometry type
 * ExpectedType0. Asserts the resulting types; ExpectedType1 == eINVALID
 * disables the check on shape1.
 */
template <PxGeometryType::Enum ExpectedType0, PxGeometryType::Enum ExpectedType1 = PxGeometryType::eINVALID>
PX_FORCE_INLINE __device__ void LoadShapePairWarp(
	const PxgContactManagerInput* PX_RESTRICT cmInputs,
	PxU32 cmIdx,
	const PxgShape* PX_RESTRICT gpuShapes,
	PxgShape& shape0, //output
	PxU32& transformCacheRef0, //output
	PxgShape& shape1, //output
	PxU32& transformCacheRef1 //output
)
{
	PxgContactManagerInput input;
	PxgContactManagerInput_ReadWarp(input, cmInputs, cmIdx);

	const PxgShape* first = gpuShapes + input.shapeRef0;
	const PxgShape* second = gpuShapes + input.shapeRef1;
	PxU32 cacheA = input.transformCacheRef0;
	PxU32 cacheB = input.transformCacheRef1;

	// The swap decision is identical on every lane (the broadcast inputs are
	// warp-uniform), so control flow stays convergent for the warp-wide
	// PxgShape_ReadWarp calls below.
	if (ExpectedType0 != ExpectedType1)
	{
		if (PxGeometryType::Enum(first->type) != ExpectedType0)
		{
			PxSwap(first, second);
			PxSwap(cacheA, cacheB);
		}
	}

	PxgShape_ReadWarp(shape0, first);
	PxgShape_ReadWarp(shape1, second);
	transformCacheRef0 = cacheA;
	transformCacheRef1 = cacheB;

	assert(shape0.type == ExpectedType0);
	assert(ExpectedType1 == PxGeometryType::eINVALID || shape1.type == ExpectedType1);
}
// Warp-cooperative store of a ConvexTriNormalAndIndex: lanes 0-3 each copy
// one 32-bit word from 'inp' to 'outp' (assumes the struct is exactly
// 4 words - TODO confirm against the struct definition). Lanes >= 4 do
// nothing; no shuffles are used, so divergence is harmless.
__device__ inline static
void ConvexTriNormalAndIndex_WriteWarp(ConvexTriNormalAndIndex* outp, const ConvexTriNormalAndIndex& inp)
{
const PxU32 laneIdx = threadIdx.x & (WARP_SIZE - 1);
if (laneIdx < 4)
{
uint input = reinterpret_cast<const uint *>(&inp)[laneIdx];
reinterpret_cast<uint *>(outp)[laneIdx] = input;
}
}
// Warp-cooperative store of a ConvexMeshPair: one 32-bit word per lane,
// sized from the struct itself (must not exceed WARP_SIZE words for full
// coverage - presumably it doesn't; confirm against the struct definition).
// Lanes beyond the word count do nothing; no shuffles, so divergence is fine.
__device__ inline static
void ConvexMeshPair_WriteWarp(ConvexMeshPair* outp, const ConvexMeshPair& inp)
{
const PxU32 laneIdx = threadIdx.x & (WARP_SIZE - 1);
if (laneIdx < (sizeof(ConvexMeshPair)/sizeof(uint)))
{
uint input = reinterpret_cast<const uint *>(&inp)[laneIdx];
reinterpret_cast<uint *>(outp)[laneIdx] = input;
}
}
#endif

View File

@@ -0,0 +1,380 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __DEFORMABLE_COLLISION_CUH__
#define __DEFORMABLE_COLLISION_CUH__
#include "foundation/PxVec2.h"
#include "foundation/PxVecMath.h"
#include "GuDistancePointTriangle.h"
#include "PxgFEMCloth.h"
#define DEFORMABLE_BARYCENTRIC_THRESHOLD (1.0e-6f)
#define DEFORMABLE_ONE_MINUS_BARYCENTRIC_THRESHOLD (1.0f - DEFORMABLE_BARYCENTRIC_THRESHOLD)
// Axis-aligned bounding box of a triangle given its three world-space vertices.
PX_FORCE_INLINE __device__ PxBounds3 triBoundingBox(const PxVec3& worldV0, const PxVec3& worldV1, const PxVec3& worldV2)
{
	PxBounds3 bounds;
	// Component-wise minimum over the three vertices.
	bounds.minimum.x = PxMin(PxMin(worldV0.x, worldV1.x), worldV2.x);
	bounds.minimum.y = PxMin(PxMin(worldV0.y, worldV1.y), worldV2.y);
	bounds.minimum.z = PxMin(PxMin(worldV0.z, worldV1.z), worldV2.z);
	// Component-wise maximum over the three vertices.
	bounds.maximum.x = PxMax(PxMax(worldV0.x, worldV1.x), worldV2.x);
	bounds.maximum.y = PxMax(PxMax(worldV0.y, worldV1.y), worldV2.y);
	bounds.maximum.z = PxMax(PxMax(worldV0.z, worldV1.z), worldV2.z);
	return bounds;
}
// Axis-aligned bounding box of a tetrahedron given its four world-space vertices.
PX_FORCE_INLINE __device__ PxBounds3 tetBoundingBox(const PxVec3& worldV0, const PxVec3& worldV1, const PxVec3& worldV2, const PxVec3& worldV3)
{
	PxBounds3 bounds;
	// Component-wise minimum over the four vertices (pairwise reduction).
	bounds.minimum.x = PxMin(PxMin(worldV0.x, worldV1.x), PxMin(worldV2.x, worldV3.x));
	bounds.minimum.y = PxMin(PxMin(worldV0.y, worldV1.y), PxMin(worldV2.y, worldV3.y));
	bounds.minimum.z = PxMin(PxMin(worldV0.z, worldV1.z), PxMin(worldV2.z, worldV3.z));
	// Component-wise maximum over the four vertices.
	bounds.maximum.x = PxMax(PxMax(worldV0.x, worldV1.x), PxMax(worldV2.x, worldV3.x));
	bounds.maximum.y = PxMax(PxMax(worldV0.y, worldV1.y), PxMax(worldV2.y, worldV3.y));
	bounds.maximum.z = PxMax(PxMax(worldV0.z, worldV1.z), PxMax(worldV2.z, worldV3.z));
	return bounds;
}
// Barycentric coordinates of 'point' with respect to tetrahedron (a, b, c, d),
// computed as ratios of signed volumes (scalar triple products). Writes the
// clamped coordinates to 'barycentric' and returns true when the unclamped
// coordinates all lie within [-eps, 1 + eps], i.e. the point is (almost)
// inside the tetrahedron. Degenerate tetrahedra (zero volume) produce
// inf/nan coordinates, as in the original formulation.
__device__ inline bool computeTetBarycentric(const PxVec3& a, const PxVec3& b, const PxVec3& c, const PxVec3& d, const PxVec3& point,
float4& barycentric, const PxReal eps = 1e-5f)
{
	const PxVec3 ba = b - a;
	const PxVec3 ca = c - a;
	const PxVec3 da = d - a;
	const PxVec3 pa = point - a;

	// detBcd is 6x the signed volume of the whole tetrahedron; the other
	// determinants replace one edge with the point to get sub-volumes.
	const PxReal detBcd = ba.dot(ca.cross(da));
	const PxReal v = pa.dot(ca.cross(da)) / detBcd;
	const PxReal w = ba.dot(pa.cross(da)) / detBcd;
	const PxReal x = ba.dot(ca.cross(pa)) / detBcd;
	const PxReal u = 1.f - v - w - x;

	// Store clamped coordinates: slightly negative barycentrics could otherwise
	// yield a negative interpolated inverse mass (e.g. when three vertices of
	// the tetrahedron have inverse mass zero and the fourth goes negative).
	// Note the clamped coordinates may no longer sum exactly to one.
	barycentric = make_float4(PxClamp(u, 0.0f, 1.0f),
	                          PxClamp(v, 0.0f, 1.0f),
	                          PxClamp(w, 0.0f, 1.0f),
	                          PxClamp(x, 0.0f, 1.0f));

	// Containment test uses the unclamped values with tolerance eps.
	const PxReal minEps = -eps;
	const PxReal maxEps = 1.f + eps;
	return (u >= minEps && u <= maxEps) && (v >= minEps && v <= maxEps) && (w >= minEps && w <= maxEps) && (x >= minEps && x <= maxEps);
}
// Convenience overload: same as above, taking the four tetrahedron vertices
// as an array (verts[0..3]).
__device__ inline bool computeTetBarycentric(PxVec3* verts, const PxVec3& point, float4& barycentric, const float eps = 1e-5f)
{
return computeTetBarycentric(verts[0], verts[1], verts[2], verts[3], point, barycentric, eps);
}
// Closest point on triangle (a, b, c) to point p, via Voronoi-region case
// analysis (cf. Ericson, "Real-Time Collision Detection", 5.1.5).
// Outputs s, t such that the closest point is a + s*(b - a) + t*(c - a),
// i.e. barycentric coordinates (1 - s - t, s, t).
__device__ inline PxVec3 closestPtPointTriangle(const PxVec3& p, const PxVec3& a, const PxVec3& b, const PxVec3& c, float& s, float& t)
{
// Check if P in vertex region outside A
const PxVec3 ab = b - a;
const PxVec3 ac = c - a;
const PxVec3 ap = p - a;
const float d1 = ab.dot(ap);
const float d2 = ac.dot(ap);
if(d1 <= 0.0f && d2 <= 0.0f)
{
s = 0.0f;
t = 0.0f;
return a; // Barycentric coords 1,0,0
}
// Check if P in vertex region outside B
const PxVec3 bp = p - b;
const float d3 = ab.dot(bp);
const float d4 = ac.dot(bp);
if(d3 >= 0.0f && d4 <= d3)
{
s = 1.0f;
t = 0.0f;
return b; // Barycentric coords 0,1,0
}
// Check if P in edge region of AB, if so return projection of P onto AB
const float vc = d1 * d4 - d3 * d2;
if(vc <= 0.0f && d1 >= 0.0f && d3 <= 0.0f)
{
const float v = d1 / (d1 - d3);
s = v;
t = 0.0f;
return a + v * ab; // barycentric coords (1-v, v, 0)
}
// Check if P in vertex region outside C
const PxVec3 cp = p - c;
const float d5 = ab.dot(cp);
const float d6 = ac.dot(cp);
if(d6 >= 0.0f && d5 <= d6)
{
s = 0.0f;
t = 1.0f;
return c; // Barycentric coords 0,0,1
}
// Check if P in edge region of AC, if so return projection of P onto AC
const float vb = d5 * d2 - d1 * d6;
if(vb <= 0.0f && d2 >= 0.0f && d6 <= 0.0f)
{
const float w = d2 / (d2 - d6);
s = 0.0f;
t = w;
return a + w * ac; // barycentric coords (1-w, 0, w)
}
// Check if P in edge region of BC, if so return projection of P onto BC
const float va = d3 * d6 - d5 * d4;
if(va <= 0.0f && (d4 - d3) >= 0.0f && (d5 - d6) >= 0.0f)
{
const float w = (d4 - d3) / ((d4 - d3) + (d5 - d6));
s = 1.0f - w;
t = w;
return b + w * (c - b); // barycentric coords (0, 1-w, w)
}
// P inside face region. Compute Q through its barycentric coords (u,v,w)
const float denom = 1.0f / (va + vb + vc);
const float v = vb * denom;
const float w = vc * denom;
s = v;
t = w;
return a + ab * v + ac * w;
}
// Squared distance from point p to triangle (a, b, c); also returns the
// closest point on the triangle in closestP.
__device__ static PxReal distancePointTriangleSquared(const PxVec3 p, const PxVec3 a, const PxVec3 b, const PxVec3 c, PxVec3& closestP)
{
	// Delegate the closest-point computation, passing the two edge vectors.
	closestP = Gu::closestPtPointTriangle2(p, a, b, c, b - a, c - a);
	const PxVec3 delta = p - closestP;
	return delta.magnitudeSquared();
}
// Projects v0 onto segment (ev0, ev1) and returns the clamped parametric
// coordinate in w; a degenerate (zero-length) edge yields the midpoint 0.5.
__device__ static void closestPtPointEdge(const PxVec3& v0, const PxVec3& ev0, const PxVec3& ev1, PxReal& w)
{
	const PxVec3 dir = ev1 - ev0;
	const PxReal lenSq = dir.dot(dir);
	if (lenSq > 0.0f)
		w = PxClamp(dir.dot(v0 - ev0) / lenSq, 0.0f, 1.0f);
	else
		w = 0.5f;
}
// Computes the closest points on two infinite lines.
// Returns false if the edges are parallel or degenerate.
// Outputs:
// - s, t: parametric positions along edge0 and edge1 (can be negative for infinite lines)
// - distSq: squared distance between the two closest points
PX_FORCE_INLINE __device__ static bool closestPtLineLine(const PxVec3& e0v0, const PxVec3& e0v1, const PxVec3& e1v0, const PxVec3& e1v1,
PxReal& s, PxReal& t, PxReal& distSq)
{
const PxVec3 r0 = e0v1 - e0v0; // Edge0 direction
const PxVec3 r1 = e1v1 - e1v0; // Edge1 direction
const PxVec3 d = e1v0 - e0v0; // Displacement between starting points
const PxReal r0Len2 = r0.magnitudeSquared();
const PxReal r1Len2 = r1.magnitudeSquared();
// Handle degenerate edges: If either edge has zero length, return midpoints.
// Non-short-circuiting '|' is intentional - both operands are evaluated
// branchlessly.
if(r0Len2 < 1.0e-18f | r1Len2 < 1.0e-18f)
{
s = t = 0.0f;
distSq = d.magnitudeSquared();
return false;
}
const PxReal r0Len = PxSqrt(r0Len2);
const PxReal r1Len = PxSqrt(r1Len2);
const PxVec3 u0 = r0 / r0Len;
const PxVec3 u1 = r1 / r1Len;
const PxReal cosTheta = PxAbs(u0.dot(u1));
// Edges are nearly parallel: |cos| > 0.9999 corresponds to an angle below
// roughly 0.81 degrees.
if(cosTheta > 0.9999f)
{
// Pick point e0v0 on edge0 and project to edge1
PxReal tProj = (e0v0 - e1v0).dot(r1) / r1Len2;
t = tProj;
s = 0.0f;
PxVec3 pt0 = e0v0;
PxVec3 pt1 = e1v0 + r1 * tProj;
distSq = (pt1 - pt0).magnitudeSquared();
return false;
}
// Solve the 2x2 normal-equation system for the line parameters; with
// normalized directions the diagonal terms are 1:
// PxReal a00 = 1.0f;
// PxReal a11 = 1.0f;
PxReal a01 = -u0.dot(u1);
PxReal b0 = u0.dot(d) / r0Len;
PxReal b1 = -u1.dot(d) / r1Len;
PxReal det = 1.0f - a01 * a01;
// Second parallel/degeneracy guard on the system determinant.
if(det < 1.0e-9f)
{
s = t = 0.0f;
distSq = d.magnitudeSquared();
return false;
}
// Compute actual closest points
const PxReal invDet = 1.0f / det;
s = (b0 - b1 * a01) * invDet;
t = (b1 - a01 * b0) * invDet;
const PxVec3 pt0 = e0v0 + s * r0;
const PxVec3 pt1 = e1v0 + t * r1;
distSq = (pt1 - pt0).magnitudeSquared();
return true;
}
// Computes the closest points on two finite edges.
// First solves the infinite-line problem; when either parameter falls outside
// [0, 1], falls back to the minimum over the four point-vs-edge projections
// (each edge endpoint against the other edge), which is the exact clamped
// solution. Returns false only for parallel/degenerate edges (s, t, distSq
// are still filled in by closestPtLineLine in that case).
PX_FORCE_INLINE __device__ static bool closestPtEdgeEdge(const PxVec3& e0v0, const PxVec3& e0v1, const PxVec3& e1v0, const PxVec3& e1v1,
PxReal& s, PxReal& t, PxReal& distSq)
{
if(!closestPtLineLine(e0v0, e0v1, e1v0, e1v1, s, t, distSq))
{
return false; // Degenerate edges
}
// Falls back to point edge distance. Non-short-circuiting '|' is intentional
// (branchless evaluation of all four range checks).
if(s<0.0f | s> 1.0f | t<0.0f | t> 1.0f)
{
PxReal w[4];
closestPtPointEdge(e0v0, e1v0, e1v1, w[0]);
closestPtPointEdge(e0v1, e1v0, e1v1, w[1]);
closestPtPointEdge(e1v0, e0v0, e0v1, w[2]);
closestPtPointEdge(e1v1, e0v0, e0v1, w[3]);
// Candidates for closest point selection, as (s, t) parameter pairs.
const PxReal candidates[4][2] = { { 0.0f, w[0] }, { 1.0f, w[1] }, { w[2], 0.0f }, { w[3], 1.0f } };
PxI32 minIndex = 0;
distSq = PX_MAX_REAL;
#pragma unroll
// Compare the four closest points and select the minimum distance pair.
for(int i = 0; i < 4; ++i)
{
const PxReal sC = candidates[i][0];
const PxReal tC = candidates[i][1];
const PxVec3 p0 = e0v0 * (1.0f - sC) + e0v1 * sC;
const PxVec3 p1 = e1v0 * (1.0f - tC) + e1v1 * tC;
const PxReal d2 = (p0 - p1).magnitudeSquared();
minIndex = (d2 < distSq) ? i : minIndex;
distSq = PxMin(d2, distSq);
}
// Assign the best closest point.
assert(minIndex != -1);
s = candidates[minIndex][0];
t = candidates[minIndex][1];
}
return true;
}
// Type0: minimal triangle set covering all edges and vertices (compact encoding)
// Type1: more uniformly distributed edges and vertices across triangles (balanced encoding)
PX_FORCE_INLINE __device__ static bool isType0EdgeActive(PxU32 edgeAuthorship, PxU32 localEdgeIndex)
{
	// Test the authorship bit for this local edge in the type-0 encoding.
	const PxU32 bit = EdgeEncoding::TYPE0_EDGE_BASE_POS + localEdgeIndex;
	return ((edgeAuthorship >> bit) & 1U) != 0U;
}
// Tests the authorship bit for this local edge in the type-1 encoding.
PX_FORCE_INLINE __device__ static bool isType1EdgeActive(PxU32 edgeAuthorship, PxU32 localEdgeIndex)
{
	const PxU32 bit = EdgeEncoding::TYPE1_EDGE_BASE_POS + localEdgeIndex;
	return ((edgeAuthorship >> bit) & 1U) != 0U;
}
// Tests the authorship bit for this local vertex in the type-0 encoding.
PX_FORCE_INLINE __device__ static bool isType0VertexActive(PxU32 edgeAuthorship, PxU32 localVertexIndex)
{
	const PxU32 bit = EdgeEncoding::TYPE0_VERTEX0_ACTIVE_POS + localVertexIndex;
	return ((edgeAuthorship >> bit) & 1U) != 0U;
}
// Tests the authorship bit for this local vertex in the type-1 encoding.
PX_FORCE_INLINE __device__ static bool isType1VertexActive(PxU32 edgeAuthorship, PxU32 localVertexIndex)
{
	const PxU32 bit = EdgeEncoding::TYPE1_VERTEX0_ACTIVE_POS + localVertexIndex;
	return ((edgeAuthorship >> bit) & 1U) != 0U;
}
#endif // __DEFORMABLE_COLLISION_CUH__

View File

@@ -0,0 +1,180 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __DEFORMABLE_ELEMENT_FILTER_CUH__
#define __DEFORMABLE_ELEMENT_FILTER_CUH__
#include "foundation/PxSimpleTypes.h"
#include "PxFiltering.h"
#include "PxgSimulationCoreDesc.h"
#include "PxgParticleSystem.h"
using namespace physx;
static __device__ bool find(PxParticleRigidFilterPair* pairs, const PxU32 nbPairs, PxParticleRigidFilterPair& pair)
{
//Binary search...
PxU32 l = 0, r = nbPairs;
while (l < r)
{
PxU32 mid = (l + r) / 2;
const PxParticleRigidFilterPair& p = pairs[mid];
if (pair == p)
return true;
else if (pair < p)
r = mid;
else
l = mid + 1;
}
return false;
}
// Convenience wrapper: builds a rigid/particle filter key from the raw ids
// and searches for it in the sorted pair list.
static __device__ bool find(PxParticleRigidFilterPair* pairs, const PxU32 nbPairs, PxU64 rigidId, PxU32 particleId)
{
PxParticleRigidFilterPair pair;
pair.mID0 = rigidId;
pair.mID1 = particleId;
return find(pairs, nbPairs, pair);
}
// Binary search for 'pair' in the sorted array 'pairs'.
// Returns true on an exact match, or - when ignoreIndex2 is set - as soon as
// a probed element matches on (index0, index1) regardless of index2.
// Precondition: 'pairs' is sorted ascending with respect to compare().
static __device__ bool find(const PxgRigidFilterPair* pairs, const PxU32 nbPairs, PxgRigidFilterPair& pair, bool ignoreIndex2)
{
	//Binary search...
	PxU32 l = 0, r = nbPairs;
	while (l < r)
	{
		// Overflow-safe midpoint: (l + r) / 2 could wrap for very large arrays.
		const PxU32 mid = l + (r - l) / 2;
		const PxgRigidFilterPair& p = pairs[mid];
		const PxI32 cmp = pair.compare(p);
		if (cmp == 0 || (ignoreIndex2 && pair.index0 == p.index0 && pair.index1 == p.index1))
			return true;
		if (cmp < 0)
			r = mid;
		else
			l = mid + 1;
	}
	return false;
}
// Convenience wrapper: builds a rigid filter key from the rigid id and the
// compressed particle id (index2 zeroed) and searches ignoring index2, so any
// entry for this (rigid, particle) combination matches.
static __device__ bool find(const PxgRigidFilterPair* pairs, const PxU32 nbPairs, PxU64 rigidId, PxU32 compressedParticleId)
{
PxgRigidFilterPair pair;
pair.index0 = rigidId;
pair.index1 = compressedParticleId;
pair.index2 = 0;
return find(pairs, nbPairs, pair, true);
}
// Returns the index of the last element in the ascending-sorted array
// 'values' that is <= value, i.e. the run/range that 'value' falls into.
// Returns 0 when length == 0.
// Precondition: values[0] <= value; otherwise the search ends with r == 0
// and the result underflows (wraps to 0xFFFFFFFF), as in the original.
static __device__ PxU32 findRange(PxU32 value, PxU32* values, PxU32 length)
{
	if (length == 0)
		return 0;
	PxU32 l = 0, r = length;
	while (l < r)
	{
		// Overflow-safe midpoint: (l + r) / 2 could wrap for very large arrays.
		const PxU32 m = l + (r - l) / 2;
		if (values[m] > value)
			r = m;
		else
			l = m + 1;
	}
	// r is the index of the first element strictly greater than 'value'.
	return r-1;
}
// Rigid-vs-particle filter lookup for a particle addressed in the flat
// ("uber") particle buffer. Uses the per-buffer runsum to locate the user
// buffer the particle belongs to, then searches that buffer's filter pair
// list with the particle index rebased to the buffer-local range.
static __device__ bool find(PxgParticleSystem& particleSystem, PxU64 rigidId, PxU64 compressedParticleId)
{
// this is the ID into the flat uber buffer
PxU32 particleId = PxGetParticleIndex(compressedParticleId);
PxU32 id = findRange(particleId, particleSystem.mParticleBufferRunsum, particleSystem.mCommonData.mNumParticleBuffers);
PxgParticleSimBuffer& particleBuffer = particleSystem.mParticleSimBuffers[id];
// Rebase the flat index to the owning buffer's local particle range.
return find(particleBuffer.mFilterPairs, particleBuffer.mNumFilterPairs, rigidId, particleId - particleSystem.mParticleBufferRunsum[id]);
}
// Binary search for 'pair' in the sorted array 'pairs'.
// Returns true on an exact match, or - when ignoreIndex2 is set - as soon as
// a probed element matches on (index0, index1) regardless of index2.
// Precondition: 'pairs' is sorted ascending with respect to compare().
static __device__ bool find(const PxgNonRigidFilterPair* pairs, const PxU32 nbPairs, PxgNonRigidFilterPair& pair, bool ignoreIndex2)
{
	//Binary search...
	PxU32 l = 0, r = nbPairs;
	while (l < r)
	{
		// Overflow-safe midpoint: (l + r) / 2 could wrap for very large arrays.
		const PxU32 mid = l + (r - l) / 2;
		const PxgNonRigidFilterPair& p = pairs[mid];
		const PxI32 cmp = pair.compare(p);
		if (cmp == 0 || (ignoreIndex2 && pair.index0 == p.index0 && pair.index1 == p.index1))
			return true;
		if (cmp < 0)
			r = mid;
		else
			l = mid + 1;
	}
	return false;
}
// Convenience wrapper: builds a non-rigid filter key from the two compressed
// ids (index2 zeroed) and searches ignoring index2, so any entry for this
// id combination matches.
static __device__ bool find(const PxgNonRigidFilterPair* pairs, const PxU32 nbPairs, const PxU32 compressedId0, const PxU32 compressedId1)
{
PxgNonRigidFilterPair pair;
pair.index0 = compressedId0;
pair.index1 = compressedId1;
pair.index2 = 0;
return find(pairs, nbPairs, pair, true);
}
// Convenience wrapper: builds a non-rigid filter key including the particle
// user buffer id in index2 and searches for an exact three-way match.
static __device__ bool find(const PxgNonRigidFilterPair* pairs, const PxU32 nbPairs, const PxU32 compressedId0, const PxU32 compressedId1, PxU32 uniqueParticleUserBufferId)
{
PxgNonRigidFilterPair pair;
pair.index0 = compressedId0;
pair.index1 = compressedId1;
pair.index2 = uniqueParticleUserBufferId;
return find(pairs, nbPairs, pair, false);
}
// Non-rigid filter lookup for a particle addressed in the flat ("uber")
// particle buffer: resolves the user buffer the particle belongs to via the
// runsum, then searches for an exact match including that buffer's unique id.
static __device__ bool find(const PxgParticleSystem& particleSystem, const PxgNonRigidFilterPair* pairs, const PxU32 nbPairs, const PxU32 compressedParticleId, const PxU32 compressedId1)
{
PxU32 particleId = PxGetParticleIndex(compressedParticleId);
PxU32 id = findRange(particleId, particleSystem.mParticleBufferRunsum, particleSystem.mCommonData.mNumParticleBuffers);
PxgParticleSimBuffer& particleBuffer = particleSystem.mParticleSimBuffers[id];
return find(pairs, nbPairs, compressedParticleId, compressedId1, particleBuffer.mUniqueId);
}
#endif // __DEFORMABLE_ELEMENT_FILTER_CUH__

View File

@@ -0,0 +1,90 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __DIS_SEG_SEG_CUH__
#define __DIS_SEG_SEG_CUH__
// Clamped closest-point parameters between segments p1 + s*d1 and p2 + t*d2,
// with s, t restricted to [0, 1]. Despite the historic name, no squared
// distance is returned - callers derive it from s and t.
__device__ __forceinline__ static
void distanceSegmentSegmentSquared(
	const PxVec3& p1, const PxVec3& d1,
	const PxVec3& p2, const PxVec3& d2,
	PxReal& s, PxReal& t)
{
	const PxReal eps = 1e-6f;

	const PxVec3 r = p1 - p2;
	const PxReal a = d1.dot(d1); // squared length of segment 1
	const PxReal e = d2.dot(d2); // squared length of segment 2
	const PxReal b = d1.dot(d2);
	const PxReal c = d1.dot(r);

	// Guard against degenerate (zero-length) segments: a zero reciprocal
	// forces the corresponding parameter toward the clamped origin.
	const PxReal aRecip = a > eps ? (1.f / a) : 0.f;
	const PxReal eRecip = e > eps ? (1.f / e) : 0.f;
	const PxReal f = d2.dot(r);

	// General (non-degenerate) case: closest point on segment 1 for the
	// unclamped line/line problem, clamped to the segment.
	const PxReal denom = a * e - b * b;
	const PxReal temp = b * f - c * e;
	const PxReal s0 = PxClamp(temp / denom, 0.f, 1.f);

	// Nearly parallel segments (denom < eps): fall back to the midpoint of
	// segment 1.
	const PxReal sTmp = eps > denom ? 0.5f : s0;

	// Closest point on segment 2 for that choice of s, clamped to [0, 1].
	const PxReal tTmp = (b * sTmp + f) * eRecip;
	const PxReal t2 = PxClamp(tTmp, 0.f, 1.f);

	// Recompute s for the clamped t and clamp once more.
	const PxReal comp = (b * t2 - c) * aRecip;
	const PxReal s2 = PxClamp(comp, 0.f, 1.f);

	s = s2;
	t = t2;
}
#endif

View File

@@ -0,0 +1,530 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __EPA_CUH__
#define __EPA_CUH__
#include <assert.h>
#include "nputils.cuh"
#include "schlockShared.h"
namespace squawk
{
using namespace physx;
// Scratch storage for the warp-parallel EPA (expanding polytope algorithm);
// all arrays are sized by schlock::EPA_MAX_VERTS. Presumably placed in
// shared memory by the caller - confirm at the kernel that instantiates it.
struct EpaScratch
{
PxVec3 vertices[schlock::EPA_MAX_VERTS];	// polytope vertices (support points)
PxVec3 bestDir;								// best separating direction found so far
PxReal bestDist;							// distance associated with bestDir
PxU16 indices[schlock::EPA_MAX_VERTS];		// packed support indices per vertex - TODO confirm encoding
PxU8 loop[schlock::EPA_MAX_VERTS];			// flattened silhouette-loop vertex list (see replaceTriangle)
PxU8 edges[schlock::EPA_MAX_VERTS];			// per-loop-vertex adjacency/writeback slot for new cone triangles
PxU8 next[schlock::EPA_MAX_VERTS];			// linked-list "next" pointers used to build the silhouette loop
};
// Topology tables for seeding EPA. The "4" tables describe the four faces of
// an initial tetrahedron (per-face vertex triples and the indices of the
// three adjacent faces); the "3" tables describe the two opposing faces of a
// degenerate (triangle-only) start, each adjacent to the other on all edges.
__constant__ __device__ schlock::Vec3I8 sStartVertices4[4] = { MAKE_Vec3I8(0,1,2), MAKE_Vec3I8(3,1,0), MAKE_Vec3I8(3,2,1), MAKE_Vec3I8(3,0,2) };
__constant__ __device__ schlock::Vec3I8 sStartAdjacencies4[4] = { MAKE_Vec3I8(2,3,1), MAKE_Vec3I8(0,3,2), MAKE_Vec3I8(0,1,3), MAKE_Vec3I8(0,2,1) };
__constant__ __device__ schlock::Vec3I8 sStartVertices3[2] = { MAKE_Vec3I8(0,1,2), MAKE_Vec3I8(1,0,2) };
__constant__ __device__ schlock::Vec3I8 sStartAdjacency3[2] = { MAKE_Vec3I8(1,1,1), MAKE_Vec3I8(0,0,0) };
// Lane/vertex bitmap type used by the EPA routines below.
typedef int VMapType;
__device__ inline bool replaceTriangle(EpaScratch& epaS, int nbVertices, int triIndex,
PxReal lowerBound, PxVec3& normal, PxReal& distance,
schlock::Vec3I8& v, schlock::Vec3I8& a, int& liveB, const PxReal releps)
{
// to make it easier to keep track, we use a 'B' suffix for a bitmap where each bit represents a lane (and therefore a triangle)
int tI = threadIdx.x, laneB = 1<<tI;
//PxVec3 apex = ldS(S.vertices[nbVertices-1]);
PxVec3 apex = epaS.vertices[nbVertices - 1];
// triangles that are silhouetted by the new vertex are threatened, but not yet killed
bool threatened = liveB&laneB && (normal.dot(apex) - distance >= /*-*/releps);
schlock::EPABitMap threatenedB = __ballot_sync(FULL_MASK, threatened);
if (threatenedB == 0 || threatenedB == liveB || !(threatenedB & (1<<triIndex)))
{
#if EPA_LOG_DEGENERATE_CASES
if(threadIdx.x == 0)
printf("degenrate: threatenedB %d liveB %d < tri %d \n", threatenedB, liveB, tri);
#endif
return false;
}
// the threatened set is not guaranteed connected, so flood fill to find killB, a bitmap of tris to remove
schlock::EPABitMap b0B = 1<<a.get(0), b1B = 1<<a.get(1), b2B = 1<<a.get(2);
// nbr = threatened neighbors of threatened triangles
schlock::EPABitMap killB = 1<<triIndex, nbr = (b0B | b1B | b2B) & (threatened?threatenedB:0);
// flood the threatened triangles to find the killed ones
//for(schlock::EPABitMap pB = 0; pB != killB; pB = killB, killB |= __ballot(nbr&killB));
for (schlock::EPABitMap pB = 0; pB != killB;)
{
pB = killB;
killB |= __ballot_sync(FULL_MASK, nbr&killB);
}
//every triangles aren't alive and the triangles we want to kill in this iteration
schlock::EPABitMap deadB = (~liveB) | killB;
// d0, d1, d2 denote for each triangle whether the corresponding edge is a silhouette edge
const int alive = laneB&(~deadB);
const int d0 = alive && (b0B&deadB);
const int d1 = alive && (b1B&deadB);
const int d2 = alive && (b2B&deadB);
// write the silhouette edges into the loop buffer
int v0 = v.get(0), v1 = v.get(1), v2 = v.get(2), z = 0;
assert(!alive || v0 < schlock::EPA_MAX_VERTS);
assert(!alive || v1 < schlock::EPA_MAX_VERTS);
assert(!alive || v2 < schlock::EPA_MAX_VERTS);
// S.edges is the adjacency for the new cone tri on this edge, and
// also where the cone triangle will write back its index for later fixup
if (d0)
z = v2, epaS.next[v2] = v1, epaS.edges[v2] = tI;
if (d1)
z = v0, epaS.next[v0] = v2, epaS.edges[v0] = tI;
if (d2)
z = v1, epaS.next[v1] = v0, epaS.edges[v1] = tI;
int d0d1d2B = __ballot_sync(FULL_MASK, d0|d1|d2);
if (!d0d1d2B)
{
#if EPA_LOG_DEGENERATE_CASES
if (threadIdx.x == 0)
printf("degenrate: d0d1d2B %d b0B %d b1B %d b2B %d deadB %d \n", d0d1d2B, b0B, b1B, b2B, deadB);
#endif
return false;
}
int loopStart = __shfl_sync(FULL_MASK, z, lowestSetIndex(d0d1d2B)); // pick a loop vert from the lowest indexed triangle that has one
int nbLoop = 0;
__syncwarp();
if(tI == 0)
{
assert(loopStart < schlock::EPA_MAX_VERTS);
epaS.loop[nbLoop++] = loopStart; // flatten the linked list into a vertex array
for(int vertex = epaS.next[loopStart]; vertex!=loopStart; vertex = epaS.next[vertex])
{
if(vertex >= schlock::EPA_MAX_VERTS)
{
#if EPA_LOG_DEGENERATE_CASES
printf("degenrate: vertex %d\n", vertex);
#endif
nbLoop = -1;
break;
}
if(nbLoop >= schlock::EPA_MAX_VERTS - 1)
{
#if EPA_LOG_DEGENERATE_CASES
printf("degenrate: nbLoop %d\n", nbLoop);
#endif
nbLoop = -1;
break;
}
epaS.loop[nbLoop++] = vertex;
}
#if EPA_REPLACE_TRIANGLE_LOGGING
for(int i=0;i<nbLoop;i++)
printf("%d, ", S.loop[i]);
printf("(starting at %d)\n",loopStart);
#endif
}
nbLoop = __shfl_sync(FULL_MASK, nbLoop, 0);
if(nbLoop != __popc(__ballot_sync(FULL_MASK, d0)) + __popc(__ballot_sync(FULL_MASK, d1)) + __popc(__ballot_sync(FULL_MASK, d2)))
{
#if EPA_LOG_DEGENERATE_CASES
{
printf("nbLoop %d __ballot(d0) %d __ballot(d1) %d __ballot(d2) %d tid %d", nbLoop, __ballot(d0), __ballot(d1), __ballot(d2), threadIdx.x);
}
#endif
return false;
}
__syncwarp();
// make the cone with the nbLoop least non-survivors
int coneIndex = __popc(deadB&(laneB-1)), coneB = __ballot_sync(FULL_MASK, deadB&laneB && coneIndex < nbLoop);
bool fail = false;
if(coneB & laneB)
{
assert(coneIndex < nbLoop);
assert(nbLoop < schlock::EPA_MAX_VERTS);
int v0 = nbVertices-1, v1 = epaS.loop[coneIndex], v2 = epaS.loop[coneIndex+1 == nbLoop ? 0 : coneIndex+1];
v.setInt(MAKE_Vec3I8(v0, v1, v2));
assert(v0 < schlock::EPA_MAX_VERTS);
assert(v1 < schlock::EPA_MAX_VERTS);
assert(v2 < schlock::EPA_MAX_VERTS);
a.set(0, epaS.edges[v1]);
a.set(1, lowestSetIndex(coneB & ~(coneIndex+1 == nbLoop ? 0 : 2*laneB-1))); // previous bit in the active mask, wrapping around
a.set(2, highestSetIndex(coneB & ((coneIndex==0 ? 0 : laneB)-1))); // next bit in the active mask, wrapping around
epaS.edges[v1] = tI; // write our index back into the buffer for later fixup
//PxVec3 p0 = ldS(S.vertices[v0]), p1 = ldS(S.vertices[v1]), p2 = ldS(S.vertices[v2]);
PxVec3 p0 = epaS.vertices[v0], p1 = epaS.vertices[v1], p2 = epaS.vertices[v2];
PxVec3 e01 = p1 - p0, e02 = p2 - p0;
PxVec3 n;
if (e01.magnitudeSquared() < e02.magnitudeSquared())
{
n = e01.cross(p2 - p1);
}
else
{
n = e02.cross(p2 - p1);
}
PxReal nm = n.magnitude();
normal = n/nm;
distance = normal.dot(p0);
/*fail = nm == 0 || (lowerBound > 0.0f && distance < lowerBound);
if (fail)
{
printf("fail nm %f\ lowerBound %f distance %f \n", nm, lowerBound, distance);
}*/
fail = nm == 0 || (lowerBound > 0.0f && (distance - lowerBound) < -1e-4f);
}
__syncwarp();
// ... to fix up the silhouette adjacency with the new tris
if(d0) a.set(0, epaS.edges[v.get(2)]);
if(d1) a.set(1, epaS.edges[v.get(0)]);
if(d2) a.set(2, epaS.edges[v.get(1)]);
liveB = (liveB & ~deadB) | coneB;
return __ballot_sync(FULL_MASK, fail) == 0;
}
// Build the initial EPA polytope from a full GJK termination tetrahedron (data->size == 4).
// Runs warp-wide: the cached GJK data is reinterpreted as 32 floats, one per lane.
// Outputs: per-lane triangle plane (normal, distance) for the 4 start faces (lanes 0..3),
// plus the static adjacency/vertex index tables.
__device__ inline void epaStart4(EpaScratch& S,
const schlock::GjkCachedData* data,
schlock::Vec3I8& a, schlock::Vec3I8& v,
PxVec3& normal,
PxReal& distance)
{
int tI = threadIdx.x;
// lane tI loads the tI-th float of the cached GJK data
PxReal input = reinterpret_cast<const PxReal*>(data)[tI];
if(tI<16)
{
// lanes 0..15 hold four float4 entries: components 0..2 are a simplex vertex,
// component 3 carries the packed support indices (bit-cast through float)
if((tI&3)<3)
// scatter xyz into the tightly packed PxVec3 vertex array (drop every 4th float)
(reinterpret_cast<volatile PxReal*>(S.vertices))[tI-tI/4] = input;
else
S.indices[tI>>2] = __float_as_int(input);
}
// lanes 16..31 presumably hold the four start-face planes (normal.xyz, distance) --
// TODO confirm against GjkCachedData's layout. Lanes 0..3 gather one plane each via
// shuffles and renormalize it with the broadcast reciprocal length m.
PxReal d = dot(input, input), m = __shfl_sync(FULL_MASK, 1/sqrt(d), tI*4+16);
normal.x = __shfl_sync(FULL_MASK, input, tI*4+16) * m;
normal.y = __shfl_sync(FULL_MASK, input, tI*4+17) * m;
normal.z = __shfl_sync(FULL_MASK, input, tI*4+18) * m;
distance = __shfl_sync(FULL_MASK, input, tI*4+19) * m;
// only lanes 0..3 own one of the 4 tetrahedron faces
if(tI<4)
a = sStartAdjacencies4[tI], v = sStartVertices4[tI];
}
// Warp-cooperative support mapping: returns the index of the vertex in v[0..nbVertices)
// with the greatest projection onto dir. All 32 lanes must call this together; every
// lane returns the same index.
__device__ inline PxU8 supportIndex(const PxVec3* v, PxU8 nbVertices, const PxVec3& dir)
{
	int bestIndex = 0xFFFFFFFF;
	PxReal bestProj = -FLT_MAX;
	// process the vertices 32 at a time, one vertex per lane
	for(PxU32 base = 0; base < nbVertices; base += 32)
	{
		const int lane = base + threadIdx.x;
		// lanes past the end contribute -FLT_MAX so they never win
		const PxReal proj = lane < nbVertices ? dir.dot(v[lane]) : -FLT_MAX;
		// butterfly max-reduction: after the loop every lane holds the batch maximum
		PxReal batchMax = proj;
		for(int offset = 16; offset >= 1; offset >>= 1)
			batchMax = fmaxf(batchMax, __shfl_xor_sync(FULL_MASK, batchMax, offset));
		if(batchMax > bestProj)
		{
			bestProj = batchMax;
			// lowest lane whose projection equals the maximum wins
			bestIndex = base + __ffs(__ballot_sync(FULL_MASK, batchMax == proj)) - 1;
		}
	}
	assert(bestIndex < nbVertices);
	return PxU8(bestIndex);
}
// find a new support point in either the positive or negative direction furthest from the reference point
// we could be a bit more efficient here but this is a super-rare case
// Lane 0 writes the winning Minkowski-difference vertex and its packed (B<<8 | A) hull
// indices into the (volatile, shared) outputs; all lanes participate in the support queries.
__device__ inline void newPoint(const PxVec3* vA, PxU32 nbA, const PxVec3* vB, PxU32 nbB, const PxVec3& dir, const PxVec3& ref, volatile PxVec3& vertex, volatile PxU16& indices)
{
PxU8 indexAp = supportIndex(vA, nbA, dir), indexBp = supportIndex(vB, nbB, -dir); // support point in +ve dir
PxU8 indexAn = supportIndex(vA, nbA, -dir), indexBn = supportIndex(vB, nbB, dir); // support point in -ve dir
// candidate Minkowski-difference (A - B) vertices for both directions
PxVec3 vp = vA[indexAp] - vB[indexBp], vn = vA[indexAn] - vB[indexBn];
if(threadIdx.x == 0)
{
// keep whichever candidate is furthest from the reference point along dir
if(PxAbs((vp-ref).dot(dir)) > PxAbs((vn-ref).dot(dir)))
{
stS(vertex, vp);
indices = indexBp<<8 | indexAp;
}
else
{
stS(vertex, vn);
indices = indexBn<<8 | indexAn;
}
}
}
// Build the initial EPA polytope from a degenerate GJK simplex of 1, 2 or 3 vertices:
// synthesize the missing vertices with support queries, then treat the result as a
// double-sided triangle (lanes 0 and 1 own the two oppositely oriented faces).
// warpMask is the mask of lanes executing this path (see caller's ballot).
__device__ inline void epaStart123(volatile EpaScratch& S,
const schlock::GjkCachedData* data,
const PxVec3* vA, PxU32 nbA,
const PxVec3* vB, PxU32 nbB,
schlock::Vec3I8& a, schlock::Vec3I8& v,
PxVec3& normal,
PxReal& distance,
PxU32 warpMask)
{
int tI = threadIdx.x;
// lane tI loads the tI-th float of the cached GJK data
PxReal input = reinterpret_cast<const PxReal*>(data)[tI];
if(tI<16)
{
// lanes 0..15 scatter the cached float4s: xyz -> packed vertex array, w -> packed indices
if((tI&3)<3)
(reinterpret_cast<volatile PxReal*>(S.vertices))[tI-tI/4] = input;
else
S.indices[tI>>2] = __float_as_int(input);
}
__syncwarp(warpMask); //S.vertices is written above and read below
PxVec3 vertex0 = ldS(S.vertices[0]);
// single-point simplex: fabricate a second vertex along an arbitrary axis
if(data->size<2)
newPoint(vA, nbA, vB, nbB, PxVec3(1,0,0), ldS(S.vertices[0]), S.vertices[1], S.indices[1]);
__syncwarp(warpMask); //S.vertices is written (inside newPoint) above and read below
PxVec3 vertex1 = ldS(S.vertices[1]);
// segment simplex: fabricate a third vertex perpendicular to the segment
if(data->size<3)
{
PxVec3 dir = vertex1 - vertex0, s = (PxAbs(dir.x) < PxAbs(dir.y) ? PxVec3(1,0,0) : PxVec3(0,1,0)).cross(dir);
newPoint(vA, nbA, vB, nbB, s, vertex0, S.vertices[2], S.indices[2]);
}
__syncwarp(warpMask); //S.vertices is written (inside newPoint) above and read below
PxVec3 vertex2 = ldS(S.vertices[2]);
// triangle plane; pick the cross product built from the shorter edge for better conditioning
PxVec3 e01 = vertex1 - vertex0, e12 = vertex2 - vertex1, e02 = vertex2 - vertex0;
if (e01.magnitude()<=e02.magnitude() )
{
normal = e01.cross(e12).getNormalized();
}
else
{
normal = e02.cross(e12).getNormalized();
}
distance = normal.dot(vertex0);
// lanes 0 and 1 own the front- and back-facing copies of the start triangle
if(tI < 2)
v = sStartVertices3[tI], a = sStartAdjacency3[tI];
if(tI==1)
normal = -normal, distance = -distance;
}
// Warp-wide EPA (Expanding Polytope Algorithm): computes penetration depth and witness
// points for two overlapping convex point clouds, starting from the simplex GJK ended with.
// One polytope triangle per lane; 'liveTriangles' is the bitmap of lanes whose triangle is
// currently on the polytope surface. The loop repeatedly expands the face nearest the
// origin until [lowerBound, upperBound] is bracketed within releps, expansion degenerates,
// or the vertex budget (EPA_MAX_VERTS) is exhausted.
// Returns eCLOSE when lowerBound < 0, eOVERLAP otherwise; output.degenerate flags early exit.
__device__ inline schlock::GjkResult::Enum epa(EpaScratch& epaS,
const PxVec3* vA, PxU32 nbA,
const PxVec3* vB, PxU32 nbB,
const schlock::GjkCachedData* input,
PxReal convergenceRatio,
PxReal relEpsBase,
schlock::GjkOutput& output)
{
// termination tolerance: the tighter GJK converged, the tighter EPA must bracket
PxReal releps = relEpsBase * (1-convergenceRatio);
schlock::Vec3I8 a, v; // per-triangle adjacencies and vertex indices
PxVec3 normal; // per-triangle normal
PxReal distance; // per-triangle distance
schlock::Vec3I8 bestVerts; // vertex indices of the best candidate triangle so far
PxReal lowerBound = -PX_MAX_F32, upperBound = PX_MAX_F32;
int nbVertices, liveTriangles;
bool b = input->size == 4;
PxU32 warpMask = __ballot_sync(FULL_MASK, !b);
if(b)
{
epaStart4(epaS, input, a, v, normal, distance);
nbVertices = 4;
//This is a tetrahedron so there are 4 triangles, hence liveTriangles = 0xf (1111)
liveTriangles = 0xf;
}
else
{
epaStart123(epaS, input, vA, nbA, vB, nbB, a, v, normal, distance, warpMask);
nbVertices = 3;
//This is a double-sided triangle so there are 2 triangles, hence liveTriangles = 0x3 (11)
liveTriangles = 0x3;
}
__syncwarp();
// TODO: check here for vertex count is slighly conservative
bool nondegenerate = true;
while (nondegenerate && nbVertices < schlock::EPA_MAX_VERTS)
{
// dead triangles contribute FLT_MAX so they never win the min-reduction
PxReal t = liveTriangles & 1 << threadIdx.x ? distance : FLT_MAX, e;
//get the shortest distance triangle (warp-wide butterfly min-reduction)
e = fminf(t, __shfl_xor_sync(FULL_MASK, t, 16));
e = fminf(e, __shfl_xor_sync(FULL_MASK, e, 8));
e = fminf(e, __shfl_xor_sync(FULL_MASK, e, 4));
e = fminf(e, __shfl_xor_sync(FULL_MASK, e, 2));
e = fminf(e, __shfl_xor_sync(FULL_MASK, e, 1));
//try to find out which triangle index has the shortest distance
int f = __ballot_sync(FULL_MASK, e == t);
assert(f); //Trigger assert if e == t failed on all lanes (indicates a nan)
int triIndex = lowestSetIndex(f);
//get the normal from the shortest distance triangle
PxVec3 d(__shfl_sync(FULL_MASK, normal.x, triIndex), __shfl_sync(FULL_MASK, normal.y, triIndex), __shfl_sync(FULL_MASK, normal.z, triIndex));
//get the support vertex of the Minkowski difference along that normal
PxU8 indexA = squawk::supportIndex(vA, nbA, d), indexB = squawk::supportIndex(vB, nbB, -d);
PxVec3 supportVertex = vA[indexA] - vB[indexB];
//calculate the distance from the origin to the support vertex
PxReal thisFrameUb = supportVertex.dot(d);
//found the MTD, epa terminate
if (abs(thisFrameUb - e) <= releps)
{
upperBound = thisFrameUb, lowerBound = e;
bestVerts.setInt(__shfl_sync(FULL_MASK, v.getInt(), triIndex));
if (threadIdx.x == 0)
epaS.bestDir = d;
break;
}
// all remaining tris are worse than the best candidate, epa terminate
if (e > upperBound)
break;
// tighten the bracket if this face improved the upper bound
PxReal ub = PxMin(upperBound, thisFrameUb);
if (ub < upperBound)
{
upperBound = ub, lowerBound = e;
bestVerts.setInt(__shfl_sync(FULL_MASK, v.getInt(), triIndex));
if (threadIdx.x == 0)
epaS.bestDir = d;
}
#if EPA_LOGGING
if (threadIdx.x == 0)
printf("[%f, %f], (%f, %f, %f)\n", lowerBound, upperBound, supportVertex.x, supportVertex.y, supportVertex.z);
#endif
// save off the new vertex
if (threadIdx.x == 0)
{
epaS.vertices[nbVertices] = supportVertex;
epaS.indices[nbVertices] = PxU16(indexB) << 8 | PxU16(indexA);
}
__syncwarp();
// split the silhouette horizon of the new vertex and re-triangulate the polytope
nondegenerate = replaceTriangle(epaS, ++nbVertices, triIndex, e, normal, distance, v, a, liveTriangles, releps);
}
#if EPA_LOGGING
if(!nondegenerate)
printf("degenerate\n");
#endif
output.degenerate = !nondegenerate;
__syncwarp();
schlock::GjkResult::Enum result;
PxVec3 bestDir = epaS.bestDir;
PxVec3 closestPoint = bestDir * lowerBound;
if(threadIdx.x == 0)
{
// lane 0 reconstructs the witness points on A and B from the best triangle's
// barycentric coordinates of the closest point
output.direction = bestDir;
output.lowerBound = lowerBound;
output.upperBound = upperBound;
int v0 = bestVerts.get(0), v1 = bestVerts.get(1), v2 = bestVerts.get(2);
PxReal u, v, w;
schlock::barycentrics(epaS.vertices[v0], epaS.vertices[v1], epaS.vertices[v2], u, v, w, closestPoint);
PxVec3 cpA = vA[schlock::aIndex(epaS.indices[v0])] * u + vA[schlock::aIndex(epaS.indices[v1])] * v + vA[schlock::aIndex(epaS.indices[v2])] * w;
output.closestPointA = cpA;
output.closestPointB = cpA - closestPoint;
}
result = lowerBound < 0 ? schlock::GjkResult::eCLOSE : schlock::GjkResult::eOVERLAP;
return result;
}
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,397 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "foundation/PxBounds3.h"
#include "foundation/PxSimpleTypes.h"
#include "foundation/PxTransform.h"
#include "foundation/PxVec3.h"
#include "geometry/PxGeometry.h"
#include "geometry/PxHeightFieldSample.h"
#include "PxgContactManager.h"
#include "PxgConvexConvexShape.h"
#include "PxgFEMCloth.h"
#include "PxgNpKernelIndices.h"
#include "PxgSoftBodyCoreKernelIndices.h"
#include "PxsTransformCache.h"
#include <vector_types.h>
#include "PxgCommonDefines.h"
#include "dataReadWriteHelper.cuh"
#include "heightfieldUtil.cuh"
#include "utils.cuh"
#include "reduction.cuh"
#include "deformableCollision.cuh"
#include "assert.h"
using namespace physx;
extern "C" __host__ void initNarrowphaseKernels20() {}
// Scratch data for cloth-vs-heightfield midphase, placed in shared memory by the
// launching kernels: caches the heightfield's world transform and shape descriptor
// so every lane can read them without re-fetching from global memory.
struct clothHeightfieldScratch
{
/*const float4 * PX_RESTRICT trimeshVerts;
const uint4 * PX_RESTRICT trimeshTriVertIndices;
PxU32 numTriangles;*/
PxTransform heightfieldTransform;
PxgShape heightfieldShape;
}; PX_COMPILE_TIME_ASSERT(sizeof(clothHeightfieldScratch) <= 1024);
// Warp-cooperative midphase for one cloth primitive bound vs. a heightfield:
// clips the world-space bound into heightfield cell space, counts the overlapped
// sample cells, and appends (cmIdx, triangleIdx, cell) pairs to the global stack.
// All 32 lanes must enter together (warp-wide scan/shuffle/atomics below).
//
// scratch          per-warp shared scratch holding the heightfield transform/shape
// cmIdx            contact-manager (pair) index
// triangleBound    world-space bound of this lane's primitive, pre-fattened by contact distance
// triangleIdx      index of this lane's primitive; lanes with triangleIdx >= numTriangles are idle
// workIndex        caller's work item index (currently unused; kept for interface stability)
// numTriangles     number of valid primitives
// stackPtr         output: uint4 pair stack (shared across the dispatch)
// midphasePairsNum output: global pair counter, advanced atomically
// stackSize        stack capacity in uint4 elements
__device__ static inline void clothHeightfieldMidphaseCore(
	clothHeightfieldScratch* scratch,
	const PxU32 cmIdx,
	const PxBounds3 triangleBound,
	const PxU32 triangleIdx,
	const PxU32 workIndex,
	const PxU32 numTriangles,
	PxU8* PX_RESTRICT stackPtr, //output
	PxU32* PX_RESTRICT midphasePairsNum, //output
	const PxU32 stackSize
)
{
	PxU32 nbPairs = 0;
	PxBounds3 localBound;
	PxU32 nbRows = 0;
	PxU32 nbCols = 0;
	PxHeightFieldSample* samples = NULL;
	PxU32 minRow = 0;
	PxU32 maxRow = 0;
	PxU32 minColumn = 0;
	PxU32 maxColumn = 0;

	if (triangleIdx < numTriangles)
	{
		PxTransform heightfieldTransform = scratch->heightfieldTransform;
		PxgShape heightfieldShape = scratch->heightfieldShape;

		const PxReal oneOverHeightScale = 1.f / heightfieldShape.scale.scale.y;
		const PxReal oneOverRowScale = 1.f / PxAbs(heightfieldShape.scale.scale.x);
		const PxReal oneOverColScale = 1.f / PxAbs(heightfieldShape.scale.scale.z);

		// bound is in world space; transform it into the heightfield's local cell space
		localBound = PxBounds3::transformFast(heightfieldTransform.getInverse(), triangleBound);
		localBound.minimum.x *= oneOverRowScale;
		localBound.minimum.y *= oneOverHeightScale;
		localBound.minimum.z *= oneOverColScale;
		localBound.maximum.x *= oneOverRowScale;
		localBound.maximum.y *= oneOverHeightScale;
		localBound.maximum.z *= oneOverColScale;

		// a negative row scale mirrors the row axis: swap min and max
		if (heightfieldShape.scale.scale.x < 0.f)
		{
			const PxReal temp = localBound.minimum.x;
			localBound.minimum.x = localBound.maximum.x;
			localBound.maximum.x = temp;
		}
		// likewise for a negative column scale
		if (heightfieldShape.scale.scale.z < 0.f)
		{
			PxReal swap = localBound.minimum.z;
			localBound.minimum.z = localBound.maximum.z;
			localBound.maximum.z = swap;
		}

		// heightfield blob layout: [nbRows, nbCols, samples...]
		PxU32* heightfieldData = reinterpret_cast<PxU32*>(heightfieldShape.hullOrMeshPtr);
		nbRows = heightfieldData[0];
		nbCols = heightfieldData[1];
		samples = reinterpret_cast<PxHeightFieldSample*>(&heightfieldData[2]);

		// Cull primitives whose bound lies entirely outside the heightfield's row/column
		// range. FIX: the '!' must negate the whole disjunction -- previously it applied
		// to the first clause only, so the cull was skipped (and pairs were counted)
		// whenever any of the other three "outside" conditions held.
		if (!((localBound.minimum.x > nbRows - 1) || (localBound.minimum.z > nbCols - 1)
			|| (localBound.maximum.x < 0) || (localBound.maximum.z < 0)))
		{
			minRow = getMinRow(localBound.minimum.x, nbRows);
			maxRow = getMaxRow(localBound.maximum.x, nbRows);
			minColumn = getMinColumn(localBound.minimum.z, nbCols);
			maxColumn = getMaxColumn(localBound.maximum.z, nbCols);

			// 2 triangles per cell; only count if the covered cell range is non-empty
			if ((2 * (maxColumn - minColumn) * (maxRow - minRow)) > 0)
			{
				nbPairs = heightfieldComputePairs(minColumn, maxColumn, minRow, maxRow, nbRows, nbCols,
					samples, localBound.minimum.y, localBound.maximum.y);
			}
		}
	}

	// exclusive prefix sum of per-lane pair counts = this lane's slot within the warp's batch
	PxU32 pairOffset = warpScan<AddOpPxU32, PxU32>(FULL_MASK, nbPairs) - nbPairs;

	// last lane holds the warp total; broadcast it to everyone
	PxU32 validCount = 0;
	if (threadIdx.x == (WARP_SIZE - 1))
	{
		validCount = pairOffset + nbPairs;
	}
	validCount = __shfl_sync(FULL_MASK, validCount, WARP_SIZE - 1);

	// lane 0 reserves a contiguous range in the global stack, then broadcasts the base index
	PxU32 startIndex = 0xffffffff;
	if (threadIdx.x == 0 && validCount > 0)
	{
		startIndex = atomicAdd(midphasePairsNum, validCount);
	}
	startIndex = __shfl_sync(FULL_MASK, startIndex, 0);

	uint4* tStackPtr = reinterpret_cast<uint4*>(stackPtr);
	if (nbPairs != 0)
	{
		heightfieldOutputPairs(minColumn, maxColumn, minRow, maxRow, nbRows, nbCols,
			samples, localBound.minimum.y, localBound.maximum.y, cmIdx, triangleIdx, startIndex + pairOffset, stackSize, tStackPtr);
	}
}
// Kernel: cloth-triangle vs. heightfield midphase. Each contact pair is serviced by 64
// virtual warps; each lane builds a fattened world-space bound for one cloth triangle and
// feeds it into clothHeightfieldMidphaseCore, which emits (pair, triangle, cell) candidates.
extern "C" __global__ void cloth_heightfieldMidphaseGeneratePairsLaunch(
const PxU32 numNPWorkItems,
const PxReal toleranceLength,
const PxgContactManagerInput* PX_RESTRICT cmInputs,
const PxsCachedTransform* PX_RESTRICT transformCache,
const PxReal* PX_RESTRICT contactDistance,
const PxgShape* PX_RESTRICT gpuShapes,
PxgFEMCloth* PX_RESTRICT clothes,
PxU8* stackPtr, //output
PxU32* midphasePairsNum, //output
const PxU32 stackSizeBytes
)
{
// NOTE(review): this scratch buffer is shared by all warps of the block, yet each warp
// writes its own heightfield transform/shape below -- looks racy if warps of one block
// map to different contact pairs; confirm the work distribution guarantees they don't.
__shared__ PxU32 scratchMem[1024];
clothHeightfieldScratch* s_warpScratch = reinterpret_cast<clothHeightfieldScratch*>(scratchMem);
const PxU32 nbWarpsPerCm = 64; // 64 warp(2048 threads) to deal with one pair
const PxU32 nbWarpsRequired = nbWarpsPerCm * numNPWorkItems;
const PxU32 nbThreadsPerCm = 64 * WARP_SIZE;
const PxU32 numWarpPerBlock = MIDPHASE_WARPS_PER_BLOCK;
const PxU32 totalNumWarps = PxgSoftBodyKernelGridDim::SB_SBMIDPHASE * numWarpPerBlock;
const PxU32 warpIndex = threadIdx.y;
// evenly split the required warps over the launched warps
const PxU32 nbIterationsPerWarps = (nbWarpsRequired + totalNumWarps - 1) / totalNumWarps;
/*if (warpIndex == 0 && threadIdx.x == 0 && blockIdx.x == 0)
{
printf("nbIterationsPerWarps %i nbWarpsRequired %i\n", nbIterationsPerWarps, nbWarpsRequired);
}*/
for (PxU32 i = 0; i < nbIterationsPerWarps; ++i)
{
const PxU32 workIndex = i + (warpIndex + numWarpPerBlock * blockIdx.x) * nbIterationsPerWarps;
if (workIndex < nbWarpsRequired)
{
unsigned int cmIdx = workIndex / nbWarpsPerCm;
PxgShape clothShape, heightfieldShape;
PxU32 clothCacheRef, heightfieldCacheRef;
LoadShapePair<PxGeometryType::eTRIANGLEMESH, PxGeometryType::eHEIGHTFIELD>(cmInputs, cmIdx, gpuShapes,
clothShape, clothCacheRef, heightfieldShape, heightfieldCacheRef);
PxsCachedTransform heightfieldTransformCache;
PxsCachedTransform_ReadWarp(heightfieldTransformCache, transformCache + heightfieldCacheRef);
const PxgFEMCloth& cloth = clothes[clothShape.particleOrSoftbodyId];
// lane 0 caches the heightfield transform/shape in shared memory for the warp
if (threadIdx.x == 0)
{
s_warpScratch->heightfieldTransform = heightfieldTransformCache.transform;
s_warpScratch->heightfieldShape = heightfieldShape;
}
__syncwarp();
const PxReal contactDist = contactDistance[clothCacheRef] + contactDistance[heightfieldCacheRef];
const PxU32 nbTriangles = cloth.mNbTriangles;// s_warpScratch->numTriangles;
const PxU32 nbIter = (nbTriangles + nbThreadsPerCm - 1) / nbThreadsPerCm;
const uint4* triVertIndices = cloth.mTriangleVertexIndices;// s_warpScratch->trimeshTriVertIndices;
const float4* positions = cloth.mPosition_InvMass;
for (PxU32 j = 0; j < nbIter; ++j)
{
// this warp's slice of the 2048 threads assigned to the pair
const PxU32 wrappedThreadIndex = (workIndex * WARP_SIZE + threadIdx.x) % nbThreadsPerCm;
const PxU32 triangleIdx = wrappedThreadIndex + nbThreadsPerCm * j;
PxBounds3 triangleBound = PxBounds3::empty();
if (triangleIdx < nbTriangles)
{
// cloth positions are stored in world space; build the fattened triangle bound
const uint4 vertIdx = triVertIndices[triangleIdx];
const PxVec3 worldV0 = PxLoad3(positions[vertIdx.x]);
const PxVec3 worldV1 = PxLoad3(positions[vertIdx.y]);
const PxVec3 worldV2 = PxLoad3(positions[vertIdx.z]);
triangleBound = triBoundingBox(worldV0, worldV1, worldV2);
triangleBound.fattenFast(contactDist);
}
clothHeightfieldMidphaseCore(
s_warpScratch,
cmIdx,
triangleBound,
triangleIdx,
workIndex,
nbTriangles,
stackPtr,
midphasePairsNum,
stackSizeBytes/sizeof(uint4)
);
}
}
}
}
// Kernel: cloth-vertex vs. heightfield midphase. Mirrors the triangle variant, but each
// lane tests a sphere bound (vertex position +/- contact distance) against the heightfield
// and emits (pair, vertex, cell) candidates via clothHeightfieldMidphaseCore.
extern "C" __global__ void cloth_midphaseVertexHeightfieldLaunch(
	const PxU32 numNPWorkItems,
	const PxReal toleranceLength,
	const PxgContactManagerInput* PX_RESTRICT cmInputs,
	const PxsCachedTransform* PX_RESTRICT transformCache,
	const PxReal* PX_RESTRICT contactDistance,
	const PxgShape* PX_RESTRICT gpuShapes,
	PxgFEMCloth* PX_RESTRICT clothes,
	PxU8* stackPtr, //output
	PxU32* midphasePairsNum, //output
	const PxU32 stackSizeBytes
)
{
	// NOTE(review): scratch is shared by all warps of the block but written per-warp below;
	// confirm warps of one block cannot map to different contact pairs (see triangle variant).
	__shared__ PxU32 scratchMem[1024];
	clothHeightfieldScratch* s_warpScratch = reinterpret_cast<clothHeightfieldScratch*>(scratchMem);

	const PxU32 nbWarpsPerCm = 64; // 64 warp(2048 threads) to deal with one pair
	const PxU32 nbWarpsRequired = nbWarpsPerCm * numNPWorkItems;
	const PxU32 nbThreadsPerCm = 64 * WARP_SIZE;
	const PxU32 numWarpPerBlock = MIDPHASE_WARPS_PER_BLOCK;
	const PxU32 totalNumWarps = PxgSoftBodyKernelGridDim::SB_SBMIDPHASE * numWarpPerBlock;
	const PxU32 warpIndex = threadIdx.y;
	// evenly split the required warps over the launched warps
	const PxU32 nbIterationsPerWarps = (nbWarpsRequired + totalNumWarps - 1) / totalNumWarps;

	for (PxU32 i = 0; i < nbIterationsPerWarps; ++i)
	{
		const PxU32 workIndex = i + (warpIndex + numWarpPerBlock * blockIdx.x) * nbIterationsPerWarps;
		if (workIndex < nbWarpsRequired)
		{
			unsigned int cmIdx = workIndex / nbWarpsPerCm;
			PxgShape clothShape, heightfieldShape;
			PxU32 clothCacheRef, heightfieldCacheRef;
			LoadShapePairWarp<PxGeometryType::eTRIANGLEMESH, PxGeometryType::eHEIGHTFIELD>(cmInputs, cmIdx, gpuShapes,
				clothShape, clothCacheRef, heightfieldShape, heightfieldCacheRef);

			PxsCachedTransform heightfieldTransformCache;
			PxsCachedTransform_ReadWarp(heightfieldTransformCache, transformCache + heightfieldCacheRef);

			const PxgFEMCloth& cloth = clothes[clothShape.particleOrSoftbodyId];

			// lane 0 caches the heightfield transform/shape in shared memory for the warp
			if (threadIdx.x == 0)
			{
				s_warpScratch->heightfieldTransform = heightfieldTransformCache.transform;
				s_warpScratch->heightfieldShape = heightfieldShape;
			}
			__syncwarp();

			const PxReal contactDist = contactDistance[clothCacheRef] + contactDistance[heightfieldCacheRef];
			const float4* positions = cloth.mPosition_InvMass;
			const PxU32 nbVerts = cloth.mNbVerts;
			const PxU32 nbIter = (nbVerts + nbThreadsPerCm - 1) / nbThreadsPerCm;
			const PxVec3 radiusV(contactDist);

			for (PxU32 j = 0; j < nbIter; ++j)
			{
				const PxU32 wrappedThreadIndex = (workIndex * WARP_SIZE + threadIdx.x) % nbThreadsPerCm;
				const PxU32 vertIndex = wrappedThreadIndex + nbThreadsPerCm * j;

				// FIX: guard the position fetch -- the last iteration has lanes with
				// vertIndex >= nbVerts, and the original code read positions[vertIndex]
				// unconditionally (out-of-bounds global read). Idle lanes keep an empty
				// bound; clothHeightfieldMidphaseCore ignores them via the index check.
				PxBounds3 vertBound = PxBounds3::empty();
				if (vertIndex < nbVerts)
				{
					const float4 pos = positions[vertIndex];
					const PxVec3 particlePos(pos.x, pos.y, pos.z);
					vertBound.minimum = particlePos - radiusV;
					vertBound.maximum = particlePos + radiusV;
				}

				clothHeightfieldMidphaseCore(
					s_warpScratch,
					cmIdx,
					vertBound,
					vertIndex,
					workIndex,
					nbVerts,
					stackPtr,
					midphasePairsNum,
					stackSizeBytes/sizeof(uint4)
				);
			}
		}
	}
}

View File

@@ -0,0 +1,620 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "foundation/PxBounds3.h"
#include "foundation/PxMath.h"
#include "foundation/PxSimpleTypes.h"
#include "foundation/PxTransform.h"
#include "foundation/PxVec3.h"
#include "geometry/PxGeometry.h"
#include "geometry/PxMeshScale.h"
#include "GuBV32.h"
#include "cudaNpCommon.h"
#include "PxgContactManager.h"
#include "PxgConvexConvexShape.h"
#include "PxgFEMCloth.h"
#include "PxgNpKernelIndices.h"
#include "PxgParticleSystem.h"
#include "PxsTransformCache.h"
#include <vector_types.h>
#include "copy.cuh"
#include "dataReadWriteHelper.cuh"
#include "femMidphaseScratch.cuh"
#include "utils.cuh"
#include "bv32Traversal.cuh"
#include "triangleMesh.cuh"
#include "deformableCollision.cuh"
using namespace physx;
extern "C" __host__ void initNarrowphaseKernels18() {}
// BV32 traversal visitor for cloth-vs-rigid midphase: tests cloth mesh triangles against
// a fixed world-space AABB and pushes overlapping (cmIdx, triangleIndex) pairs onto the
// global stack. All lanes of the warp call the intersect* methods together.
// NOTE: mBox is a reference to the caller's AABB -- the traverser must not outlive it
// (it is only used as a temporary inside bv32TreeTraversal).
struct ClothTreeTraverser
{
	femMidphaseScratch* s_warpScratch;
	uint4* PX_RESTRICT stackPtr;
	PxU32* PX_RESTRICT midphasePairsNum;
	unsigned int cmIdx;
	const PxU32 stackSize;
	const PxBounds3& mBox;

	PX_FORCE_INLINE __device__ ClothTreeTraverser(
		femMidphaseScratch* s_warpScratch,
		const PxBounds3& aabb,
		uint4* PX_RESTRICT stackPtr,
		PxU32* PX_RESTRICT midphasePairsNum,
		unsigned int cmIdx,
		const PxU32 stackSize)
		// FIX: the mem-initializer list is kept in member declaration order (members are
		// always initialized in that order regardless of list order; the original list
		// was misleading and triggers -Wreorder). The aabb parameter is also taken by
		// const reference, which is what the const member binding requires.
		: s_warpScratch(s_warpScratch), stackPtr(stackPtr), midphasePairsNum(midphasePairsNum),
		cmIdx(cmIdx), stackSize(stackSize), mBox(aabb)
	{ }

	// Leaf callback: every lane tests one triangle (0xFFFFFFFF = no triangle for this lane)
	// against mBox, then the whole warp compacts and appends the hits to the pair stack.
	PX_FORCE_INLINE __device__ void intersectPrimitiveFullWarp(PxU32 primitiveIndex, PxU32 idxInWarp) const
	{
		bool intersect = false;
		if (primitiveIndex != 0xFFFFFFFF)
		{
			uint4 triIdx = s_warpScratch->meshVertsIndices[primitiveIndex];
			const PxVec3 worldV0 = PxLoad3(s_warpScratch->meshVerts[triIdx.x]);
			const PxVec3 worldV1 = PxLoad3(s_warpScratch->meshVerts[triIdx.y]);
			const PxVec3 worldV2 = PxLoad3(s_warpScratch->meshVerts[triIdx.z]);
			intersect = triBoundingBox(worldV0, worldV1, worldV2).intersects(mBox);
		}
		pushOntoStackFullWarp(intersect, midphasePairsNum, stackPtr, stackSize, make_uint4(cmIdx, primitiveIndex, 0, 0));
	}

	// Internal-node callback: descend into children whose bounds overlap mBox.
	PX_FORCE_INLINE __device__ bool intersectBoxFullWarp(bool hasBox, const PxVec3& min, const PxVec3& max) const
	{
		if (hasBox)
			return mBox.intersects(PxBounds3(min, max));
		else
			return false;
	}
};
// Warp-per-pair midphase core: one warp handles one contact-manager pair, traversing the
// cloth's BV32 tree against the rigid shape's fattened world AABB and pushing overlapping
// (pair, triangle) candidates onto the global stack.
template<unsigned int WarpsPerBlock>
__device__ static inline void femClothMidphaseCore(
PxU32 numNPWorkItems,
const PxReal toleranceLength,
const PxgContactManagerInput* PX_RESTRICT cmInputs,
const PxsCachedTransform* PX_RESTRICT transformCache,
const PxBounds3* PX_RESTRICT bounds,
const PxReal* PX_RESTRICT contactDistance,
const PxgShape* PX_RESTRICT gpuShapes,
const PxgFEMCloth* PX_RESTRICT femClothes,
PxU8* PX_RESTRICT stackPtr, //output
PxU32* PX_RESTRICT midphasePairsNum, //output
femMidphaseScratch* s_warpScratch,
const PxU32 stackSize
)
{
//warp index within the block
const unsigned int warpIdx = threadIdx.y;//(idx >> LOG2_WARP_SIZE);
//wrap index in block
//const unsigned int idx = idxInWarp + warpIdx * WARP_SIZE;
// one warp per contact pair
unsigned int cmIdx = warpIdx + blockIdx.x * blockDim.y;
//unsigned mask_cmIdx = __ballot_sync(FULL_MASK, cmIdx < numNPWorkItems);
if (cmIdx < numNPWorkItems)
{
/* if(threadIdx.x == 0)
printf("cmIdx %i warpIdx %i\n", cmIdx, warpIdx);*/
PxgShape clothShape, rigidShape;
PxU32 clothCacheRef, rigidCacheRef;
LoadShapePairWarp<PxGeometryType::eTRIANGLEMESH>(cmInputs, cmIdx, gpuShapes,
clothShape, clothCacheRef, rigidShape, rigidCacheRef);
const PxU32 femClothId = clothShape.particleOrSoftbodyId;
const PxgFEMCloth& femCloth = femClothes[femClothId];
PxU8 * trimeshGeomPtr = reinterpret_cast<PxU8 *>(femCloth.mTriMeshData);
trimeshGeomPtr += sizeof(uint4); // skip nbVerts_nbTets_maxDepth_nbBv32TreeNodes
// lane 0 caches the mesh pointers and the BV32 root in per-warp scratch
if (threadIdx.x == 0)
{
s_warpScratch->meshVerts = femCloth.mPosition_InvMass;
s_warpScratch->meshVertsIndices = femCloth.mTriangleVertexIndices;
Gu::BV32DataPacked* bv32PackedNodes = reinterpret_cast<Gu::BV32DataPacked*>(trimeshGeomPtr);
s_warpScratch->bv32PackedNodes = bv32PackedNodes;
}
__syncwarp();
//ML: this is the world bound AABB of the rigid shape, fattened by the contact distance
PxBounds3 aabb;
PxBounds3_ReadWarp(aabb, bounds + rigidCacheRef);
const PxReal contactDist = contactDistance[clothCacheRef] + contactDistance[rigidCacheRef];
aabb.fattenFast(contactDist);
/*if (threadIdx.x == 0)
{
printf("contactDist %f transformCacheRef0 %i transformCacheRef1 %i\n", contactDist, transformCacheRef0, transformCacheRef1);
printf("min(%f, %f, %f) max(%f, %f, %f)\n", aabb.minimum.x, aabb.minimum.y, aabb.minimum.z,
aabb.maximum.x, aabb.maximum.y, aabb.maximum.z);
}*/
uint4* tStackPtr = reinterpret_cast<uint4*>(stackPtr);
// traverse the BV32 tree; the traverser appends overlapping triangles to the stack
bv32TreeTraversal<const ClothTreeTraverser, WarpsPerBlock>(s_warpScratch->bv32PackedNodes, s_warpScratch->sBv32Nodes, ClothTreeTraverser(
s_warpScratch, aabb, tStackPtr, midphasePairsNum, cmIdx, stackSize));
/*if (threadIdx.x == 0 && nbPairsPerCM > 0)
printf("nbPairsPerCM %i\n", nbPairsPerCM);*/
}
}
// Kernel: cloth-triangle vs. rigid midphase. Thin wrapper that carves out per-warp
// shared-memory scratch and forwards to femClothMidphaseCore (one warp per contact pair).
extern "C" __global__
//__launch_bounds__(MIDPHASE_WARPS_PER_BLOCK * WARP_SIZE, 4)
void cloth_midphaseGeneratePairsLaunch(
PxU32 numWorkItems,
const PxReal tolerenceLength,
const PxgContactManagerInput* PX_RESTRICT cmInputs,
const PxsCachedTransform* PX_RESTRICT transformCache,
const PxBounds3* PX_RESTRICT bounds,
const PxReal* PX_RESTRICT contactDistance,
const PxgShape* PX_RESTRICT gpuShapes,
const PxgFEMCloth* PX_RESTRICT femClothes,
PxU8* PX_RESTRICT stackPtr, //output
PxU32* PX_RESTRICT midphasePairsNum, //output
const PxU32 stackSizeBytes
)
{
// one scratch slice per warp (indexed by threadIdx.y), unlike the block-shared
// scratch used by the heightfield kernels
__shared__ PxU32 scratchMem[MIDPHASE_WARPS_PER_BLOCK][WARP_SIZE * 7];
femClothMidphaseCore<MIDPHASE_WARPS_PER_BLOCK>(
numWorkItems,
tolerenceLength,
cmInputs,
transformCache,
bounds,
contactDistance,
gpuShapes,
femClothes,
stackPtr,
midphasePairsNum,
(femMidphaseScratch*)scratchMem[threadIdx.y],
stackSizeBytes/sizeof(uint4)
);
}
// Kernel: cloth-vertex vs. rigid midphase. blockIdx.y selects the contact pair; the grid's
// x-dimension threads grid-stride over the cloth vertices, testing each vertex (inflated by
// the contact distance) against the rigid shape's world AABB and appending (pair, vertex)
// hits to the output stack.
extern "C" __global__
//__launch_bounds__(MIDPHASE_WARPS_PER_BLOCK * WARP_SIZE, 4)
void cloth_midphaseGenerateVertexPairsLaunch(
	PxU32 numWorkItems,
	const PxReal tolerenceLength,
	const PxgContactManagerInput* PX_RESTRICT cmInputs,
	const PxsCachedTransform* PX_RESTRICT transformCache,
	const PxBounds3* PX_RESTRICT bounds,
	const PxReal* PX_RESTRICT contactDistance,
	const PxgShape* PX_RESTRICT gpuShapes,
	const PxgFEMCloth* PX_RESTRICT femClothes,
	uint4* PX_RESTRICT stackPtr, //output
	PxU32* PX_RESTRICT midphasePairsNum //output
)
{
	const unsigned int cmIdx = blockIdx.y;
	//unsigned mask_cmIdx = __ballot_sync(FULL_MASK, cmIdx < numNPWorkItems);
	if (cmIdx < numWorkItems)
	{
		PxgShape clothShape, rigidShape;
		PxU32 clothCacheRef, rigidCacheRef;
		LoadShapePairWarp<PxGeometryType::eTRIANGLEMESH>(cmInputs, cmIdx, gpuShapes,
			clothShape, clothCacheRef, rigidShape, rigidCacheRef);

		const PxU32 femClothId = clothShape.particleOrSoftbodyId;
		const PxgFEMCloth& femCloth = femClothes[femClothId];
		// FIX: removed the dead 'trimeshGeomPtr' local (computed and advanced but never
		// read) -- this kernel only needs the vertex positions, not the BV32 tree.
		const float4* const PX_RESTRICT meshVerts = femCloth.mPosition_InvMass;
		const PxU32 nbVerts = femCloth.mNbVerts;

		//ML: this is the world bound AABB of the rigid shape
		PxBounds3 aabb;
		PxBounds3_ReadWarp(aabb, bounds + rigidCacheRef);

		const PxReal contactDist = contactDistance[clothCacheRef] + contactDistance[rigidCacheRef];
		//aabb.fattenFast(contactDist);
		// squared distance threshold: vertex hits if it is within contactDist of the AABB
		const PxReal tolerance = contactDist * contactDist;

		const PxU32 idx = threadIdx.x + threadIdx.y*WARP_SIZE + blockIdx.x*blockDim.x*blockDim.y;
		for (PxU32 i = 0; i < nbVerts; i += blockDim.x*blockDim.y*gridDim.x)
		{
			PxU32 id = idx + i;
			bool hasHit = false;
			if (id < nbVerts)
			{
				float4 vertex = meshVerts[id];
				PxVec3 v(vertex.x, vertex.y, vertex.z);
				PxVec3 closest = aabb.closestPoint(v);
				hasHit = (closest - v).magnitudeSquared() <= tolerance;
			}
			// warp-compact the hits and append them to the global stack in one atomic
			PxU32 hitMask = __ballot_sync(FULL_MASK, hasHit);
			if (hitMask)
			{
				PxU32 prevStackShift = 0xFFffFFff;
				PxU32 validCount = __popc(hitMask);
				if (threadIdx.x == 0 && validCount > 0)
				{
					prevStackShift = atomicAdd(midphasePairsNum, validCount);
				}
				prevStackShift = __shfl_sync(FULL_MASK, prevStackShift, 0);
				if (hasHit)
				{
					// this lane's slot = number of hit lanes below it
					PxU32 offset = warpScanExclusive(hitMask, threadIdx.x);
					stackPtr[prevStackShift + offset] = make_uint4(cmIdx, id, 0, 0);
				}
			}
		}
	}
}
// Warp-cooperative visitor for BV32 tree traversal: tests one cloth/particle
// vertex — conservatively represented by the AABB of a sphere (sphereCenter,
// radius) — against the triangles of a BV32-packed mesh, and pushes
// overlapping (cmIdx, globalParticleIndex, triangleIndex, 0) tuples onto the
// midphase pair stack.
struct ClothParticleTreeTraverser
{
	femMidphaseScratch* PX_RESTRICT s_warpScratch;	// per-warp mesh pointers / traversal nodes
	const PxVec3 sphereCenter;
	const PxReal radius;
	const PxBounds3 mBox;		// AABB enclosing the sphere; used for all intersection tests
	const PxU32 globalParticleIndex;
	const PxU32 numParticles;
	const PxU32 cmIdx;			// contact-manager index recorded in each emitted pair
	const PxU32 stackSize;		// capacity of stackPtr, in uint4 entries
	uint4* PX_RESTRICT stackPtr;	//output
	PxU32* midphasePairs;		//output: global pair counter
	const bool isMesh;			// true when triangles belong to a static trimesh (local space)

	PX_FORCE_INLINE __device__ ClothParticleTreeTraverser(
		femMidphaseScratch* PX_RESTRICT s_warpScratch,
		const PxVec3 sphereCenter,
		const PxReal radius,
		const PxU32 globalParticleIndex,
		const PxU32 numParticles,
		const PxU32 cmIdx,
		const PxU32 stackSize,
		uint4* PX_RESTRICT stackPtr, //output
		PxU32* midphasePairs, //output
		const bool isMesh)
		// The initializer list follows declaration order (members are always
		// initialized in declaration order regardless; listing them that way
		// avoids -Wreorder and makes the real order explicit). mBox is built
		// from the constructor parameters, so its position is safe either way.
		: s_warpScratch(s_warpScratch), sphereCenter(sphereCenter), radius(radius),
		mBox(sphereCenter - PxVec3(radius), sphereCenter + PxVec3(radius)),
		globalParticleIndex(globalParticleIndex), numParticles(numParticles),
		cmIdx(cmIdx), stackSize(stackSize), stackPtr(stackPtr), midphasePairs(midphasePairs), isMesh(isMesh)
	{ }

	// Leaf callback, executed by every warp lane with its candidate triangle
	// (0xFFFFFFFF when the lane has none); the warp then compacts the overlap
	// results onto the stack in one cooperative push.
	PX_FORCE_INLINE __device__ void intersectPrimitiveFullWarp(PxU32 primitiveIndex, PxU32 idxInWarp) const
	{
		bool intersect = false;
		if (primitiveIndex != 0xFFFFFFFF)
		{
			const uint4 triIdx = s_warpScratch->meshVertsIndices[primitiveIndex];
			const PxVec3 worldV0 = PxLoad3(s_warpScratch->meshVerts[triIdx.x]);
			const PxVec3 worldV1 = PxLoad3(s_warpScratch->meshVerts[triIdx.y]);
			const PxVec3 worldV2 = PxLoad3(s_warpScratch->meshVerts[triIdx.z]);
			// A plane-side test that culled back-facing (double-sided) mesh
			// triangles used to sit on the isMesh path but is disabled, so the
			// mesh and cloth cases now run the identical test: triangle AABB
			// vs. the sphere's AABB.
			intersect = triBoundingBox(worldV0, worldV1, worldV2).intersects(mBox);
		}
		pushOntoStackFullWarp(intersect, midphasePairs, stackPtr, stackSize, make_uint4(cmIdx, globalParticleIndex, primitiveIndex, 0));
	}

	// Node test used during traversal: descend only into nodes whose AABB
	// overlaps the sphere's AABB.
	PX_FORCE_INLINE __device__ bool intersectBoxFullWarp(bool hasBox, const PxVec3& min, const PxVec3& max) const
	{
		if (hasBox)
			return mBox.intersects(PxBounds3(min, max));
		else
			return false;
	}
};
// Midphase kernel for particle-system-vs-cloth pairs: one pair per blockIdx.y.
// The particle system struct is staged into shared memory, the cloth's mesh
// pointers into per-warp scratch; each warp then traverses the cloth's BV32
// tree once per particle it owns, emitting candidate
// (cmIdx, particleIndex, triangleIndex, 0) pairs onto the stack.
extern "C" __global__
//__launch_bounds__(MIDPHASE_WARPS_PER_BLOCK * WARP_SIZE, 4)
void cloth_psMidphaseGeneratePairsLaunch(
	PxU32 numWorkItems,
	const PxReal tolerenceLength,
	const PxgContactManagerInput* PX_RESTRICT cmInputs,
	const PxgShape* PX_RESTRICT gpuShapes,
	const PxsCachedTransform* PX_RESTRICT transformCache,
	const PxBounds3* PX_RESTRICT bounds,
	const PxReal* PX_RESTRICT contactDistance,
	const PxgFEMCloth* PX_RESTRICT clothes,
	PxgParticleSystem* PX_RESTRICT particleSystems,
	const PxU32 stackSizeBytes,
	PxU8* PX_RESTRICT stackPtr, //output
	PxU32* PX_RESTRICT midphasePairsNum //output
	)
{
	__shared__ __align__(16) PxU8 particleSystemMemory[sizeof(PxgParticleSystem)];
	PxgParticleSystem& sParticleSystem = *(reinterpret_cast<PxgParticleSystem*>(particleSystemMemory));
	__shared__ femMidphaseScratch scratchMem[MIDPHASE_WARPS_PER_BLOCK];
	femMidphaseScratch* s_warpScratch = &scratchMem[threadIdx.y];
	if (blockIdx.y < numWorkItems)
	{
		const PxU32 globalWarpIdx = threadIdx.y + blockIdx.x*blockDim.y;
		PxgShape particleShape, clothShape;
		PxU32 particleCacheRef, clothCacheRef;
		LoadShapePairWarp<PxGeometryType::ePARTICLESYSTEM, PxGeometryType::eTRIANGLEMESH>(cmInputs, blockIdx.y, gpuShapes,
			particleShape, particleCacheRef, clothShape, clothCacheRef);
		const PxU32 particleSystemId = particleShape.particleOrSoftbodyId;
		// Stage the whole particle-system struct into shared memory; each
		// thread copies 16 bytes.
		uint4* srcParticleSystem = reinterpret_cast<uint4*>(&particleSystems[particleSystemId]);
		uint4* dstParticleSystem = reinterpret_cast<uint4*>(&sParticleSystem);
		warpCopy<uint4>(dstParticleSystem, srcParticleSystem, sizeof(PxgParticleSystem));
		const PxU32 clothId = clothShape.particleOrSoftbodyId;	// fixed local typo: was "clohtId"
		const PxgFEMCloth& cloth = clothes[clothId];
		// Renamed from "tetmeshGeomPtr": this is the cloth's *triangle* mesh data.
		PxU8* trimeshGeomPtr = reinterpret_cast<PxU8*>(cloth.mTriMeshData);
		// Skip the uint4 mesh header (vert/tri counts etc.) at the front.
		trimeshGeomPtr += sizeof(uint4);
		// Lane 0 of each warp publishes the cloth mesh pointers to the warp's scratch.
		if (threadIdx.x == 0)
		{
			s_warpScratch->meshVerts = cloth.mPosition_InvMass;
			s_warpScratch->meshVertsIndices = cloth.mTriangleVertexIndices;
			s_warpScratch->bv32PackedNodes = reinterpret_cast<Gu::BV32DataPacked*>(trimeshGeomPtr);
		}
		__syncthreads();
		PxReal cDistance = contactDistance[particleCacheRef] + contactDistance[clothCacheRef];
		const PxU32 nbParticles = sParticleSystem.mCommonData.mNumParticles;
		const PxU32 NbWarps = blockDim.y*gridDim.x;
		float4* sortedPose = reinterpret_cast<float4*>(sParticleSystem.mSortedPositions_InvMass);
		uint4* tStackPtr = reinterpret_cast<uint4*>(stackPtr);
		const PxU32 stackSize = stackSizeBytes / sizeof(uint4);
		// One warp-cooperative BV32 traversal per particle, distributed over
		// all warps of the launch.
		for (PxU32 i = globalWarpIdx; i < nbParticles; i += NbWarps)
		{
			const PxU32 particleIndex = i;
			const float4 pos = sortedPose[particleIndex];
			const PxVec3 particlePos(pos.x, pos.y, pos.z);
			bv32TreeTraversal<const ClothParticleTreeTraverser, MIDPHASE_WARPS_PER_BLOCK>(s_warpScratch->bv32PackedNodes, s_warpScratch->sBv32Nodes, ClothParticleTreeTraverser(s_warpScratch, particlePos, cDistance, particleIndex,
				nbParticles, blockIdx.y, stackSize, tStackPtr, midphasePairsNum, false));
		}
	}
}
// Midphase kernel for cloth-vs-static-triangle-mesh pairs: one pair per
// blockIdx.y. Each warp takes a slice of the cloth's vertices, transforms
// them into the mesh's scaled local (vertex) space, and traverses the mesh's
// BV32 tree, pushing candidate (cmIdx, vertexIndex, triangleIndex, 0) pairs
// onto the stack.
extern "C" __global__
//__launch_bounds__(MIDPHASE_WARPS_PER_BLOCK * WARP_SIZE, 4)
void cloth_midphaseVertexMeshLaunch(
	PxU32 numWorkItems,
	const PxReal tolerenceLength,
	const PxgContactManagerInput* PX_RESTRICT cmInputs,
	const PxsCachedTransform* PX_RESTRICT transformCache,
	const PxBounds3* PX_RESTRICT bounds,
	const PxReal* PX_RESTRICT contactDistance,
	const PxgShape* PX_RESTRICT gpuShapes,
	const PxgFEMCloth* PX_RESTRICT femClothes,
	const PxU32 stackSizeBytes,
	PxU8* PX_RESTRICT stackPtr, //output
	PxU32* PX_RESTRICT midphasePairsNum //output
	)
{
	__shared__ femMidphaseScratch scratchMem[MIDPHASE_WARPS_PER_BLOCK];
	femMidphaseScratch* s_warpScratch = &scratchMem[threadIdx.y];
	const PxU32 cmIdx = blockIdx.y;
	// NOTE(review): the work-item bounds check is disabled — presumably the
	// grid's y-dimension is launched to exactly numWorkItems; verify.
	//if (cmIdx < numWorkItems)
	{
		//const PxU32 globalThreadIdx = threadIdx.y * WARP_SIZE + threadIdx.x + blockIdx.x*blockDim.x*blockDim.y;
		const PxU32 globalWarpIdx = threadIdx.y + blockIdx.x*blockDim.y;
		PxgShape shape0, shape1;
		PxgShape* trimeshShape = NULL;
		PxgShape* clothShape = NULL;
		PxU32 trimeshCacheRef, clothCacheRef;
		{
			// Both shapes are triangle meshes; the cloth is the one with a
			// valid particleOrSoftbodyId, the static mesh has 0xffffffff.
			PxgContactManagerInput npWorkItem;
			PxgContactManagerInput_ReadWarp(npWorkItem, cmInputs, cmIdx);
			PxgShape_ReadWarp(shape0, gpuShapes + npWorkItem.shapeRef0);
			PxgShape_ReadWarp(shape1, gpuShapes + npWorkItem.shapeRef1);
			assert(shape0.type == PxGeometryType::eTRIANGLEMESH && shape1.type == PxGeometryType::eTRIANGLEMESH);
			if (shape0.particleOrSoftbodyId != 0xffffffff)
			{
				trimeshShape = &shape1;
				trimeshCacheRef = npWorkItem.transformCacheRef1;
				clothShape = &shape0;
				clothCacheRef = npWorkItem.transformCacheRef0;
			}
			else
			{
				trimeshShape = &shape0;
				trimeshCacheRef = npWorkItem.transformCacheRef0;
				clothShape = &shape1;
				clothCacheRef = npWorkItem.transformCacheRef1;
			}
			assert(trimeshShape->particleOrSoftbodyId == 0xffffffff);
		}
		PxU32 clothId = clothShape->particleOrSoftbodyId;
		const PxgFEMCloth* cloth = &femClothes[clothId];
		PxsCachedTransform trimeshTransformCache;
		PxsCachedTransform_ReadWarp(trimeshTransformCache, transformCache + trimeshCacheRef);
		// Lane 0 of each warp publishes the static mesh's BV32/vertex/index
		// pointers to the warp's scratch; other lanes wait at the barrier.
		if (threadIdx.x == 0)
		{
			const PxU8* triGeomPtr = reinterpret_cast<const PxU8 *>(trimeshShape->hullOrMeshPtr);
			readTriangleMesh(triGeomPtr, s_warpScratch->bv32PackedNodes, s_warpScratch->meshVerts, s_warpScratch->meshVertsIndices);
		}
		__syncthreads();
		PxTransform& trimeshToWorld = trimeshTransformCache.transform;
		const PxMeshScale& trimeshScale = trimeshShape->scale;
		PxReal cDistance = contactDistance[clothCacheRef] + contactDistance[trimeshCacheRef];
		//const PxU32 threadBaseAddress = globalThreadIdx & (~31);
		const PxU32 numVerts = cloth->mNbVerts;
		const float4* positions = cloth->mPosition_InvMass;
		//const PxU32 NbThreads = blockDim.x*blockDim.y*gridDim.x;
		const PxU32 NbWarps = blockDim.y*gridDim.x;
		uint4* tStackPtr = reinterpret_cast<uint4*>(stackPtr);
		const PxU32 stackSize = stackSizeBytes / sizeof(uint4);
		// One warp-cooperative BV32 traversal per cloth vertex.
		for (PxU32 particleIndex = globalWarpIdx; particleIndex < numVerts; particleIndex += NbWarps)
		{
			const float4 pos = positions[particleIndex];
			//particle pos need to be in triangle mesh vertex space
			const PxVec3 particlePos(pos.x, pos.y, pos.z);
			/// transform those point to triangle vertex space
			const PxVec3 localParticlePos = shape2Vertex(trimeshToWorld.transformInv(particlePos), trimeshScale.scale, trimeshScale.rotation);
			bv32TreeTraversal<const ClothParticleTreeTraverser, MIDPHASE_WARPS_PER_BLOCK>(s_warpScratch->bv32PackedNodes, s_warpScratch->sBv32Nodes, ClothParticleTreeTraverser(s_warpScratch, localParticlePos, cDistance, particleIndex,
				numVerts, cmIdx, stackSize, tStackPtr, midphasePairsNum, true));
		}
	}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,422 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __GJK_CUH__
#define __GJK_CUH__
namespace squawk
{
// The simplex is stored as 4 vectors and then 4 normals (so we can shoot it straight into EPA)
// The 3 component of each vector contains 3f80 in the high bits (so it's a valid float) and
// the vertex indices for a and b in the lower bits
typedef PxReal Simplex;
// Overwrites normal 0 of the simplex (stored in lanes 16..19) with the vector
// selected by `index` from nV; all other lanes keep their current value.
static __device__ inline Simplex setNormal0(Simplex z, PxReal nV, PxU32 index)
{
	// All lanes execute the splat (it shuffles warp-wide) before selecting.
	const PxReal splatted = splatV(index, nV);
	if (threadIdx.x >= 16 && threadIdx.x <= 19)
		return splatted;
	return z;
}
// Overwrites normals 1..3 of the simplex (lanes 20..31) by shifting nV up by
// 20 lanes; lanes below 20 keep their current value.
static __device__ inline Simplex setNormal123(Simplex z, PxReal nV)
{
	// The shuffle is executed by every lane before the per-lane selection.
	const PxReal shifted = __shfl_up_sync(FULL_MASK, nV, 20);
	if (threadIdx.x >= 20)
		return shifted;
	return z;
}
// Loads the cached simplex as one scalar word per warp lane (lane i reads
// word i of the GjkCachedData blob).
static __device__ inline Simplex loadSimplex(const schlock::GjkCachedData& data)
{
	const PxReal* words = reinterpret_cast<const PxReal*>(&data);
	return words[threadIdx.x];
}
// Stores the per-lane simplex word back into the cache (lane i writes word i).
static __device__ inline void storeSimplex(schlock::GjkCachedData& data, Simplex z)
{
	PxReal* words = reinterpret_cast<PxReal*>(&data);
	words[threadIdx.x] = z;
}
// Volatile overload: same per-lane scatter, for cache data that other
// threads may be observing concurrently.
static __device__ inline void storeSimplex(volatile schlock::GjkCachedData& data, Simplex z)
{
	volatile PxReal* words = reinterpret_cast<volatile PxReal*>(&data);
	words[threadIdx.x] = z;
}
// suffixes for vector warps:
// * V = 3 vectors: (aX,aY,aZ,_,bX,bY,bZ,_,cX,cY,cZ,_)
// * D = dense scalar: (a,a,a,a,b,b,b,b,c,c,c,c)
// * S = sparse scalar: (a,_,_,_,b,_,_,_,c,_,_,_)
// * I = index vector for shuffle
#define DEGENERATE 0xffff
// Warp-wide closest-point query on a triangle: the simplex zV holds the two
// base vertices and cV is the new apex. Classifies the origin into a Voronoi
// region (face / edge / apex), writes the closest point, and returns a
// feature code (vertex permutation in nibbles 1..3, new simplex size in
// nibble 0) or DEGENERATE on failure. Updates normal 0 in zV for the face
// case so EPA can start from it.
static __device__ inline PxU32 closestPointTri(Simplex &zV, PxReal cV, PxReal& closestPoint)
{
	PxU32 tI = threadIdx.x, cp0I = (tI&28)|next3(tI&3), cp1I = (tI&28)|prev3(tI&3); // cross product indices
	closestPoint = 0;
	PxReal eV = cV - zV, lD = splatX(dot(eV, eV)); // edges to apex & squared lengths
	if(anyX<2>(lD==0))
		return DEGENERATE; // FAIL: apex coincident with a base vert
	// Use the shorter of the two edges to the apex for a better-conditioned normal.
	PxReal tmp = __shfl_xor_sync(FULL_MASK, eV, 4), shortEdgeV = __shfl_xor_sync(FULL_MASK, lD, 4) < lD ? tmp : eV; // shorter edge to apex
	PxReal mV = cross(__shfl_xor_sync(FULL_MASK, zV,4)-zV, shortEdgeV, cp0I, cp1I); // (normal, -normal)
	// lots of dot products here with cV, (plus proj below). TODO: optimize
	PxReal pS = dot(eV,cV); // projection of origin onto edges
	bool uS = dot(cross(mV, eV, cp0I, cp1I), cV) <= 0; // origin outside edge planes
	if(!anyX<2>(uS)) // REGION: face - inside both edge planes
	{
		PxReal projD = __shfl_sync(FULL_MASK, dot(mV,cV),0); // project onto face
		closestPoint = splatV(0, mV*projD/splatX(dot(mV,mV)));
		bool flip = projD<0; // ensure the normal winding makes it outward-facing in the tet
		mV = (tI & 3) == 3 ? projD : mV;
		zV = setNormal0(zV, mV, flip);
		return flip ? 0x0123u : 0x1023u;
	}
	PxU32 eid = __ffs(packXFlags<2>(uS & bool(pS>0))); // REGION: edges - outside the edge plane, and projects onto the edge
	if(eid--) // __ffs returns 0 for no bit, 1 for lsb
	{
		closestPoint = splatV(eid, cV - eV*splatX(pS)/lD);
		return eid<<8 | 0x22;
	}
	closestPoint = cV; // REGION: apex - not an edge, not a face
	return 0x0021;
}
// Warp-wide closest-point query on a tetrahedron: zV holds the three base
// vertices (plus normal 0 from the previous iteration) and dV is the new
// apex. Classifies the origin against the tet's Voronoi regions (inside /
// apex / edge / face), writes the closest point, and returns a feature code
// (retained-vertex permutation in the upper nibbles, new size in nibble 0)
// or DEGENERATE on any failure. Stores the face normals in zV when the
// origin is enclosed, so EPA can take over directly.
static __device__ inline PxU32 closestPointTet(Simplex &zV, PxReal dV, PxReal& closestPoint)
{
	closestPoint = 0;
	PxReal n0V = splatV(4, zV);
	if(__shfl_sync(FULL_MASK, dot(n0V, dV) > dot(n0V, zV), 0)) // FAIL: new vertex is beyond old face (TODO: set up normal in UT)
		return DEGENERATE;
	PxU32 tI = threadIdx.x, cp0I = (tI&28)|next3(tI&3), cp1I = (tI&28)|prev3(tI&3); // cross product indices
	PxU32 vNextI = tI>7 ? tI-8 : tI+4, vPrevI = tI<4 ? tI + 8 : tI-4; // block rotate indices
	PxReal eV = dV - zV, lD = splatX(dot(eV, eV)); // edges to apex and squared lengths
	if(anyX<3>(lD==0))
		return DEGENERATE; // FAIL: apex is coincident with a base vert
	PxReal tmp = __shfl_sync(FULL_MASK, eV, vNextI), shortEdgeV = __shfl_sync(FULL_MASK, lD, vNextI) < lD ? tmp : eV; // shorter of edges to apex for each tri
	PxReal nV = cross(shortEdgeV, __shfl_sync(FULL_MASK, zV, vNextI) - zV, cp0I, cp1I); // face normals, outward facing
	PxReal aS = dot(nV, dV); // distances along normals of apex
	PxReal gS = dot(__shfl_sync(FULL_MASK, zV, vPrevI), nV); // distances along normal of vert not in this triangle
	if(anyX<3>(aS<=gS))
		return DEGENERATE; // FAIL: simplex inversion/collapse
	// Pack each face's apex distance into the w-lane of its normal vector.
	PxReal tmpV = __shfl_up_sync(FULL_MASK, aS, 3);
	nV = (tI & 3) == 3 ? tmpV : nV;
	if(allX<3>(aS>0)) // REGION: inside simplex
	{
		zV = setNormal123(zV, nV); // set normals for EPA start
		return 0x2104;
	}
	const PxReal pS = dot(eV,dV); // projection of origin onto edges
	if(allX<3>(pS<=0)) // REGION: apex
	{
		closestPoint = dV;
		return 0x2131;
	}
	bool uiS = dot(cross(eV, nV, cp0I, cp1I), dV) <= 0; // origin outside one edge plane of each triangle
	bool ujS = dot(cross(__shfl_sync(FULL_MASK, eV,vNextI), nV, cp0I, cp1I), dV) > 0; // origin outside other edge plane of each triangle
	int eid = __ffs(packXFlags<3>(uiS & __shfl_sync(FULL_MASK, ujS, vPrevI) & bool(pS>=0))); // REGION: edges - origin outside edge planes and projects onto edge
	if(eid--) // __ffs returns 0 for no bit, 1 for lsb
	{
		closestPoint = splatV(eid, dV - eV * splatX(pS/lD));
		return eid<<8| 0x32;
	}
	PxU32 fid = __ffs(packXFlags<3>(!uiS & !ujS & bool(aS<=0))); // REGION: faces - origin outside face plane, inside edge planes
	if(fid--) // __ffs returns 0 for no bit, 1 for lsb
	{
		closestPoint = splatV(fid, nV * splatX(aS / dot(nV, nV)));
		zV = setNormal0(zV, -nV, fid); // flip normal to be outward facing for next iteration
		return next3(fid)<<12 | fid<<8| 0x33;
	}
	return DEGENERATE; // FAIL: couldn't find voronoi region. Chort vozmi!
}
// Support function over an in-memory vertex array: returns the index of the
// vertex of v[0..nbVertices) with the greatest projection onto `dir` (a
// vector register with x,y,z in lanes 0..2). Processes 32 vertices per pass
// with a warp max-reduction; all lanes return the same index.
static __device__ inline PxU8 supportIndex(const PxVec3* v, PxU8 nbVertices, PxReal dir)
{
	int bestN = 0xFFFFFFFF; // initialize to an illegal value to force a crash if we get NaNs
	PxReal bestDist = -FLT_MAX;
	PxReal x = __shfl_sync(FULL_MASK, dir, 0), y = __shfl_sync(FULL_MASK, dir, 1), z = __shfl_sync(FULL_MASK, dir, 2);
	for(PxU32 i=0;i<nbVertices;i+=32)
	{
		int n = i+threadIdx.x;
		// Out-of-range lanes contribute -FLT_MAX so they never win the max.
		PxReal d = n < nbVertices ? x*v[n].x + y*v[n].y + z*v[n].z : -FLT_MAX, e;
		// Butterfly max-reduction: e ends up as the warp-wide maximum.
		e = fmaxf(d, __shfl_xor_sync(FULL_MASK, d, 16));
		e = fmaxf(e, __shfl_xor_sync(FULL_MASK, e, 8));
		e = fmaxf(e, __shfl_xor_sync(FULL_MASK, e, 4));
		e = fmaxf(e, __shfl_xor_sync(FULL_MASK, e, 2));
		e = fmaxf(e, __shfl_xor_sync(FULL_MASK, e, 1));
		unsigned mask_bestDist = __ballot_sync(FULL_MASK, e > bestDist);
		// First lane whose d equals the maximum provides the winning index.
		if(e>bestDist)
			bestDist = e, bestN = i+__ffs(__ballot_sync(mask_bestDist, e == d))-1;
	}
	assert(bestN < nbVertices);
	return PxU8(bestN);
}
// Support function over up to 64 vertices pre-loaded into registers: lane i
// holds vertex i in v0 and (when i+32 < nbVertices) vertex i+32 in v1.
// Returns the index of the vertex with the greatest projection onto `dir`
// (a vector register with x,y,z in lanes 0..2); all lanes return the same
// result.
static __device__ inline PxU8 supportIndex(const PxVec3& v0, const PxVec3& v1, PxU8 nbVertices, PxReal dir)
{
	int bestN = 0xFFFFFFFF;
	PxReal x = __shfl_sync(FULL_MASK, dir, 0), y = __shfl_sync(FULL_MASK, dir, 1), z = __shfl_sync(FULL_MASK, dir, 2);
	PxReal d = -FLT_MAX;
	PxU32 n = threadIdx.x;
	if (threadIdx.x < nbVertices)
	{
		d = x*v0.x + y*v0.y + z*v0.z;
		if ((threadIdx.x + 32) < nbVertices)
		{
			// BUGFIX: the second register holds vertex threadIdx.x+32, so its
			// projection must use v1. Previously this recomputed v0's dot
			// product, making d1 == d, so the upper 32 vertices (and v1
			// itself) were never actually considered.
			PxReal d1 = x*v1.x + y*v1.y + z*v1.z;
			if (d1 > d)
			{
				d = d1;
				n += 32;
			}
		}
	}
	// Butterfly max-reduction: e becomes the warp-wide maximum projection.
	PxReal e = fmaxf(d, __shfl_xor_sync(FULL_MASK, d, 16));
	e = fmaxf(e, __shfl_xor_sync(FULL_MASK, e, 8));
	e = fmaxf(e, __shfl_xor_sync(FULL_MASK, e, 4));
	e = fmaxf(e, __shfl_xor_sync(FULL_MASK, e, 2));
	e = fmaxf(e, __shfl_xor_sync(FULL_MASK, e, 1));
	// First lane whose d equals the maximum is the winner...
	bestN = __ffs(__ballot_sync(FULL_MASK, e == d)) - 1;
	// ...and if that lane selected its upper vertex, shift the index into 32..63.
	bestN += __popc(__ballot_sync(FULL_MASK, n >= 32) & (1 << bestN)) * 32;
	assert(bestN < nbVertices);
	return PxU8(bestN);
}
// Warp-cooperative GJK distance query between convex hulls A and B (Minkowski
// difference A-B). Supports warm-starting from cacheData's previous simplex;
// iterates support-point insertion + closest-point simplex reduction until
// the lower/upper distance bounds converge (ratio convergenceRatio), overlap
// is proven (upperBound == 0), or separation exceeds minSep. Writes the
// witness points, separating direction, and bounds into `output`, and the
// final simplex back into `cacheData` for the next frame.
static __device__ inline schlock::GjkResult::Enum gjk(
	const PxVec3* vA, PxU32 nbA,
	const PxVec3* vB, PxU32 nbB,
	const PxVec3& initialUnitDir,
	PxReal minSep,
	PxReal convergenceRatio,
	schlock::GjkOutput& output, schlock::GjkCachedData& cacheData)
{
	int cacheMask = (1<< cacheData.size)-1;
	__syncwarp(); //cacheData.size is written further below
	Simplex simplexV = loadSimplex(cacheData);
restart:
	// Restart point for when warm-starting produces a degenerate simplex:
	// re-run cold with cacheMask cleared.
	bool warmStarted = false;
	Simplex backupSimplexV;
	int size = 0, backupSize = 0;
	PxReal closestPointV, backupClosestV;
	if (cacheMask)
		closestPointV = splatV(0, simplexV);
	else
	{
		// Cold start: seed the search direction from initialUnitDir
		// (assembled per-lane into the vector register layout).
		if ((threadIdx.x & 3) == 0)
			closestPointV = initialUnitDir.x;
		else if ((threadIdx.x & 3) == 1)
			closestPointV = initialUnitDir.y;
		else if ((threadIdx.x & 3) == 2)
			closestPointV = initialUnitDir.z;
		else
			closestPointV = PX_MAX_F32;
	}
	PxReal upperBound = PX_MAX_F32, lowerBound = -PX_MAX_F32, cm = magnitude(closestPointV); // if we hit a lower bound dir with distance exactly zero, -FLT_MIN means we keep it
	bool degenerate;
	do
	{
		PxReal dV = closestPointV/cm, supportVertexV;
#if GJK_LOGGING
		PxReal dx = dV, dy = shfl(dV, 1), dz = shfl(dV,2);
		PxReal cpx = closestPointV, cpy = shfl(closestPointV, 1), cpz = shfl(closestPointV,2);
		if(threadIdx.x == 0)
			printf("[%f, %f], dir (%f, %f, %f), closestPoint (%f, %f, %f), cm %f \n", lowerBound, upperBound, dx, dy, dz, cpx, cpy, cpz, cm);
#endif
		// find a warm start entry that can be added to the simplex because it's closer than the closest feature
		int candidates = 0;
		if (cacheMask)
			candidates = __ballot_sync(FULL_MASK, __shfl_sync(FULL_MASK, dot(dV, simplexV)<upperBound, threadIdx.x<<2)) & cacheMask;
		if(candidates)
		{
			warmStarted = true;
			cacheMask &= ~(lowestSetBit(candidates)*2-1);
			supportVertexV = splatV(lowestSetIndex(candidates), simplexV);
			if(threadIdx.x>>2 == size)
				simplexV = supportVertexV;
		}
		else // if not, stop warm-starting and start using support vertices
		{
			cacheMask = 0;
			// New support point on the Minkowski difference; encode the
			// source vertex indices into the w-lane of the simplex slot.
			PxU8 indexA = supportIndex(vA, nbA, -dV), indexB = supportIndex(vB, nbB, dV);
			supportVertexV = loadV3Unsafe(vA+indexA) - loadV3Unsafe(vB+indexB);
			if(threadIdx.x>>2 == size)
				simplexV = (threadIdx.x&3) < 3 ? supportVertexV : __int_as_float(schlock::encodeIndices(indexA, indexB));
			PxReal lb = splatX(dot(dV,supportVertexV));
			if(lb>lowerBound)
			{
				storeV3(&output.direction, 0, -dV);
				lowerBound = lb;
				if(lowerBound > minSep || lowerBound>=upperBound*convergenceRatio)
					break;
			}
		}
#if GJK_LOGGING
		PxReal svx = supportVertexV, svy = shfl(supportVertexV,1), svz = shfl(supportVertexV,2);
		if(threadIdx.x == 0)
			printf("support (%f, %f, %f)\n", svx, svy, svz);
#endif
		// Reduce the simplex to the feature closest to the origin.
		PxU32 f = 0; // f is feature index: (i2<<12 | i1<<8 | i0<<4 | size)
		PxReal pointV;
		switch(size)
		{
		case 0: f = 0x2101, pointV = supportVertexV; break; // have to be a bit for 0,1 cases not to trash the cached verts, which also live in the simplex
		case 1:
		{
			PxReal aV = splatV(0,simplexV), lV = supportVertexV - aV, m = splatX(dot(lV,lV)), x = -splatX(dot(aV,lV));
			if(m==0) f = DEGENERATE, pointV = 0; // FAIL: coincident vertex
			else if(x<m) f = 0x2102, pointV = aV + lV * x/m; // REGION: line segment
			else f = 0x2111, pointV = supportVertexV; // REGION: new vertex
			break;
		}
		case 2: f = squawk::closestPointTri(simplexV, supportVertexV, pointV); break; // no worries for 2 & 3: only (0,1,2) are affected by permute, and they're already used
		case 3: f = squawk::closestPointTet(simplexV, supportVertexV, pointV); break;
		}
		cm = magnitude(pointV);
		int newSize = f&0xf;
		degenerate = f == DEGENERATE || (cm>=upperBound && newSize<=size);
		if(degenerate)
		{
			if(!warmStarted) // hit a problem warm-starting: just restart cold
				break;
			cacheMask = 0;
			goto restart;
		}
		// local minima happen. Continue so long as either the simplex size increases or the upper bound decreases,
		backupSize = cm >= upperBound ? size : 0;
		if(backupSize)
			backupSimplexV = simplexV, backupClosestV = closestPointV; // save the best simplex and size before updating state
		else
			upperBound = cm;
#if GJK_LOGGING
		PxReal px = pointV, py = shfl(pointV,1), pz = shfl(pointV,2);
		if(threadIdx.x == 0)
			printf("%04x (%f, %f, %f) %f\n", f, px, py, pz, cm);
#endif
		closestPointV = pointV, size = newSize; // update closest point and simplex
		// Permute the retained simplex vertices according to the feature code.
		PxU32 sourceIndex = f>>((threadIdx.x&28)+4)&0xf; // +4 because size is the first nibble
		PxReal tmp = __shfl_sync(FULL_MASK, simplexV, sourceIndex<<2 | (threadIdx.x&3));
		if(threadIdx.x<12)
			simplexV = tmp;
	}
	while(upperBound > 0 && lowerBound < upperBound * convergenceRatio);
	if(backupSize>0)
		simplexV = backupSimplexV, closestPointV = backupClosestV, size = backupSize;
	__syncwarp();
	storeSimplex(cacheData, simplexV);
	if(threadIdx.x == 0)
	{
		output.degenerate = degenerate;
		output.lowerBound = lowerBound;
		output.upperBound = upperBound;
		cacheData.size = size;
	}
	__syncwarp();
	if(upperBound == 0)
		return schlock::GjkResult::eOVERLAP;
	if(lowerBound>minSep)
		return schlock::GjkResult::eDISTANT;
	// Shapes are close: reconstruct witness points on A and B from the final
	// simplex via barycentric coordinates.
	PxU8 a0 = cacheData.vertices[0].a(), a1 = cacheData.vertices[1].a(), a2 = cacheData.vertices[2].a();
	PxU8 b0 = cacheData.vertices[0].b(), b1 = cacheData.vertices[1].b(), b2 = cacheData.vertices[2].b();
	PxVec3 closestPoint(closestPointV, __shfl_sync(FULL_MASK, closestPointV,1), __shfl_sync(FULL_MASK, closestPointV,2));
	if(threadIdx.x == 0)
	{
		PxVec3 closestPointDir = -closestPoint.getNormalized();
		output.closestPointDir = closestPointDir;
		PxReal p, q, r;
		if(size == 3)
		{
			schlock::barycentrics(cacheData.vertices[0].getV(), cacheData.vertices[1].getV(), cacheData.vertices[2].getV(), p, q, r, closestPoint);
			output.closestPointA = p * vA[a0] + q * vA[a1] + r * vA[a2];
		}
		else if(size == 2)
		{
			PxVec3 d = cacheData.vertices[1].getV()- cacheData.vertices[0].getV();
			PxReal q = -cacheData.vertices[0].getV().dot(d)/d.magnitudeSquared();
			output.closestPointA = (1-q) * vA[a0] + q * vA[a1];
		}
		else
			output.closestPointA = vA[a0];
		output.closestPointB = output.closestPointA + upperBound * closestPointDir;
	}
	return schlock::GjkResult::eCLOSE;
}
}
#undef DEGENERATE_SIZE_FLAG
#endif

View File

@@ -0,0 +1,443 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __HEIGHTFIELD_CUH__
#define __HEIGHTFIELD_CUH__
#include "geometry/PxHeightFieldSample.h"
#include "geometry/PxMeshScale.h"
#include "triangle.cuh"
#include "assert.h"
// Clamped floor of x into [0, rows-2]: row index of the cell containing x.
__device__ static inline physx::PxU32 getMinRow(physx::PxReal x, const physx::PxU32 rows)
{
	using namespace physx;
	const PxI32 row = PxI32(PxFloor(x));
	return PxU32(PxClamp(row, PxI32(0), PxI32(rows - 2)));
}
// Clamped ceiling of x into [0, rows-1]: last row touched by x.
__device__ static inline physx::PxU32 getMaxRow(physx::PxReal x, const physx::PxU32 rows)
{
	using namespace physx;
	const PxI32 row = PxI32(PxCeil(x));
	return PxU32(PxClamp(row, PxI32(0), PxI32(rows - 1)));
}
// Clamped floor of z into [0, columns-2]: column index of the cell containing z.
__device__ static inline physx::PxU32 getMinColumn(physx::PxReal z, const physx::PxU32 columns)
{
	using namespace physx;
	const PxI32 column = PxI32(PxFloor(z));
	return PxU32(PxClamp(column, PxI32(0), PxI32(columns - 2)));
}
// Clamped ceiling of z into [0, columns-1]: last column touched by z.
__device__ static inline physx::PxU32 getMaxColumn(physx::PxReal z, const physx::PxU32 columns)
{
	using namespace physx;
	const PxI32 column = PxI32(PxCeil(z));
	return PxU32(PxClamp(column, PxI32(0), PxI32(columns - 1)));
}
// True when vertexIndex addresses a sample inside the rows x columns grid.
__device__ static inline bool isValidVertex(physx::PxU32 vertexIndex, const physx::PxU32 rows, const physx::PxU32 columns)
{
	using namespace physx;
	const PxU32 vertexCount = rows * columns;
	return vertexIndex < vertexCount;
}
// Even triangle indices are the first triangle of their heightfield cell.
__device__ static inline bool isFirstTriangle(physx::PxU32 triangleIndex)
{
	return (triangleIndex % 2u) == 0;
}
// Direct sample lookup; callers guarantee vertexIndex is in range.
__device__ static inline const physx::PxHeightFieldSample& getSample(physx::PxU32 vertexIndex, const physx::PxHeightFieldSample* samples)
{
	return *(samples + vertexIndex);
}
// Height of the sample at vertexIndex, widened to PxReal.
__device__ static inline physx::PxReal getHeight(physx::PxU32 vertexIndex, const physx::PxHeightFieldSample* samples)
{
	using namespace physx;
	const PxHeightFieldSample& sample = getSample(vertexIndex, samples);
	return PxReal(sample.height);
}
// Material of the first triangle of the cell anchored at vertexIndex.
__device__ static inline physx::PxU16 getMaterialIndex0(physx::PxU32 vertexIndex, const physx::PxHeightFieldSample* samples)
{
	using namespace physx;
	const PxHeightFieldSample& sample = getSample(vertexIndex, samples);
	return sample.materialIndex0;
}
// Material of the second triangle of the cell anchored at vertexIndex.
__device__ static inline physx::PxU16 getMaterialIndex1(physx::PxU32 vertexIndex, const physx::PxHeightFieldSample* samples)
{
	using namespace physx;
	const PxHeightFieldSample& sample = getSample(vertexIndex, samples);
	return sample.materialIndex1;
}
// The sample's tessellation flag selects which diagonal splits the cell.
__device__ static inline bool isZerothVertexShared(physx::PxU32 vertexIndex, const physx::PxHeightFieldSample* samples)
{
	const physx::PxHeightFieldSample& sample = getSample(vertexIndex, samples);
	return sample.tessFlag() != 0;
}
// Heightfield-space position of a sample: x = row, z = column, y = height.
__device__ static inline physx::PxVec3 getVertex(physx::PxU32 vertexIndex, const physx::PxU32 columns, const physx::PxHeightFieldSample* samples)
{
	using namespace physx;
	// Decompose the linear sample index into (row, column).
	const PxU32 r = vertexIndex / columns;
	const PxU32 c = vertexIndex - r * columns;
	return PxVec3(PxReal(r), getHeight(vertexIndex, samples), PxReal(c));
}
// Heightfield space -> shape space: apply the per-axis scales.
__device__ static inline physx::PxVec3 hf2shapep(const physx::PxVec3& v, const physx::PxReal rowScale, const physx::PxReal heightScale, const physx::PxReal columnScale)
{
	using namespace physx;
	const PxReal sx = v.x * rowScale;
	const PxReal sy = v.y * heightScale;
	const PxReal sz = v.z * columnScale;
	return PxVec3(sx, sy, sz);
}
// Returns the three sample indices of a heightfield triangle. Each cell
// (triangleIndex >> 1) holds two triangles; the cell's tessellation flag
// selects which diagonal splits it (see diagrams), which determines the
// vertex winding for the first/second triangle.
__device__ static inline void getTriangleVertexIndices(physx::PxU32 triangleIndex, physx::PxU32& vertexIndex0, physx::PxU32& vertexIndex1, physx::PxU32& vertexIndex2, const physx::PxU32 columns, const physx::PxHeightFieldSample* samples)
{
	using namespace physx;
	const PxU32 cell = triangleIndex >> 1;
	if (isZerothVertexShared(cell, samples))
	{
		// <---- COL
		// 0----2 1 R
		// | 1 / /| O
		// | / / | W
		// | / / | |
		// |/ / 0 | |
		// 1 2----0 V
		//
		if (isFirstTriangle(triangleIndex))
		{
			vertexIndex0 = cell + columns;
			vertexIndex1 = cell;
			vertexIndex2 = cell + columns + 1;
		}
		else
		{
			vertexIndex0 = cell + 1;
			vertexIndex1 = cell + columns + 1;
			vertexIndex2 = cell;
		}
	}
	else
	{
		// <---- COL
		// 2 1----0 R
		// |\ \ 0 | O
		// | \ \ | W
		// | \ \ | |
		// | 1 \ \| |
		// 0----1 2 V
		//
		if (isFirstTriangle(triangleIndex))
		{
			vertexIndex0 = cell;
			vertexIndex1 = cell + 1;
			vertexIndex2 = cell + columns;
		}
		else
		{
			vertexIndex0 = cell + columns + 1;
			vertexIndex1 = cell + columns;
			vertexIndex2 = cell + 1;
		}
	}
}
// Returns the triangle indices adjacent to each edge of a heightfield
// triangle, or BOUNDARY for edges on the heightfield border. Edge k is the
// edge opposite vertex k of getTriangleVertexIndices' winding; the diagonal
// edge always pairs the two triangles of the same cell (triangleIndex +/- 1).
__device__ static inline void getTriangleAdjacencyIndices(physx::PxU32 triangleIndex, physx::PxU32& adjacencyIndex0, physx::PxU32& adjacencyIndex1, physx::PxU32& adjacencyIndex2, const physx::PxU32 rows, const physx::PxU32 columns,
	const physx::PxHeightFieldSample* samples)
{
	using namespace physx;
	const PxU32 cell = triangleIndex >> 1;
	if (isZerothVertexShared(cell, samples))
	{
		// <---- COL
		// 0----2 1 R
		// | 1 / /| O
		// | / / | W
		// | / / | |
		// |/ / 0 | |
		// 1 2----0 V
		//
		if (isFirstTriangle(triangleIndex))
		{
			adjacencyIndex0 = BOUNDARY;
			adjacencyIndex1 = triangleIndex + 1;	// diagonal: other triangle of this cell
			adjacencyIndex2 = BOUNDARY;
			// Not in the first column: neighbor across the column edge.
			if ((cell % (columns) != 0))
			{
				adjacencyIndex0 = triangleIndex - 1;
			}
			// Not in the last row: neighbor in the cell one row down.
			if ((cell / columns != rows - 2))
			{
				adjacencyIndex2 = ((cell + columns) * 2) + 1;
			}
		}
		else
		{
			adjacencyIndex0 = BOUNDARY;
			adjacencyIndex1 = triangleIndex - 1;	// diagonal: other triangle of this cell
			adjacencyIndex2 = BOUNDARY;
			// Not in the last column: neighbor across the column edge.
			if (cell % (columns) < (columns - 2))
			{
				adjacencyIndex0 = triangleIndex + 1;
			}
			// Not in the first row: neighbor in the cell one row up.
			if (cell >= columns - 1)
			{
				adjacencyIndex2 = (cell - columns) * 2;
			}
		}
	}
	else
	{
		// <---- COL
		// 2 1----0 R
		// |\ \ 0 | O
		// | \ \ | W
		// | \ \ | |
		// | 1 \ \| |
		// 0----1 2 V
		//
		if (isFirstTriangle(triangleIndex))
		{
			adjacencyIndex0 = BOUNDARY;
			adjacencyIndex1 = triangleIndex + 1;	// diagonal: other triangle of this cell
			adjacencyIndex2 = BOUNDARY;
			// Not in the first row: neighbor in the cell one row up.
			if (cell >= columns - 1)
			{
				adjacencyIndex0 = ((cell - (columns)) * 2) + 1;
			}
			// Not in the first column: neighbor across the column edge.
			if ((cell % (columns) != 0))
			{
				adjacencyIndex2 = triangleIndex - 1;
			}
		}
		else
		{
			adjacencyIndex0 = BOUNDARY;
			adjacencyIndex1 = triangleIndex - 1;	// diagonal: other triangle of this cell
			adjacencyIndex2 = BOUNDARY;
			// Not in the last row: neighbor in the cell one row down.
			if ((cell / columns != rows - 2))
			{
				adjacencyIndex0 = (cell + (columns)) * 2;
			}
			// Not in the last column: neighbor across the column edge.
			if (cell % (columns) < (columns - 2))
			{
				adjacencyIndex2 = triangleIndex + 1;
			}
		}
	}
}
__device__ static void getTriangle(physx::PxVec3& triLocV0, physx::PxVec3& triLocV1, physx::PxVec3& triLocV2, uint4* adjacencyIndices, const physx::PxU32 triangleIndex, const physx::PxMeshScale& scale, const physx::PxU32 rows, const physx::PxU32 columns, const physx::PxHeightFieldSample* samples)
{
using namespace physx;
// Outputs the three shape-space vertices of heightfield triangle `triangleIndex`.
// When `adjacencyIndices` is non-null, it also receives the indices of the
// edge-adjacent triangles. If an odd number of scale axes is negative
// (wrongHanded), vertices 1/2 and adjacency slots 0/2 are swapped so the
// triangle winding stays consistent after scaling.
const PxReal rowScale = scale.scale.x;
const PxReal heightScale = scale.scale.y;
const PxReal columnScale = scale.scale.z;
PxVec3 handedness(1.0f); // Vector to invert normal coordinates according to the heightfield scales
// NOTE(review): `handedness` is computed but never read in this function —
// looks like a leftover; confirm before removing.
bool wrongHanded = false;
if (columnScale < 0.f)
{
wrongHanded = !wrongHanded;
handedness.z = -1.0f;
}
if (rowScale < 0.f)
{
wrongHanded = !wrongHanded;
handedness.x = -1.0f;
}
//PxU32 tVertexIndices[3];
//getTriangleVertexIndices(triangleIndex, tVertexIndices[0], tVertexIndices[1 + wrongHanded], tVertexIndices[2 - wrongHanded], columns, samples);
PxU32 tVert0, tVert1, tVert2;
getTriangleVertexIndices(triangleIndex, tVert0, tVert1, tVert2, columns, samples);
if (adjacencyIndices)
{
/*PxU32 tAdjInd0 = wrongHanded ? adjacencyIndices->z : adjacencyIndices->x;
PxU32 tAdjInd1 = adjacencyIndices->y;
PxU32 tAdjInd2 = wrongHanded ? adjacencyIndices->x : adjacencyIndices->z;*/
PxU32 tAdjInd0, tAdjInd1, tAdjInd2;
getTriangleAdjacencyIndices(triangleIndex, tAdjInd0, tAdjInd1, tAdjInd2, rows, columns, samples);
// Mirror the vertex 1/2 swap below in the adjacency output.
adjacencyIndices->x = wrongHanded ? tAdjInd2 : tAdjInd0;
adjacencyIndices->y = tAdjInd1;
adjacencyIndices->z = wrongHanded ? tAdjInd0 : tAdjInd2;
}
// Swap vertices 1 and 2 for negative-handed scales, then apply the scale.
const PxVec3 vertex0 = getVertex(tVert0, columns, samples);
const PxVec3 vertex1 = getVertex(wrongHanded ? tVert2 : tVert1, columns, samples);
const PxVec3 vertex2 = getVertex(wrongHanded ? tVert1 : tVert2, columns, samples);
triLocV0 = hf2shapep(vertex0, rowScale, heightScale, columnScale);
triLocV1 = hf2shapep(vertex1, rowScale, heightScale, columnScale);
triLocV2 = hf2shapep(vertex2, rowScale, heightScale, columnScale);
}
__device__ static bool testLocalPoint(const physx::PxVec3& pnt, const physx::PxU32 rows, const physx::PxU32 columns, const physx::PxHeightFieldSample* samples, float& height)
{
using namespace physx;
// Tests whether a heightfield-local point lies below the interpolated surface.
// On any in-range query, `height` receives the interpolated surface height at
// (pnt.x, pnt.z). Returns false for points outside the (row, column) extents.
if (pnt.x < 0 || pnt.x > rows - 1 || pnt.z < 0 || pnt.z > columns - 1)
return false;
// Locate the cell containing the point and fetch its four corner heights.
PxU32 row = getMinRow(pnt.x, rows);
PxU32 column = getMinColumn(pnt.z, columns);
PxU32 vertex = row * columns + column;
PxReal h0 = getHeight(vertex, samples);
PxReal h1 = getHeight(vertex + 1, samples);
PxReal h2 = getHeight(vertex + columns, samples);
PxReal h3 = getHeight(vertex + columns + 1, samples);
// Fractional position of the point within the cell.
PxReal s = pnt.x - PxFloor(pnt.x);
PxReal t = pnt.z - PxFloor(pnt.z);
// Bilinear interpolation over the quad; the exact surface is two triangles,
// so this is an approximation (see the original author's note below).
// NOTE(review): h1 (next column) is weighted by the row fraction `s` — the
// s/t pairing looks swapped relative to the grid layout; confirm.
PxReal h = h0 * (1 - s) * (1 - t) + h1 * s * (1 - t) + h2 * (1 - s) * t + h3 * s * t; // @@@ That's wrong. Should find point on triangle
height = h;
return h > pnt.y;
}
__device__ static inline physx::PxU32 heightfieldComputePairs(
	const physx::PxU32 minColumn,
	const physx::PxU32 maxColumn,
	const physx::PxU32 minRow,
	const physx::PxU32 maxRow,
	const physx::PxU32 nbRows,
	const physx::PxU32 nbCols,
	physx::PxHeightFieldSample* samples,
	const physx::PxU32 miny,
	const physx::PxU32 maxy
)
{
	using namespace physx;
	// Counts candidate heightfield triangles in the cell range
	// [minRow, maxRow) x [minColumn, maxColumn): triangles that are not holes
	// and whose cell's height range overlaps [miny, maxy].
	const PxU32 columnSpan = maxColumn - minColumn;
	// Two materials (triangles) per cell, hence two iterations per vertex.
	const PxU32 totalNumProcessed = (maxRow - minRow) * columnSpan * 2;
	PxU32 nbPairs = 0;
	for (PxU32 i = 0; i < totalNumProcessed; ++i)
	{
		const PxU32 cellIdx = i >> 1;
		const PxU32 vertexIndex = (minRow + cellIdx / columnSpan) * nbCols + (minColumn + cellIdx % columnSpan);
		assert(isValidVertex(vertexIndex, nbRows, nbCols));
		// Heights of the cell's four corners.
		const PxReal h0 = getHeight(vertexIndex, samples);
		const PxReal h1 = getHeight(vertexIndex + 1, samples);
		const PxReal h2 = getHeight(vertexIndex + nbCols, samples);
		const PxReal h3 = getHeight(vertexIndex + nbCols + 1, samples);
		// Skip when the query's y-range lies entirely above or below the cell.
		const bool allAbove = maxy < h0 && maxy < h1 && maxy < h2 && maxy < h3;
		const bool allBelow = miny > h0 && miny > h1 && miny > h2 && miny > h3;
		if (allAbove || allBelow)
			continue;
		const PxHeightFieldSample& sample = getSample(vertexIndex, samples);
		// Odd iterations test the cell's second triangle/material.
		const bool isMaterial1 = (i & 1) != 0;
		const PxU32 material = isMaterial1 ? sample.materialIndex1 : sample.materialIndex0;
		if (material != PxHeightFieldMaterial::eHOLE)
			++nbPairs;
	}
	return nbPairs;
}
__device__ static inline void heightfieldOutputPairs(
	const physx::PxU32 minColumn,
	const physx::PxU32 maxColumn,
	const physx::PxU32 minRow,
	const physx::PxU32 maxRow,
	const physx::PxU32 nbRows,
	const physx::PxU32 nbCols,
	physx::PxHeightFieldSample* samples,
	const physx::PxU32 miny,
	const physx::PxU32 maxy,
	const physx::PxU32 cmInd,
	const physx::PxU32 elementInd, //either tetrahedron id or triangle id
	const physx::PxU32 startOffset,
	const physx::PxU32 size,
	uint4* stackPtr //output
)
{
	using namespace physx;
	// Emits a (contact manager, element, heightfield triangle) tuple to the stack
	// for every candidate triangle, using the same cell filtering as
	// heightfieldComputePairs. Writing stops once `size` entries are reached.
	const PxU32 columnSpan = maxColumn - minColumn;
	// Two materials (triangles) per cell, hence two iterations per vertex.
	const PxU32 totalNumProcessed = (maxRow - minRow) * columnSpan * 2;
	PxU32 writeIndex = startOffset;
	for (PxU32 i = 0; i < totalNumProcessed && writeIndex < size; ++i)
	{
		const PxU32 cellIdx = i >> 1;
		const PxU32 vertexIndex = (minRow + cellIdx / columnSpan) * nbCols + (minColumn + cellIdx % columnSpan);
		assert(isValidVertex(vertexIndex, nbRows, nbCols));
		// Heights of the cell's four corners.
		const PxReal h0 = getHeight(vertexIndex, samples);
		const PxReal h1 = getHeight(vertexIndex + 1, samples);
		const PxReal h2 = getHeight(vertexIndex + nbCols, samples);
		const PxReal h3 = getHeight(vertexIndex + nbCols + 1, samples);
		// Skip when the query's y-range lies entirely above or below the cell.
		const bool allAbove = maxy < h0 && maxy < h1 && maxy < h2 && maxy < h3;
		const bool allBelow = miny > h0 && miny > h1 && miny > h2 && miny > h3;
		if (allAbove || allBelow)
			continue;
		const PxHeightFieldSample& sample = getSample(vertexIndex, samples);
		// Odd iterations correspond to the cell's second triangle/material.
		const bool isMaterial1 = (i & 1) != 0;
		const PxU32 material = isMaterial1 ? sample.materialIndex1 : sample.materialIndex0;
		if (material != PxHeightFieldMaterial::eHOLE)
		{
			// Triangle index = vertexIndex * 2 (+1 for the second triangle).
			const PxU32 triangleIdx = (vertexIndex << 1) + (isMaterial1 ? 1u : 0u);
			stackPtr[writeIndex] = make_uint4(cmInd, elementInd, triangleIdx, 0);
			++writeIndex;
		}
	}
}
#endif

View File

@@ -0,0 +1,128 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __MANIFOLD_CUH__
#define __MANIFOLD_CUH__
#include "foundation/PxTransform.h"
#include "PxgPersistentContactManifold.h"
#include "PxgCommonDefines.h"
#include "dataReadWriteHelper.cuh"
#define MULTIMANIFOLD_MAX_MANIFOLDS 4
#define SUBMANIFOLD_MAX_CONTACTS 6
__device__ static inline bool refreshManifolds(
const physx::PxTransform & aToB,
physx::PxReal projectBreakingThreshold,
physx::PxgPersistentContactMultiManifold * PX_RESTRICT multiManifold
)
{
// Warp-cooperative refresh of a persistent multi-manifold: thread i handles
// contact (i % SUBMANIFOLD_MAX_CONTACTS) of manifold (i / SUBMANIFOLD_MAX_CONTACTS).
// Each cached contact's penetration is updated for the new relative transform
// `aToB`; if any contact has drifted tangentially beyond the breaking threshold,
// the whole multi-manifold is cleared and true is returned to request full
// contact generation.
using namespace physx;
const PxU32 threadIdxInWarp = threadIdx.x & (WARP_SIZE - 1);
// Compare squared distances to avoid a square root per contact.
const PxReal sqProjectBreakingThreshold = projectBreakingThreshold * projectBreakingThreshold;
const PxU32 numManifolds = multiManifold->mNbManifolds;
PxU32 i = threadIdxInWarp;
{
bool lostContact = false;
PxU8 curManifoldIdx = i / SUBMANIFOLD_MAX_CONTACTS;
// refreshContactPoints
if (curManifoldIdx < numManifolds)
{
PxU32 nbContacts = multiManifold->mNbContacts[curManifoldIdx];
PxU8 curContactIdx = i % SUBMANIFOLD_MAX_CONTACTS;
if (curContactIdx < nbContacts)
{
PxReal penNew;
PxgContact & curContact = multiManifold->mContacts[curManifoldIdx][curContactIdx];
PxVec3 pointA = curContact.pointA;
PxVec3 pointB = curContact.pointB;
const PxVec3 localAInB = aToB.transform(pointA); // from a to b
const PxVec3 v = localAInB - pointB;
PxVec3 normal = curContact.normal;
PxReal penOld = curContact.penetration;
// New penetration: separation along the cached contact normal.
penNew = v.dot(normal);
// Tangential drift: distance between the cached point B and point A
// projected back onto the old contact plane.
const PxVec3 projectedPoint = localAInB - normal * penOld;
const PxVec3 projectedDifference = pointB - projectedPoint;
const PxReal distance2d = projectedDifference.magnitudeSquared();
curContact.penetration = penNew;
if (distance2d > sqProjectBreakingThreshold)
{
lostContact = true;
}
}
}
// If we lost any contact - we do the full contact gen, we don't need any contacts then
if (__any_sync(FULL_MASK, (PxU32)lostContact))
{
if(threadIdxInWarp == 0)
multiManifold->mNbManifolds = 0;
return true;
}
}
return false;
}
__device__ static inline bool invalidateManifold(const physx::PxTransform & aToB, physx::PxgPersistentContactMultiManifold& multiManifold,
const physx::PxReal minMargin, const physx::PxReal ratio)
{
	// Decides whether the cached multi-manifold must be discarded because the
	// relative transform between the two shapes has moved too far from the one
	// the manifold was generated with.
	//
	// Consistency fix: `ratio` is now qualified as physx::PxReal like every other
	// parameter; the bare `PxReal` relied on a using-directive from another header.
	using namespace physx;
	// Read the transform stored with the manifold (warp-cooperative helper,
	// per its *_ReadWarp naming — all callers must invoke this warp-wide).
	PxAlignedTransform prevRelativeTransform;
	PxAlignedTransform_ReadWarp(prevRelativeTransform, &multiManifold.mRelativeTransform);
	const PxReal thresholdP = minMargin * ratio; // max tolerated positional drift
	const PxReal thresholdQ = 0.9998f;           // quaternion dot threshold, about 1 degree
	// Largest per-axis translation delta since the manifold was built.
	const PxVec3 delta = aToB.p - PxVec3(prevRelativeTransform.p.x, prevRelativeTransform.p.y, prevRelativeTransform.p.z);
	const PxReal deltaP = PxMax(PxAbs(delta.x), PxMax(PxAbs(delta.y), PxAbs(delta.z)));
	// Quaternion dot product measures rotational proximity (1 == identical).
	const PxReal deltaQ = aToB.q.dot((PxQuat)prevRelativeTransform.q);
	// Invalidate on either excessive translation or excessive rotation.
	return (deltaP > thresholdP) || (deltaQ < thresholdQ);
}
#endif

View File

@@ -0,0 +1,51 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __MATERIAL_COMBINER_CUH__
#define __MATERIAL_COMBINER_CUH__
#include "PxsMaterialCombiner.h"
__device__ static __forceinline__
void combineMaterials(const PxsMaterialData* PX_RESTRICT materials, PxU16 index0, PxU16 index1,
PxU32& combinedFlags,
PxReal& combinedStaticFriction,
PxReal& combinedDynamicFriction,
PxReal& combinedRestitution,
PxReal& combinedDamping
)
{
	// Look up the two materials by index and delegate the actual combination
	// rules (friction, restitution, flags, damping) to the shared
	// PxsCombineMaterials helper.
	PxsCombineMaterials(materials[index0], materials[index1],
	                    combinedStaticFriction, combinedDynamicFriction,
	                    combinedRestitution, combinedFlags, combinedDamping);
}
#endif

View File

@@ -0,0 +1,165 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __MIDPHASE_ALLOCATION_CUH__
#define __MIDPHASE_ALLOCATION_CUH__
#include "foundation/PxSimpleTypes.h"
#include "convexNpCommon.h"
#include <vector_types.h>
using namespace physx;
/*
// TODO: read/write infos
AD 8.2023: some comments about all of this. The size of the collision stack is PxGpuDynamicsMemoryConfig::collisionStackSize
The following parameters are involved:
* numpairsAfterMP: The number of convex - triangle pairs after the midphase. These are pairs between 1 triangle of a trimesh
and a complete convex. Will be padded to a multiple of 4 during stack allocation to satisfy 16B alignment.
* numPaddedPairsAfterMP: A pair here is again a single triangle of a trimesh and a complete convex. But in contrast to the one above,
the list holding the triangle indices is padded to multiples of 4 and duplicated for each contact manager.
This is done because we do a radix sort on these indices later, which happens on uint4s. So all the lists
need to be padded to a multiple of 4, and we need the temp buffer as well. So this count represents the length
of the flat list needed to place all the triangle indices, with per-CM padding and temp buffers.
The list will look something like this (x is padding, y is temp buffer):
cm 0 | cm 1 | cm 2 ..
0 1 2 p t t t t | 3 4 p p t t t t | 5 6 7 8 9 p p p t t t t t t t t | ...
numPairsAfterMP4 = (numPairsAfterMP + 3) & (~3)
This is the layout of the collision stack (growing to the bottom):
|---------
|* PairsGPU: uint4 x numPairsAfterMP4
| x: contact manager index
| y: triangle index (of the triangle mesh)
| z: offset from first triangle found for this contact manager
| w: npWorkItem.shapeRef1 (ref of convex mesh)
|
|---------
|* TriNormalAndIndex: ConvexTriNormalAndIndex x numPairsAfterMP4
| x,y,z: minNormal from convexTriangleContactGen (confirm: the normal is from a convex mesh triangle, right?)
| w: remapCPUIndex of trimesh triangle index. (What? Why?)
|---------
|* Contacts: ConvexTriContacts x numPairsAfterMP4
|
|---------
|* TriangleIndices: PxU32 * numPaddedPairsAfterMP
|
|---------
|* maxDepth: PxReal * numPairsAfterMP4
|
|---------
|* intermediate data: ConvexTriIntermediateData * numPairsAfterMP4
|
|---------
|* secondPass data: PxU32 * numPairsAfterMP4
We only fill the pairs array in the midphase kernels. Everything else will be written in the core narrowphase kernels.
Note that pairs belonging to the same contact manager are not necessarily consecutive in the pairs list, because of the warp-wide
write in the midphase. We do maintain a count + offset into a per-CM list for each pair when filling the pairs list with atomics.
This count + offset is then used to calculate the index into per-CM lists to write all the other data in the core narrowphase kernels.
*/
__device__ __forceinline__ PxU32 calculateConvexMeshPairMemRequirement()
{
	// Bytes of collision-stack memory consumed by one convex-triangle pair,
	// summed over all per-pair arrays laid out on the stack (see the layout
	// comment at the top of this file). Padding is accounted for separately.
	PxU32 bytes = sizeof(uint4);                // pair record written by the midphase
	bytes += sizeof(ConvexTriNormalAndIndex);   // contact normal + triangle index
	bytes += sizeof(ConvexTriContacts);         // up to 4 contacts per pair
	bytes += sizeof(PxReal);                    // max penetration depth
	bytes += sizeof(ConvexTriIntermediateData); // intermediate contact-gen data
	bytes += sizeof(PxU32);                     // second-pass index
	bytes += 2 * sizeof(PxU32);                 // triangle index + radix-sort temp (padding excluded)
	return bytes;
}
// This is calculating the worst-case padding intentionally to avoid stack overflows!
// This is calculating the worst-case padding intentionally to avoid stack overflows!
__device__ PX_FORCE_INLINE PxU32 calculateAdditionalPadding(const PxU32 numContactManagers)
{
	// Pair counts are rounded up to a multiple of 4, i.e. at most 3 extra pairs in
	// every per-pair array except the triangle-index lists, which are allocated
	// with their own (per-CM padded) count and are therefore excluded here.
	const PxU32 perPairBytesWithoutTriIndices = calculateConvexMeshPairMemRequirement() - 2 * sizeof(PxU32);
	const PxU32 afterPairPadding = 3 * perPairBytesWithoutTriIndices;
	// The triangle-index list is padded to a multiple of 4 per contact manager and
	// mirrored by the radix-sort temp buffer: up to (3 + 3) PxU32s per manager.
	const PxU32 afterTriangleIndexPadding = numContactManagers * 6 * sizeof(PxU32);
	return afterPairPadding + afterTriangleIndexPadding;
}
__device__ PX_FORCE_INLINE PxU32 calculateMaxPairs(const PxU32 stackSizeBytes, const PxU32 numContactManagers)
{
	// Maximum number of convex-triangle pairs that fit in the collision stack:
	// reserve the worst-case padding first, then divide the remaining space by
	// the per-pair footprint.
	const PxU32 usableBytes = stackSizeBytes - calculateAdditionalPadding(numContactManagers);
	return usableBytes / calculateConvexMeshPairMemRequirement();
}
__device__ __forceinline__ void midphaseAllocate( ConvexTriNormalAndIndex** cvxTriNIGPU, ConvexTriContacts** cvxTriContactsGPU,
	PxReal** cvxTriMaxDepthGPU, ConvexTriIntermediateData** cvxTriIntermGPU, PxU32** orderedCvxTriIntermGPU,
	PxU32** cvxTriSecondPassedGPU, uint4** pairsGPU,
	PxU8 * stackPtr, const PxU32 numpairsAfterMPUnpadded, const PxU32 numPaddedPairsAfterMP)
{
	// Carves the per-pair narrowphase arrays out of the collision stack according
	// to the layout documented at the top of this file. Only pointers are
	// computed here; the arrays are filled later by the narrowphase kernels.
	//
	// Pad the pair count to a multiple of 4 so every array starts at a properly
	// aligned (16-byte) offset.
	const PxU32 numPairsAfterMP = (numpairsAfterMPUnpadded + 3) & (~3);
	// stackPtr is already a PxU8*; the previous redundant reinterpret_cast
	// (flagged "I don't know why this line is needed") has been removed.
	PxU8* cursor = stackPtr;
	// The pair records already exist — they were written by the midphase kernel.
	// This just skips past them so the remaining arrays land behind the pair list.
	*pairsGPU = reinterpret_cast<uint4*>(cursor);
	cursor += numPairsAfterMP * sizeof(uint4);
	// Contact normal + triangle index per pair.
	*cvxTriNIGPU = reinterpret_cast<ConvexTriNormalAndIndex*>(cursor);
	cursor += numPairsAfterMP * sizeof(ConvexTriNormalAndIndex);
	// Contact points per pair (ConvexTriContacts has space for 4 contacts).
	*cvxTriContactsGPU = reinterpret_cast<ConvexTriContacts*>(cursor);
	cursor += numPairsAfterMP * sizeof(ConvexTriContacts);
	// Ordered triangle indices: numPaddedPairsAfterMP is already padded per
	// contact manager and duplicated to allow for a radix sort (see convexNpCommon.h).
	*orderedCvxTriIntermGPU = reinterpret_cast<PxU32*>(cursor);
	cursor += numPaddedPairsAfterMP * sizeof(PxU32);
	// Maximum penetration depth per pair.
	*cvxTriMaxDepthGPU = reinterpret_cast<PxReal*>(cursor);
	cursor += numPairsAfterMP * sizeof(PxReal);
	// Intermediate contact-generation data per pair.
	*cvxTriIntermGPU = reinterpret_cast<ConvexTriIntermediateData*>(cursor);
	cursor += numPairsAfterMP * sizeof(ConvexTriIntermediateData);
	// Second-pass pair indices.
	*cvxTriSecondPassedGPU = reinterpret_cast<PxU32*>(cursor);
}
#endif

View File

@@ -0,0 +1,193 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __NPUTILS_CUH__
#define __NPUTILS_CUH__
#include "foundation/PxQuat.h"
#include "foundation/PxTransform.h"
#include "foundation/PxVec3.h"
#include "PxgCommonDefines.h"
#include "utils.cuh"
// Load/store helpers for objects accessed through volatile references. The
// generic versions rely on the type's assignment; PxVec3/PxQuat/PxTransform are
// specialized to copy component-wise — presumably because their copy operations
// do not accept volatile-qualified operands (TODO confirm).
template <typename T> __device__ static inline T ldS(const volatile T& val) { return val; }
template <typename T> __device__ static inline void stS(volatile T& dst, const T& src) { dst = src; }
template <> __device__ inline physx::PxVec3 ldS<physx::PxVec3>(const volatile physx::PxVec3& val) { return physx::PxVec3(val.x, val.y, val.z); }
template <> __device__ inline void stS<physx::PxVec3>(volatile physx::PxVec3& dst, const physx::PxVec3& src) { dst.x = src.x, dst.y = src.y, dst.z = src.z; }
template <> __device__ __forceinline__ physx::PxQuat ldS<physx::PxQuat>(const volatile physx::PxQuat& val) { return physx::PxQuat(val.x, val.y, val.z, val.w); }
template <> __device__ __forceinline__ void stS<physx::PxQuat>(volatile physx::PxQuat& dst, const physx::PxQuat& src) { dst.x = src.x, dst.y = src.y, dst.z = src.z, dst.w = src.w; }
// PxTransform delegates to the PxVec3/PxQuat specializations above.
template <> __device__ __forceinline__ physx::PxTransform ldS<physx::PxTransform>(const volatile physx::PxTransform& val) { return physx::PxTransform(ldS(val.p), ldS(val.q)); }
template <> __device__ __forceinline__ void stS<physx::PxTransform>(volatile physx::PxTransform& dst, const physx::PxTransform& src)
{
stS(dst.p, src.p);
stS(dst.q, src.q);
}
static __device__ __forceinline__ int next3(int x)
{
	// Next index in the cyclic order 0 -> 1 -> 2 -> 0 (input expected in [0,2]).
	if (x == 2)
		return 0;
	return x + 1;
}
static __device__ inline int prev3(int x)
{
	// Previous index in the cyclic order 0 -> 2 -> 1 -> 0 (input expected in [0,2]).
	if (x == 0)
		return 2;
	return x - 1;
}
// operators dealing with 8 4-vectors in a warp
static __device__ inline float dot(float a, float b) // leaves answer in x
{
	// Per-lane multiply, then fold the y and z products down onto the lane that
	// holds the x component; only that lane ends up with the full dot product.
	const float prod = a * b;
	const float partial = prod + __shfl_down_sync(FULL_MASK, prod, 1);
	return partial + __shfl_down_sync(FULL_MASK, prod, 2);
}
static __device__ inline float cross(float a, float b, int cp0, int cp1)
{
	// One component of a cross product: a[cp0]*b[cp1] - a[cp1]*b[cp0], with the
	// operands gathered from the given lanes of the warp.
	const float lhs = __shfl_sync(FULL_MASK, a, cp0) * __shfl_sync(FULL_MASK, b, cp1);
	const float rhs = __shfl_sync(FULL_MASK, a, cp1) * __shfl_sync(FULL_MASK, b, cp0);
	return lhs - rhs;
}
static __device__ inline float splatV(int index, float v) // splat a vector across all 8 vectors
{
	// Each 4-lane group reads the matching component of vector #index, which
	// lives in lanes 4*index .. 4*index+3.
	const int srcLane = (index << 2) | (threadIdx.x & 3);
	return __shfl_sync(FULL_MASK, v, srcLane);
}
static __device__ inline float splatX(float a) // splat the x-component of each vector into yzw
{
// 28 clears the low two bits (for lane ids < 32), i.e. the lane holding this
// 4-lane group's x component.
return __shfl_sync(FULL_MASK, a, threadIdx.x & 28);
}
static __device__ inline float normalize(float x) // normalize each vector
{
	// Reciprocal length lands in the x lane of each group; splat it to y/z/w
	// before scaling.
	const float invLen = rsqrt(dot(x, x)); // TODO: precision
	return x * splatX(invLen);
}
static __device__ inline float magnitude(float x) // magnitude of each vector, splatted to all lanes of the group
{
return splatX(sqrt(dot(x,x))); // TODO: precision
}
template<int N>
static __device__ inline int packXFlags(bool a) // pack a boolean from each x into a byte
{
// Lane k fetches the flag held by lane 4k (the x lane of vector k); the ballot
// then packs those flags into consecutive bits, of which the low N are kept.
return __ballot_sync(FULL_MASK, __shfl_sync(FULL_MASK, a, threadIdx.x<<2)) & ((1<<N)-1);
}
template<int N>
static __device__ inline bool allX(bool a) // check that the x-flags of the first N vectors are all set
{
return packXFlags<N>(a) == ((1<<N)-1);
}
template<int N>
static __device__ inline bool anyX(bool a) // check if any of the first N x-flags is set
{
return packXFlags<N>(a) != 0;
}
static __device__ inline float loadV3Unsafe(const physx::PxVec3* v) // NB requires padding (i.e. the 4th element is loaded)
{
// Each lane of a 4-lane group loads one float of the vector; the fourth lane
// reads one float past the PxVec3, hence the padding requirement / "unsafe".
return reinterpret_cast<const float*>(v)[threadIdx.x&3];
}
static __device__ void storeV3(volatile physx::PxVec3* v, int index, float value)
{
// Store vector #index (held in lanes 4*index .. 4*index+2) into *v.
// Only the three lanes holding x, y, z write; the padding lane does nothing.
int lane = index<<2;
volatile float* dest = reinterpret_cast<volatile float*>(v);
if(threadIdx.x>=lane && threadIdx.x<lane+3)
{
dest[threadIdx.x&3] = value;
}
}
// experimentally, seems more register-efficient to coalesce this
// experimentally, seems more register-efficient to coalesce this
static __device__ __forceinline__
physx::PxReal shuffleDot(const physx::PxU32 syncMask, const physx::PxVec3& v0, int shuffle0, const physx::PxVec3& v1)
{
	// Dot product of lane `shuffle0`'s v0 with this lane's v1, shuffling the
	// three components individually.
	const physx::PxReal px = __shfl_sync(syncMask, v0.x, shuffle0) * v1.x;
	const physx::PxReal py = __shfl_sync(syncMask, v0.y, shuffle0) * v1.y;
	const physx::PxReal pz = __shfl_sync(syncMask, v0.z, shuffle0) * v1.z;
	return px + py + pz;
}
static __device__ __forceinline__
physx::PxU32 maxIndex(physx::PxReal v, physx::PxU32 mask, physx::PxReal& maxV)
{
// Warp-wide arg-max over the lanes selected by `mask`: returns the index of a
// lane holding the maximum (lowest such lane on ties) and leaves the maximum
// itself in `maxV` on every lane. Lanes outside the mask contribute -FLT_MAX.
maxV = mask & (1 << (threadIdx.x&31)) ? v : -FLT_MAX;
// Butterfly (xor-shuffle) reduction across all 32 lanes.
maxV = fmaxf(maxV, __shfl_xor_sync(FULL_MASK, maxV, 16));
maxV = fmaxf(maxV, __shfl_xor_sync(FULL_MASK, maxV, 8));
maxV = fmaxf(maxV, __shfl_xor_sync(FULL_MASK, maxV, 4));
maxV = fmaxf(maxV, __shfl_xor_sync(FULL_MASK, maxV, 2));
maxV = fmaxf(maxV, __shfl_xor_sync(FULL_MASK, maxV, 1));
// Pick the lowest-index masked lane whose value equals the maximum.
return physx::lowestSetIndex(__ballot_sync(FULL_MASK, maxV == v)&mask);
}
static __device__ __forceinline__
physx::PxU32 minIndex(physx::PxReal v, physx::PxU32 mask, physx::PxReal& minV)
{
// Warp-wide arg-min over the lanes selected by `mask`: returns the index of a
// lane holding the minimum (lowest such lane on ties) and leaves the minimum
// itself in `minV` on every lane. Lanes outside the mask contribute FLT_MAX.
minV = mask & (1 << (threadIdx.x & 31)) ? v : FLT_MAX;
// Butterfly (xor-shuffle) reduction across all 32 lanes.
minV = fminf(minV, __shfl_xor_sync(FULL_MASK, minV, 16));
minV = fminf(minV, __shfl_xor_sync(FULL_MASK, minV, 8));
minV = fminf(minV, __shfl_xor_sync(FULL_MASK, minV, 4));
minV = fminf(minV, __shfl_xor_sync(FULL_MASK, minV, 2));
minV = fminf(minV, __shfl_xor_sync(FULL_MASK, minV, 1));
// Pick the lowest-index masked lane whose value equals the minimum.
return physx::lowestSetIndex(__ballot_sync(FULL_MASK, minV == v)&mask);
}
// similar as above but only with blocks of 4 threads
// mask must have exactly 4 consecutive bits set and the corresponding
// threads must execute this function together
static __device__ __forceinline__
physx::PxU32 minIndex4(physx::PxReal v, physx::PxU32 mask, physx::PxU32 threadIndexInGroup)
{
// Arg-min over a 4-lane group: returns the warp-lane index of the lane holding
// the minimum value (lowest such lane on ties).
// sanity checks that function has been called as expected
assert(__popc(mask) == 4); // exactly 4 threads in mask
assert(mask & (1 << (threadIdx.x & 31))); // executing thread must be in the mask
assert(threadIndexInGroup == 0 || (mask & (1 << (threadIdx.x - 1 & 31)))); // threads consecutive
physx::PxReal minV = v;
// Step 1: lanes 1..3 fold in their left neighbour's value.
physx::PxReal minV_m1 = __shfl_sync(mask, minV, (threadIdx.x & 31) - 1);
if(threadIndexInGroup > 0)
minV = fminf(minV, minV_m1);
// Step 2: lane 3 folds in lane 1's partial min, completing the group minimum.
physx::PxReal minV_m2 = __shfl_sync(mask, minV, (threadIdx.x & 31) - 2);
if(threadIndexInGroup > 2)
minV = fminf(minV, minV_m2);
// Now the last thread (idx 3) of the group knows the min.
// Send it back to all the threads.
minV = __shfl_sync(mask, minV, (threadIdx.x & 31) | 3);
return physx::lowestSetIndex(__ballot_sync(mask, minV == v)&mask);
}
#endif

View File

@@ -0,0 +1,190 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include <assert.h>
#include "SparseRemove.cuh"
#include "cudaNpCommon.h"
#include "PxgPersistentContactManifold.h"
#include "PxsContactManagerState.h"
#include "PxgContactManager.h"
#include "PxgNpKernelIndices.h"
namespace physx
{
	class PxsContactManager;

	// Empty host-side stub; presumably referenced from host code to force this
	// kernel translation unit to be linked -- TODO confirm.
	extern "C" __host__ void initNarrowphaseKernels7() {}
}
/**
 * Stage 1 of contact-manager removal: initializes the keep/drop marker
 * buffer over all pairs before the removals are flagged in stage 2.
 */
extern "C" __global__ void removeContactManagers_Stage1(const PxgPairManagementData* PX_RESTRICT pairData)
{
	const PxU32 nbPairs = pairData->mNbPairs;
	const PxU32 nbToRemove = pairData->mNbToRemove;
	initializeKeepDropBuffer(pairData->mTempAccumulator, nbPairs, nbToRemove);
}
/**
 * Stage 2 of contact-manager removal: marks the pairs listed in
 * mRemoveIndices as dropped in the keep/drop buffer.
 */
extern "C" __global__ void removeContactManagers_Stage2(const PxgPairManagementData* PX_RESTRICT pairData)
{
	const PxU32 nbToRemove = pairData->mNbToRemove;
	const PxU32 nbPairs = pairData->mNbPairs;
	markKeepDropBuff(pairData->mRemoveIndices, nbToRemove, pairData->mTempAccumulator, nbPairs);
}
/**
 * Stage 3 of contact-manager removal: per-block processing of the keep/drop
 * buffer (the template arguments select the launch dimensions this kernel is
 * expected to run with).
 */
extern "C" __global__ void removeContactManagers_Stage3(const PxgPairManagementData* PX_RESTRICT pairData)
{
	const PxU32 nbPairs = pairData->mNbPairs;
	processKeepDropBuff<PxgNarrowPhaseBlockDims::REMOVE_CONTACT_MANAGERS, PxgNarrowPhaseGridDims::REMOVE_CONTACT_MANAGERS>(
		pairData->mTempAccumulator, nbPairs, pairData->mBlockSharedAccumulator);
}
/**
 * Stage 4 of contact-manager removal: accumulates the per-block keep/drop
 * results into the global run-sum buffer consumed by stage 5.
 */
extern "C" __global__ void removeContactManagers_Stage4(const PxgPairManagementData* PX_RESTRICT pairData)
{
	const PxU32 nbPairs = pairData->mNbPairs;
	accumulateKeepDrop<PxgNarrowPhaseGridDims::REMOVE_CONTACT_MANAGERS>(
		pairData->mTempAccumulator, nbPairs, pairData->mBlockSharedAccumulator);
}
/**
 * Stage 5 of contact-manager removal: compacts the contact-manager arrays by
 * copying each surviving pair from its source slot into the hole left by a
 * removed pair. Work is distributed in groups of 16 threads per swap: one
 * lane of each group copies the scalar per-pair records, all 16 lanes
 * cooperate on the (optional) manifold copy.
 */
extern "C" __global__ void removeContactManagers_Stage5(const PxgPairManagementData* PX_RESTRICT pairData, const bool copyManifold)
{
	const PxU32* PX_RESTRICT runSum = pairData->mTempAccumulator;
	const PxU32 nbPairs = pairData->mNbPairs;
	const PxU32 nbSwaps = getNbSwapsRequired(runSum, nbPairs, pairData->mNbToRemove);

	PxgContactManagerInput* inputs = pairData->mContactManagerInputData;
	PxsContactManagerOutput* outputs = pairData->mContactManagerOutputData;
	PxgPersistentContactManifold* manifolds = reinterpret_cast<PxgPersistentContactManifold*>(pairData->mPersistentContactManagers);
	PxsContactManager** cpuManagers = pairData->mCpuContactManagerMapping;
	Sc::ShapeInteraction** interactions = pairData->mShapeInteractions;
	PxReal* restDistances = pairData->mRestDistances;
	PxsTorsionalFrictionData* torsionalData = pairData->mTorsionalData;

	const PxU32 globalThreadIdx = threadIdx.x + WARP_SIZE * threadIdx.y + blockIdx.x * blockDim.x * blockDim.y;
	const PxU32 groupStride = (blockDim.x * blockDim.y * gridDim.x) / 16;
	const PxU32 laneInGroup = threadIdx.x & 15;

	for(PxU32 swapIdx = globalThreadIdx / 16; swapIdx < nbSwaps; swapIdx += groupStride)
	{
		PxU32 srcIndex, dstIndex;
		getSwapIndices(runSum, nbPairs, swapIdx, nbSwaps, dstIndex, srcIndex);

		// One lane per 16-thread group moves the scalar per-pair records.
		if(laneInGroup == 0)
		{
			inputs[dstIndex] = inputs[srcIndex];
			outputs[dstIndex] = outputs[srcIndex];
			cpuManagers[dstIndex] = cpuManagers[srcIndex];
			interactions[dstIndex] = interactions[srcIndex];
			restDistances[dstIndex] = restDistances[srcIndex];
			torsionalData[dstIndex] = torsionalData[srcIndex];
		}

		if(copyManifold)
		{
			// All 16 lanes cooperatively copy the manifold in float4 chunks.
			float4* dst = reinterpret_cast<float4*>(&manifolds[dstIndex]);
			const float4* src = reinterpret_cast<const float4*>(&manifolds[srcIndex]);
			const PxU32 nbFloat4 = sizeof(PxgPersistentContactManifold) / sizeof(float4);
			for(PxU32 element = laneInGroup; element < nbFloat4; element += 16)
				dst[element] = src[element];
		}
	}
}
/**
 * Stage 5 variant for convex-triangle pairs using multi-manifolds.
 * Compacts the contact-manager arrays by copying each surviving pair over a
 * removed slot. One warp handles each swap: lane 0 copies the scalar per-pair
 * records, the full warp cooperates on the (optional) multi-manifold copy.
 *
 * Fix: the inner manifold-copy loop re-declared `i`, shadowing the outer swap
 * counter; renamed to `idx` for clarity and consistency with Stage5.
 */
extern "C" __global__ void removeContactManagers_Stage5_CvxTri(const PxgPairManagementData* PX_RESTRICT pairData, const bool copyManifold)
{
	const PxU32* PX_RESTRICT globalRunSumBuffer = pairData->mTempAccumulator;
	const PxU32 totalSize = pairData->mNbPairs;
	const PxU32 nbToSwap = getNbSwapsRequired(globalRunSumBuffer, totalSize, pairData->mNbToRemove);

	PxgContactManagerInput* inputData = pairData->mContactManagerInputData;
	PxsContactManagerOutput* outputData = pairData->mContactManagerOutputData;
	PxgPersistentContactMultiManifold* manifolds = reinterpret_cast<PxgPersistentContactMultiManifold*>(pairData->mPersistentContactManagers);
	PxsContactManager** cms = pairData->mCpuContactManagerMapping;
	Sc::ShapeInteraction** sis = pairData->mShapeInteractions;
	PxReal* rest = pairData->mRestDistances;
	PxsTorsionalFrictionData* torsional = pairData->mTorsionalData;

	for (PxU32 i = (threadIdx.x + WARP_SIZE * threadIdx.y + blockIdx.x * blockDim.x * blockDim.y) / WARP_SIZE;
		i < nbToSwap; i += ((blockDim.x * blockDim.y * gridDim.x)) / WARP_SIZE)
	{
		PxU32 srcIndex, dstIndex;
		getSwapIndices(globalRunSumBuffer, totalSize, i, nbToSwap, dstIndex, srcIndex);

		// Lane 0 of the warp moves the scalar per-pair records.
		if (threadIdx.x == 0)
		{
			inputData[dstIndex] = inputData[srcIndex];
			outputData[dstIndex] = outputData[srcIndex];
			cms[dstIndex] = cms[srcIndex];
			sis[dstIndex] = sis[srcIndex];
			rest[dstIndex] = rest[srcIndex];
			torsional[dstIndex] = torsional[srcIndex];
		}

		if (copyManifold)
		{
			// Whole warp cooperatively copies the multi-manifold in float4 chunks.
			float4* dst = (float4*)&manifolds[dstIndex];
			float4* src = (float4*)&manifolds[srcIndex];
			const PxU32 nbFloat4 = sizeof(PxgPersistentContactMultiManifold) / sizeof(float4);
			for (PxU32 idx = threadIdx.x; idx < nbFloat4; idx += WARP_SIZE)
			{
				dst[idx] = src[idx];
			}
		}
	}
}
/**
 * Replicates one source element of 'dataSize' bytes into 'nbTimesToReplicate'
 * consecutive destination slots. The source element is first staged into
 * shared memory in transposed, component-major form (all x components, then
 * all y, z, w), then each copying thread reconstructs its float4 from the
 * staged data. Assumes dataSize <= MaxStructureSize, dataSize is a multiple
 * of sizeof(float4), and blockDim.x >= nbThreadsPerElement.
 */
extern "C" __global__ void initializeManifolds(float4* destination, const float4* source, PxU32 dataSize, PxU32 nbTimesToReplicate)
{
	const PxU32 MaxStructureSize = 4096;
	__shared__ float sourceData[MaxStructureSize / sizeof(float)];

	//Number of threads required per element (rounded up to a warp size)
	// NOTE(review): this is a plain truncating division, so "rounded up"
	// above looks inaccurate -- confirm against the launch configuration.
	const PxU32 nbThreadsPerElement = dataSize / sizeof(float4);
	//Number to process per block
	const PxU32 nbPerBlock = blockDim.x / nbThreadsPerElement;
	// Threads at or above MaxThreadIdx would straddle a partial element; they idle.
	const PxU32 MaxThreadIdx = nbPerBlock*nbThreadsPerElement;
	const PxU32 nbBlocksRequired = (nbTimesToReplicate + nbPerBlock - 1) / nbPerBlock;

	// Stage the source element into shared memory, transposed by component.
	if (threadIdx.x < nbThreadsPerElement)
	{
		const float4 value = source[threadIdx.x];
		sourceData[threadIdx.x] = value.x;
		sourceData[threadIdx.x + nbThreadsPerElement] = value.y;
		sourceData[threadIdx.x + 2*nbThreadsPerElement] = value.z;
		sourceData[threadIdx.x + 3*nbThreadsPerElement] = value.w;
	}

	// Offset of this thread's float4 within the element it copies.
	const PxU32 threadReadIdx = threadIdx.x%nbThreadsPerElement;
	__syncthreads();

	// Grid-stride over chunks of nbPerBlock replicas.
	for (PxU32 a = blockIdx.x; a < nbBlocksRequired; a += gridDim.x)
	{
		PxU32 startBlockIdx = nbPerBlock*a;
		float4* startPtr = destination + startBlockIdx*nbThreadsPerElement;
		// Index of the replica this thread writes into.
		PxU32 idx = startBlockIdx + threadIdx.x / nbThreadsPerElement;
		if (threadIdx.x < MaxThreadIdx && idx < nbTimesToReplicate)
		{
			// Re-assemble the float4 from the transposed shared-memory layout.
			startPtr[threadIdx.x] = make_float4(sourceData[threadReadIdx], sourceData[threadReadIdx + nbThreadsPerElement],
				sourceData[threadReadIdx + 2 * nbThreadsPerElement], sourceData[threadReadIdx + 3 * nbThreadsPerElement]);
		}
	}
}

View File

@@ -0,0 +1,225 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __PARTICLE_COLLISION_CUH__
#define __PARTICLE_COLLISION_CUH__
#include "foundation/PxVec3.h"
#include "foundation/PxTransform.h"
#include "sphereCollision.cuh"
#include "deformableCollision.cuh"
#include "GuConvexSupport.h"
/**
* Retuns distance and normal information for a particle-box contact.
* If maxDist < PX_MAX_F32, then no result is provided if the distance is larger than maxDist.
*/
/**
 * Computes the contact normal and signed distance between a point and a box.
 * The normal is returned in world space; dist is negative when the point is
 * embedded in the box. If maxDist < PX_MAX_F32, returns false (no result)
 * when the point is further from the box surface than maxDist.
 */
__device__ __forceinline__ static
bool contactPointBox(PxVec3& normal, PxReal& dist,
	const PxVec3& pointPos, const PxTransform& boxToWorld, const PxVec3& halfBoxExtents, const PxReal maxDist)
{
	const PxVec3 localPoint = boxToWorld.transformInv(pointPos);
	const PxVec3 clampedPoint(
		PxClamp(localPoint.x, -halfBoxExtents.x, halfBoxExtents.x),
		PxClamp(localPoint.y, -halfBoxExtents.y, halfBoxExtents.y),
		PxClamp(localPoint.z, -halfBoxExtents.z, halfBoxExtents.z));

	if (maxDist < PX_MAX_F32)
	{
		// Early out when the point is further from the box than maxDist.
		const PxReal separationSq = (localPoint - clampedPoint).magnitudeSquared();
		if (separationSq > maxDist * maxDist)
			return false;
	}

	const bool insideX = halfBoxExtents.x >= PxAbs(localPoint.x);
	const bool insideY = halfBoxExtents.y >= PxAbs(localPoint.y);
	const bool insideZ = halfBoxExtents.z >= PxAbs(localPoint.z);

	if (insideX && insideY && insideZ)
	{
		// Embedded point: push out along the axis with the smallest distance
		// to the box surface; dist is the (negative) penetration depth.
		const PxVec3 surfaceDist = halfBoxExtents - clampedPoint.abs();
		if (surfaceDist.x <= surfaceDist.y && surfaceDist.x <= surfaceDist.z)
		{
			normal = clampedPoint.x >= 0.f ? PxVec3(1.f, 0.f, 0.f) : PxVec3(-1.0f, 0.f, 0.f);
			dist = -surfaceDist.x;
		}
		else if (surfaceDist.y <= surfaceDist.z)
		{
			normal = clampedPoint.y >= 0.f ? PxVec3(0.f, 1.f, 0.f) : PxVec3(0.f, -1.f, 0.f);
			dist = -surfaceDist.y;
		}
		else
		{
			normal = clampedPoint.z >= 0.f ? PxVec3(0.f, 0.f, 1.f) : PxVec3(0.f, 0.f, -1.f);
			dist = -surfaceDist.z;
		}
	}
	else
	{
		// Point is outside: normal points from the box surface to the point.
		normal = localPoint - clampedPoint;
		dist = normal.normalizeFast();
	}

	normal = boxToWorld.rotate(normal);
	return true;
}
/**
* Retuns distance and normal information for a particle-sphere contact.
* If maxDist < PX_MAX_F32, then no result is provided if the distance is larger than maxDist.
*/
/**
 * Computes the contact normal and signed distance between a point and a
 * sphere (dist is negative when the point is inside the sphere).
 * If maxDist < PX_MAX_F32, returns false when the point is further from the
 * sphere surface than maxDist.
 */
__device__ __forceinline__ static
bool contactPointSphere(PxVec3& normal, PxReal& dist,
	const PxVec3& pointPos, const PxVec3& spherePos, const PxReal sphereRadius, const PxReal maxDist)
{
	const PxVec3 centerToPoint = pointPos - spherePos;
	const PxReal centerDistSq = centerToPoint.magnitudeSquared();

	if (maxDist < PX_MAX_F32)
	{
		const PxReal cutoff = maxDist + sphereRadius;
		if (centerDistSq > cutoff * cutoff)
			return false;
	}

	const PxReal centerDist = PxSqrt(centerDistSq);
	// Fall back to an arbitrary fixed axis when the point sits at the center.
	normal = (centerDist > PX_EPS_REAL) ? centerToPoint * (1.0f / centerDist) : PxVec3(1.0f, 0.0f, 0.0f);
	dist = centerDist - sphereRadius;
	return true;
}
/**
* Retuns distance and normal information for a point-capsule contact.
* If maxDist < PX_MAX_F32, then no result is provided if the distance is larger than maxDist.
*/
/**
 * Computes the contact normal and signed distance between a point and a
 * capsule (dist is negative when the point is inside the capsule).
 * If maxDist < PX_MAX_F32, returns false when the point is further from the
 * capsule surface than maxDist.
 */
__device__ __forceinline__ static
bool contactPointCapsule(PxVec3& normal, PxReal& dist,
	const PxVec3& pointPos, const PxVec3& capsulePos, const PxVec3& capsuleDir, const PxReal capsuleRadius, const PxReal capsuleHalfHeight, const PxReal maxDist)
{
	const PxVec3 halfAxis = capsuleDir * capsuleHalfHeight;
	const PxVec3 localPoint = pointPos - capsulePos;

	// Squared distance from the point to the capsule's core segment;
	// t parameterizes the closest point along the segment.
	PxReal t;
	const PxReal segDistSq = distancePointSegmentSquared(-halfAxis, halfAxis, localPoint, t);

	if (maxDist < PX_MAX_F32)
	{
		const PxReal cutoff = maxDist + capsuleRadius;
		if (segDistSq > cutoff * cutoff)
			return false;
	}

	// Closest point on the segment is -halfAxis + t*(2*halfAxis) = halfAxis*(2t - 1).
	const PxVec3 segToPoint = localPoint - halfAxis * (2.0f * t - 1.0f);
	const PxReal segDist = PxSqrt(segDistSq);
	// Fall back to an arbitrary fixed axis when the point lies on the segment.
	normal = (segDist > PX_EPS_REAL) ? (segToPoint * (1.0f / segDist)) : PxVec3(1.0f, 0.0f, 0.0f);
	dist = segDist - capsuleRadius;
	return true;
}
/**
* Combined speculative particle triangle cd. Returns normal and distance.
* If CCD is enabled two tests are performed. The first one decides if the triangle
* is overlapping with the contact volume (sphere), which is typically located at a
* predicted position of the particle, covering it's whole predicted path. The second
* test then establishes the correct distance and normal with respect to the actual particle
* position.
* If CCD is disabled, particlePos == cVolumePos, and cVolumeRadius is the standard contact
* distance.
*/
/**
 * Combined speculative particle-triangle collision test.
 * The contact volume (sphere at cVolumePos with radius cVolumeRadius) is
 * tested against the triangle; on overlap the distance and normal are
 * established relative to the actual particle position when CCD is enabled.
 * Assumes particlePos == cVolumePos and cVolumeRadius == contact distance
 * when enableCCD is false. Back-facing triangles are rejected.
 */
__device__ __forceinline__ static
bool particleTriangleTest(PxVec3& normal, PxReal& distance,
	const PxVec3& particlePos, const PxVec3& cVolumePos, const PxReal cVolumeRadius,
	const PxVec3& triV0, const PxVec3& triV1, const PxVec3& triV2, const bool enableCCD)
{
	const PxVec3 edge1 = triV1 - triV0;
	const PxVec3 edge2 = triV2 - triV0;
	const PxVec3 triNormal = edge1.cross(edge2).getNormalized();

	// Plane-side rejection of the contact volume.
	if ((cVolumePos - triV0).dot(triNormal) >= cVolumeRadius)
		return false;

	// Contact-volume overlap test against the triangle.
	PxVec3 closestPos;
	PxReal distSq = distancePointTriangleSquared(cVolumePos, triV0, triV1, triV2, closestPos);
	if (distSq > cVolumeRadius * cVolumeRadius)
		return false;

	// Back-face culling with respect to the current particle position.
	if ((particlePos - triV0).dot(triNormal) <= 0.0f)
		return false;

	if (enableCCD)
	{
		// With CCD, replace closest point and distance from the contact
		// volume with those from the current particle position.
		distSq = distancePointTriangleSquared(particlePos, triV0, triV1, triV2, closestPos);
	}

	// Assumes particlePos == cVolumePos when enableCCD == false.
	const PxVec3 toParticle = particlePos - closestPos;
	normal = (distSq > 0.0f) ? toParticle * PxRecipSqrt(distSq) : triNormal;
	distance = PxSqrt(distSq);
	return true;
}
/**
* Retuns distance and normal information for a particle-convexCore contact.
* If maxDist < PX_MAX_F32, then no result is provided if the distance is larger than maxDist.
*/
/**
 * Computes the contact normal and signed distance between a particle and a
 * convex-core shape via GJK, falling back to EPA when the shapes overlap.
 * If the separation exceeds maxDist, returns false (no result).
 */
__device__ __forceinline__ static
bool particleConvexCore(PxVec3& normal, PxReal& dist, const PxVec3& particlePos,
	const Gu::ConvexShape& convex, const PxReal maxDist)
{
	// Model the particle as a zero-margin point-type convex shape.
	Gu::ConvexShape point;
	point.coreType = Gu::ConvexCore::Type::ePOINT;
	point.pose = PxTransform(particlePos);
	point.margin = 0;

	PxVec3 witness0, witness1;
	const PxReal gjkCutoff = convex.margin + point.margin + maxDist;
	dist = Gu::RefGjkEpa::computeGjkDistance(point, convex, point.pose, convex.pose,
		gjkCutoff, witness0, witness1, normal);

	if (dist < FLT_EPSILON)
	{
		// Overlapping: use EPA to recover the penetration depth.
		dist = Gu::RefGjkEpa::computeEpaDepth(point, convex, point.pose, convex.pose,
			witness0, witness1, normal);
	}

	if (dist > maxDist)
		return false;
	return true;
}
#endif // __PARTICLE_COLLISION_CUH__

View File

@@ -0,0 +1,359 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "foundation/PxBounds3.h"
#include "foundation/PxSimpleTypes.h"
#include "foundation/PxTransform.h"
#include "foundation/PxVec3.h"
#include "geometry/PxHeightFieldSample.h"
#include "PxNodeIndex.h"
#include "PxgContactManager.h"
#include "PxgConvexConvexShape.h"
#include "PxgParticleSystem.h"
#include "PxgParticleSystemCoreKernelIndices.h"
#include "PxsTransformCache.h"
#include <vector_types.h>
#include "PxgCommonDefines.h"
#include "heightfieldUtil.cuh"
#include "dataReadWriteHelper.cuh"
#include "gridCal.cuh"
#include "particleCollision.cuh"
#include "assert.h"
using namespace physx;
extern "C" __host__ void initNarrowphaseKernels12() {}
PX_ALIGN_PREFIX(16)
// Shared-memory scratch for one particle-system vs. heightfield pair.
// Filled once by thread 0 of warp 0 in ps_heightfieldCollisonLaunch,
// then read by all threads of the block.
struct PsHeightfieldScratch
{
	PxTransform heightfieldTransform;					// heightfield shape -> world transform
	PxgShape heightfieldShape;							// carries scale and the sample data pointer (hullOrMeshPtr)
	PxgParticleContactInfo* PX_RESTRICT oneWayContactInfos;		// per-particle static ("one-way") contact slots
	PxNodeIndex* PX_RESTRICT oneWayContactNodeIndices;	// rigid node index per static contact
	PxU32* PX_RESTRICT oneWayContactCounts;				// per-particle static contact counters
	float4* currentPositions;							// sorted current positions (diffuse: origin pos + lifetime)
	float4* predictedPositions;							// predicted positions, used to build the CCD contact volume
	PxNodeIndex rigidId; //heightfield
	PxU32 transformCacheRef; //heightfield
	PxU32 numParticles;									// particle count for the selected (regular/diffuse) set
	PxU32 particleSystemId;
	PxReal contactDist;									// combined contact distance of the pair
	PxReal restDist;									// rest distance of the pair
	bool enableCCD;										// speculative-CCD flag from the particle system
}PX_ALIGN_SUFFIX(16);
/**
 * Collides one particle (regular or diffuse) against a heightfield.
 *
 * Builds the particle's contact volume (a sphere covering the predicted
 * motion when speculative CCD is enabled, otherwise just the contact distance
 * around the current position), clips it against the heightfield sample grid,
 * runs a particle-vs-triangle test on every overlapped non-hole triangle, and
 * collects up to MaxStaticContactsPerMesh candidates in
 * sPerParticleTempContacts (xyz = world-space normal, w = distance - restDist).
 * Contacts against a static heightfield go into the per-particle one-way
 * buffers; the remainder (non-diffuse only) is appended through 'writer'.
 */
__device__ static inline void psHeightfieldMidphaseCollision(
	PsHeightfieldScratch* sScratch,
	float4* PX_RESTRICT sPerParticleTempContacts,	// per-thread scratch, MaxStaticContactsPerMesh entries
	const PxU32 numParticles,
	const PxU32 particleIndex,
	const bool isDiffuse,
	const bool isTGS,
	PxgParticleContactWriter& writer
)
{
	const PxReal contactDist = sScratch->contactDist;
	const PxReal restDist = sScratch->restDist;
	PxVec3 currentPos = PxLoad3(sScratch->currentPositions[particleIndex]);

	PxVec3 cVolumePos;
	PxReal cVolumeRadius;
	if (sScratch->enableCCD)
	{
		// Speculative CCD: the contact volume covers the motion from the
		// current to the predicted position.
		PxVec3 predictedPos = PxLoad3(sScratch->predictedPositions[particleIndex]);
		cVolumeRadius = getParticleSpeculativeContactVolume(cVolumePos,
			currentPos, predictedPos, contactDist, isDiffuse, isTGS);
	}
	else
	{
		cVolumePos = currentPos;
		cVolumeRadius = contactDist;
	}

	// World-space AABB of the contact volume.
	const PxVec3 cVolumeRadius3(cVolumeRadius);
	const PxVec3 min = cVolumePos - cVolumeRadius3;
	const PxVec3 max = cVolumePos + cVolumeRadius3;
	PxBounds3 worldBound(min, max);

	PxTransform heightfieldTransform = sScratch->heightfieldTransform;
	PxgShape heightfieldShape = sScratch->heightfieldShape;

	const PxReal oneOverHeightScale = 1.f / heightfieldShape.scale.scale.y;
	const PxReal oneOverRowScale = 1.f / PxAbs(heightfieldShape.scale.scale.x);
	const PxReal oneOverlColScale = 1.f / PxAbs(heightfieldShape.scale.scale.z);

	//bound is in world space, we need to transform the bound to the local space of height field
	PxBounds3 localBound = PxBounds3::transformFast(heightfieldTransform.getInverse(), worldBound);

	// Undo the heightfield scale so the bound is in sample-grid units.
	localBound.minimum.x *= oneOverRowScale;
	localBound.minimum.y *= oneOverHeightScale;
	localBound.minimum.z *= oneOverlColScale;
	localBound.maximum.x *= oneOverRowScale;
	localBound.maximum.y *= oneOverHeightScale;
	localBound.maximum.z *= oneOverlColScale;

	//row scale
	if (heightfieldShape.scale.scale.x < 0.f)
	{
		//swap min and max row scale
		const PxReal temp = localBound.minimum.x;
		localBound.minimum.x = localBound.maximum.x;
		localBound.maximum.x = temp;
	}

	//col scale
	if (heightfieldShape.scale.scale.z < 0.f)
	{
		PxReal swap = localBound.minimum.z;
		localBound.minimum.z = localBound.maximum.z;
		localBound.maximum.z = swap;
	}

	// Heightfield data layout: [nbRows, nbCols, samples...].
	PxU32* heightfieldData = reinterpret_cast<PxU32*>(heightfieldShape.hullOrMeshPtr);
	const PxU32 nbRows = heightfieldData[0];
	const PxU32 nbCols = heightfieldData[1];
	PxHeightFieldSample* samples = reinterpret_cast<PxHeightFieldSample*>(&heightfieldData[2]);

	// Early out if the bound misses the sample grid entirely.
	if ((localBound.minimum.x > nbRows - 1) || (localBound.minimum.z > nbCols - 1)
		|| (localBound.maximum.x < 0) || (localBound.maximum.z < 0))
	{
		return;
	}

	// Range of heightfield cells overlapped by the bound.
	PxU32 minRow = getMinRow(localBound.minimum.x, nbRows);
	PxU32 maxRow = getMaxRow(localBound.maximum.x, nbRows);
	PxU32 minColumn = getMinColumn(localBound.minimum.z, nbCols);
	PxU32 maxColumn = getMaxColumn(localBound.maximum.z, nbCols);

	if ((2 * (maxColumn - minColumn) * (maxRow - minRow)) == 0)
	{
		return;
	}

	const PxVec3 cVolumePosShapeSpace = heightfieldTransform.transformInv(cVolumePos);
	const PxVec3 particlePosShapeSpace = heightfieldTransform.transformInv(currentPos);

	const PxReal miny = localBound.minimum.y;
	const PxReal maxy = localBound.maximum.y;
	const PxU32 columnSpan = maxColumn - minColumn;

	//How many static contacts this particle already has
	PxU32 contactCounts = 0;

	//we have two materials corresponding to one vertexIndex, so each thread will deal with one of the materials
	const PxU32 totalNumProcessed = (maxRow - minRow) * columnSpan * 2;
	for (PxU32 i = 0; i < totalNumProcessed; ++i)
	{
		PxU32 triangleIdx = 0xFFffFFff;
		// Two triangles (materials) per cell: i/2 selects the cell, i&1 the triangle.
		const PxU32 index = i / 2;
		const PxU32 vertexIndex = (minRow + index / columnSpan) * nbCols + (minColumn + index % columnSpan);
		assert(isValidVertex(vertexIndex, nbRows, nbCols));

		// Reject cells entirely above or below the bound's height range.
		PxReal h0 = getHeight(vertexIndex, samples);
		PxReal h1 = getHeight(vertexIndex + 1, samples);
		PxReal h2 = getHeight(vertexIndex + nbCols, samples);
		PxReal h3 = getHeight(vertexIndex + nbCols + 1, samples);
		const bool con0 = maxy < h0 && maxy < h1 && maxy < h2 && maxy < h3;
		const bool con1 = miny > h0 && miny > h1 && miny > h2 && miny > h3;

		if (!(con0 || con1))
		{
			const PxHeightFieldSample& sample = getSample(vertexIndex, samples);
			const bool isMaterial1 = (i & 1) ? 1 : 0;
			PxU32 material = isMaterial1 ? sample.materialIndex1 : sample.materialIndex0;
			if (material != PxHeightFieldMaterial::eHOLE)
			{
				triangleIdx = isMaterial1 ? ((vertexIndex << 1) + 1) : (vertexIndex << 1);

				PxVec3 triV0, triV1, triV2;
				getTriangle(triV0, triV1, triV2, NULL, triangleIdx, heightfieldShape.scale, nbRows, nbCols, samples);

				PxVec3 normalShapeSpace;
				PxReal distance;
				bool intersect = particleTriangleTest(normalShapeSpace, distance,
					particlePosShapeSpace, cVolumePosShapeSpace, cVolumeRadius, triV0, triV1, triV2, sScratch->enableCCD);

				//this code still misses a logic from the triangle mesh near phase, which also selects the best contact
				//if there are more than MaxStaticContactsPerMesh
				if (intersect && contactCounts < PxgParticleContactInfo::MaxStaticContactsPerMesh)
				{
					PxVec3 normal = heightfieldTransform.rotate(normalShapeSpace);
					sPerParticleTempContacts[contactCounts++] = make_float4(normal.x, normal.y, normal.z, distance - restDist);
				}
			}
		}
	}//end of totalNumProcessed

	contactCounts = PxMin(contactCounts, PxgParticleContactInfo::MaxStaticContactsPerMesh);

	PxU32 contactWritten = 0;
	if (sScratch->rigidId.isStaticBody())
	{
		// Static heightfield: write into the per-particle one-way contact slots.
		if (contactCounts)
		{
			PxU32 contactIndex = (PxU32)(atomicAdd((PxI32*)&sScratch->oneWayContactCounts[particleIndex], (PxI32)contactCounts));
			PxU32 endIndex = PxMin(contactIndex + contactCounts, PxgParticleContactInfo::MaxStaticContactsPerParticle);

			// Contacts are strided by numParticles (structure-of-arrays layout).
			for (PxU32 i = contactIndex, src = 0; i < endIndex; i++, src++)
			{
				const PxU32 outputIndex = particleIndex + i * numParticles;
				sScratch->oneWayContactInfos[outputIndex].mNormal_PenW = sPerParticleTempContacts[src];
				sScratch->oneWayContactNodeIndices[outputIndex] = sScratch->rigidId;
			}
		}

		// NOTE(review): this overwrites the counter that was just atomically
		// incremented and effectively assumes it started at zero for this
		// particle; presumably only one writer touches it here -- confirm.
		contactWritten = PxMin(contactCounts, PxgParticleContactInfo::MaxStaticContactsPerParticle);
		sScratch->oneWayContactCounts[particleIndex] = contactWritten;
	}

	// Contacts not absorbed by the one-way path go to the general particle-rigid
	// contact stream (non-diffuse particles only).
	PxU32 remaining = contactCounts - contactWritten;
	if ((remaining > 0) && !isDiffuse)
	{
		PxU64 compressedParticleIndex = PxEncodeParticleIndex(sScratch->particleSystemId, particleIndex);
		PxU32 contactStartIndex = atomicAdd(writer.numTotalContacts, remaining);
		PxU32 contactEndIndex = contactStartIndex + remaining;

		for (PxU32 contactIndex = contactStartIndex, src = 0; contactIndex < contactEndIndex; contactIndex++, src++)
		{
			float4 contact = sPerParticleTempContacts[src];
			writer.writeContact(contactIndex, PxVec4(contact.x, contact.y, contact.z, contact.w), compressedParticleIndex, sScratch->rigidId);
		}
	}
}
/**
 * Kernel: particle system vs. heightfield collision.
 * blockIdx.y selects the contact-manager pair; blockIdx.z selects regular (0)
 * vs. diffuse (1) particles. Warp 0 loads the pair's shapes and transform and
 * its lane 0 fills the shared scratch block; all threads then grid-stride
 * over the particles and run the per-particle midphase/narrowphase.
 */
extern "C" __global__ void ps_heightfieldCollisonLaunch(
	const bool isTGS,
	const PxReal toleranceLength,
	const PxgContactManagerInput* PX_RESTRICT cmInputs,
	const PxsCachedTransform* PX_RESTRICT transformCache,
	const PxReal* PX_RESTRICT contactDistance,
	const PxReal* PX_RESTRICT restDistances,
	const PxgShape* PX_RESTRICT gpuShapes,
	PxgParticleSystem* PX_RESTRICT particleSystems,
	PxNodeIndex* PX_RESTRICT shapeToRigidRemapTable,
	PxgParticleContactWriter writer
)
{
	// Raw shared storage reinterpreted as the per-pair scratch structure.
	__shared__ __align__(16) PxU8 sData[sizeof(PsHeightfieldScratch)];
	PsHeightfieldScratch& sHeightfieldScratch = reinterpret_cast<PsHeightfieldScratch&>(sData);

	// Per-thread temporary contact storage in shared memory.
	__shared__ float4 sTempContacts[PxgParticleSystemKernelBlockDim::PS_HEIGHTFIELD_COLLISION][PxgParticleContactInfo::MaxStaticContactsPerMesh];

	unsigned int cmIdx = blockIdx.y;
	const PxU32 warpIndex = threadIdx.x / WARP_SIZE;
	const bool isDiffuseParticlesThread = blockIdx.z == 1;

	if (warpIndex == 0)
	{
		const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE - 1);

		// Warp-cooperative load of the pair's shapes and transform.
		PxgShape particleShape, heightfieldShape;
		PxU32 particleCacheRef, heightfieldCacheRef;
		LoadShapePairWarp<PxGeometryType::ePARTICLESYSTEM, PxGeometryType::eHEIGHTFIELD>(cmInputs, cmIdx, gpuShapes,
			particleShape, particleCacheRef, heightfieldShape, heightfieldCacheRef);

		PxsCachedTransform heightfieldTransformCache;
		PxsCachedTransform_ReadWarp(heightfieldTransformCache, transformCache + heightfieldCacheRef);

		const PxReal contactDist = contactDistance[particleCacheRef] + contactDistance[heightfieldCacheRef];
		const PxNodeIndex rigidId = shapeToRigidRemapTable[heightfieldCacheRef];
		const PxU32 particleSystemId = particleShape.particleOrSoftbodyId;
		PxgParticleSystem& particleSystem = particleSystems[particleSystemId];

		// A single lane publishes the scratch data for the whole block.
		if (threadIndexInWarp == 0)
		{
			sHeightfieldScratch.heightfieldTransform = heightfieldTransformCache.transform;
			sHeightfieldScratch.heightfieldShape = heightfieldShape;
			sHeightfieldScratch.rigidId = rigidId;
			sHeightfieldScratch.transformCacheRef = heightfieldCacheRef;
			sHeightfieldScratch.contactDist = contactDist;
			sHeightfieldScratch.restDist = restDistances[cmIdx];
			sHeightfieldScratch.enableCCD = (particleSystem.mData.mFlags & PxParticleFlag::eENABLE_SPECULATIVE_CCD) > 0;

			if (isDiffuseParticlesThread)
			{
				// If there are no diffuse particles, deactivate collision handling
				// from this kernel by publishing a particle count of zero.
				const bool isDiffuseParticlesActive = (particleSystem.mCommonData.mMaxDiffuseParticles > 0);
				sHeightfieldScratch.currentPositions = particleSystem.mDiffuseSortedOriginPos_LifeTime;
				sHeightfieldScratch.predictedPositions = particleSystem.mDiffuseSortedPos_LifeTime;
				sHeightfieldScratch.numParticles = (isDiffuseParticlesActive) ? *particleSystem.mNumDiffuseParticles : 0;
				sHeightfieldScratch.oneWayContactInfos = particleSystem.mDiffuseOneWayContactInfos;
				sHeightfieldScratch.oneWayContactNodeIndices = particleSystem.mDiffuseOneWayNodeIndex;
				sHeightfieldScratch.oneWayContactCounts = particleSystem.mDiffuseOneWayContactCount;
				sHeightfieldScratch.particleSystemId = particleSystemId;
			}
			else
			{
				sHeightfieldScratch.currentPositions = particleSystem.mSortedOriginPos_InvMass;
				sHeightfieldScratch.predictedPositions = particleSystem.mSortedPositions_InvMass;
				sHeightfieldScratch.numParticles = particleSystem.mCommonData.mNumParticles;
				sHeightfieldScratch.oneWayContactInfos = particleSystem.mOneWayContactInfos;
				sHeightfieldScratch.oneWayContactNodeIndices = particleSystem.mOneWayNodeIndex;
				sHeightfieldScratch.oneWayContactCounts = particleSystem.mOneWayContactCount;
				sHeightfieldScratch.particleSystemId = particleSystemId;
			}
		}
	}
	__syncthreads();

	const PxU32 numParticles = sHeightfieldScratch.numParticles;
	if (isDiffuseParticlesThread && numParticles == 0)
		return;

	// Grid-stride loop over all particles of the selected set.
	for (PxU32 globalThreadIndex = blockIdx.x * blockDim.x + threadIdx.x; globalThreadIndex < numParticles; globalThreadIndex += gridDim.x * blockDim.x)
	{
		psHeightfieldMidphaseCollision(
			&sHeightfieldScratch,
			sTempContacts[threadIdx.x],
			numParticles,
			globalThreadIndex,
			isDiffuseParticlesThread,
			isTGS,
			writer
		);
	}
}

View File

@@ -0,0 +1,805 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include <stdio.h>
#include <stdint.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "GuIntersectionTriangleBoxRef.h"
#include "foundation/PxVec3.h"
#include "foundation/PxQuat.h"
#include "foundation/PxTransform.h"
#include "geometry/PxMeshScale.h"
#include "geometry/PxGeometry.h"
#include "cudaNpCommon.h"
#include "PxgPersistentContactManifold.h"
#include "PxgContactManager.h"
#include "PxgConvexConvexShape.h"
#include "PxsContactManagerState.h"
#include "PxsTransformCache.h"
#include "PxgParticleSystem.h"
#include "PxgParticleSystemCore.h"
#include "PxgParticleSystemCoreKernelIndices.h"
#include "PxgNpKernelIndices.h"
#include "PxsMaterialCore.h"
#include "PxgCommonDefines.h"
#include "utils.cuh"
#include "deformableElementFilter.cuh"
#include "PxNodeIndex.h"
#include "sdfCollision.cuh"
#include "gridCal.cuh"
using namespace physx;
extern "C" __host__ void initNarrowphaseKernels11() {}
#include "warpHelpers.cuh"
#include "manifold.cuh"
#include "vector_functions.h"
#include "reduction.cuh"
#include "PxgSimulationCoreDesc.h"
#include "bv32Traversal.cuh"
#include "triangleMesh.cuh"
#include "particleCollision.cuh"
PX_ALIGN_PREFIX(16)
// Scratch data for particle vs. triangle-mesh midphase (BV32 traversal).
struct PsMidphaseScratch
{
	const float4* PX_RESTRICT trimeshVerts;			// mesh vertex positions
	const uint4* PX_RESTRICT trimeshTriIndices;		// triangle vertex indices
	PxTransform meshToWorld;						// mesh shape -> world transform
	PxMeshScale trimeshScale;						// vertex-space <-> shape-space scale
	//bv32 tree
	const Gu::BV32DataPacked* bv32PackedNodes;
	//stack for traversal
	int sBv32Nodes[192]; //6 depth of the bv32 tree
	// Presumably the active-lane mask pushed alongside each stack node -- TODO confirm.
	PxU32 sBv32ActiveParticles[192];
	// Per-lane (32 lanes) contact candidates; presumably shape-space normals
	// with matching depths below -- TODO confirm against the traversal code.
	PxVec3 shLocalContacts[32][PxgParticleContactInfo::MaxStaticContactsPerMesh];
	PxReal shLocalContactDepths[32][PxgParticleContactInfo::MaxStaticContactsPerMesh];
	float4* currentPositions;						// sorted current particle positions
	float4* predictedPositions;						// predicted positions, used for the CCD contact volume
	bool enableCCD;									// speculative-CCD flag
	PxU8 pad[7];									// explicit padding
}PX_ALIGN_SUFFIX(16);
// Warp-cooperative visitor for the BV32 tree traversal: collides up to 32
// particles (one per warp lane) against the triangles of one mesh. For every
// node/triangle test the whole warp loops over the lanes whose bit is set in
// currentNodeActiveParticles and broadcasts that lane's particle data with
// warp shuffles, so all 32 threads test the same particle together. Contacts
// are staged per lane in shared memory and flushed by finalizeFullWarp().
struct PsTreeContactGenTraverser
{
	PsMidphaseScratch* s_warpScratch;  // shared-memory scratch of the owning warp
	const PxU32 globalParticleIndex;   // particle owned by this lane
	const PxU32 numParticles;          // lanes with globalParticleIndex >= numParticles are idle
	PxVec3 cVolumePosVertexSpace;      // contact-volume center in mesh vertex space
	PxReal cVolumeRadius;              // contact-volume radius (shape space)
	PxVec3 cVolumePosShapeSpace;       // contact-volume center in mesh shape space
	PxVec3 particlePosShapeSpace;      // current particle position in mesh shape space
	PxU32 currentNodeActiveParticles;  // warp mask of lanes taking part in the current query
	PxU32 contactCount;                // contacts staged in shared memory for this lane

	// Builds the per-lane speculative contact volume. With CCD enabled the
	// volume spans the motion from the current to the predicted position;
	// otherwise it is a sphere of radius contactDist around the particle.
	PX_FORCE_INLINE __device__ PsTreeContactGenTraverser(
		PsMidphaseScratch* s_warpScratch,
		const PxU32 numParticles,
		const PxU32 globalParticleIndex,
		const bool isDiffuse,
		const bool isTGS,
		const PxReal contactDist
	)
		: s_warpScratch(s_warpScratch)
		, numParticles(numParticles)
		, globalParticleIndex(globalParticleIndex)
		, contactCount(0)
	{
		if (globalParticleIndex < numParticles)
		{
			const PxMeshScale trimeshScale = s_warpScratch->trimeshScale;
			const PxTransform meshToWorld = s_warpScratch->meshToWorld;
			const PxVec3 currentPos = PxLoad3(s_warpScratch->currentPositions[globalParticleIndex]);
			particlePosShapeSpace = meshToWorld.transformInv(currentPos);
			if (s_warpScratch->enableCCD)
			{
				// CCD: the volume covers the segment current -> predicted position.
				const PxVec3 predictedPos = PxLoad3(s_warpScratch->predictedPositions[globalParticleIndex]);
				PxVec3 cVolumePos;
				cVolumeRadius = getParticleSpeculativeContactVolume(cVolumePos, currentPos, predictedPos, contactDist, isDiffuse, isTGS);
				cVolumePosShapeSpace = meshToWorld.transformInv(cVolumePos);
			}
			else
			{
				// No CCD: sphere of radius contactDist around the current position.
				cVolumePosShapeSpace = particlePosShapeSpace;
				cVolumeRadius = contactDist;
			}
			cVolumePosVertexSpace = shape2Vertex(cVolumePosShapeSpace, trimeshScale.scale, trimeshScale.rotation);
		}
		//initialize all valid particles as active particles
		currentNodeActiveParticles = __ballot_sync(FULL_MASK, globalParticleIndex < numParticles);
	}

	// Sphere-vs-AABB rejection test for a BV32 node, run by the full warp.
	// cMin/cMax are the node bounds in mesh vertex space; returns true when at
	// least one active particle's contact volume overlaps the box (the node
	// must then be descended).
	// NOTE(review): newActiveParticles is only used as a boolean; the per-node
	// active mask is not narrowed here — confirm this matches the
	// bv32TreeTraversal contract.
	PX_FORCE_INLINE __device__ bool intersectBoxFullWarp(bool hasBox, const PxVec3& cMin, const PxVec3& cMax) const
	{
		const PxMeshScale trimeshScale = s_warpScratch->trimeshScale;
		PxVec3 sCVolumePosVertexSpace;
		PxReal sCVolumeRadius;
		PxU32 newActiveParticles = 0;
		// All 32 lanes iterate the set bits together so the shuffles stay warp-uniform.
		for (PxU32 i = currentNodeActiveParticles; i; i = clearLowestSetBit(i))
		{
			const PxU32 pIndexInWarp = (i == 0) ? 0 : lowestSetIndex(i); // ternary is defensive; the loop condition already guarantees i != 0
			// Broadcast lane pIndexInWarp's contact volume to the whole warp.
			sCVolumePosVertexSpace.x = __shfl_sync(FULL_MASK, cVolumePosVertexSpace.x, pIndexInWarp);
			sCVolumePosVertexSpace.y = __shfl_sync(FULL_MASK, cVolumePosVertexSpace.y, pIndexInWarp);
			sCVolumePosVertexSpace.z = __shfl_sync(FULL_MASK, cVolumePosVertexSpace.z, pIndexInWarp);
			sCVolumeRadius = __shfl_sync(FULL_MASK, cVolumeRadius, pIndexInWarp);
			if (hasBox)
			{
				// Closest point on the box to the volume center, then measure the
				// offset in shape space so the radius test accounts for the mesh scale.
				const PxVec3 closestVertexSpace = cMin.maximum(cMax.minimum(sCVolumePosVertexSpace));
				const PxVec3 distVecVertexSpace = closestVertexSpace - sCVolumePosVertexSpace;
				const PxVec3 distVec = vertex2Shape(distVecVertexSpace, trimeshScale.scale, trimeshScale.rotation);
				const PxReal distSq = distVec.magnitudeSquared();
				if (distSq < sCVolumeRadius*sCVolumeRadius)
				{
					newActiveParticles |= 1 << pIndexInWarp;
				}
			}
		}
		return newActiveParticles != 0;
	}

	// Tests one leaf triangle (primitiveIndex, or 0xFFFFFFFF for "no triangle
	// on this lane") against every active particle and stages the resulting
	// contacts in the per-lane shared-memory buffers. When a particle's buffer
	// is full, the deepest incoming contact replaces the shallowest stored one.
	PX_FORCE_INLINE __device__ void intersectPrimitiveFullWarp(PxU32 primitiveIndex, PxU32 idxInWarp)
	{
		const PxMeshScale trimeshScale = s_warpScratch->trimeshScale;
		const PxTransform meshToWorld = s_warpScratch->meshToWorld;
		PxVec3 sParticlePosShapeSpace;
		PxVec3 sCVolumePosShapeSpace;
		PxReal sCVolumeRadius;
		for (PxU32 i = currentNodeActiveParticles; i; i = clearLowestSetBit(i))
		{
			const PxU32 pIndexInWarp = (i == 0) ? 0 : lowestSetIndex(i); // defensive; loop condition guarantees i != 0
			// Broadcast particle pIndexInWarp's data; each lane then tests its own triangle.
			sParticlePosShapeSpace.x = __shfl_sync(FULL_MASK, particlePosShapeSpace.x, pIndexInWarp);
			sParticlePosShapeSpace.y = __shfl_sync(FULL_MASK, particlePosShapeSpace.y, pIndexInWarp);
			sParticlePosShapeSpace.z = __shfl_sync(FULL_MASK, particlePosShapeSpace.z, pIndexInWarp);
			sCVolumePosShapeSpace.x = __shfl_sync(FULL_MASK, cVolumePosShapeSpace.x, pIndexInWarp);
			sCVolumePosShapeSpace.y = __shfl_sync(FULL_MASK, cVolumePosShapeSpace.y, pIndexInWarp);
			sCVolumePosShapeSpace.z = __shfl_sync(FULL_MASK, cVolumePosShapeSpace.z, pIndexInWarp);
			sCVolumeRadius = __shfl_sync(FULL_MASK, cVolumeRadius, pIndexInWarp);
			bool intersect = false;
			PxVec3 normal(0.f);
			PxReal distance = PX_MAX_F32;
			PxU32 triangleIdx = 0xffffffff;
			if (primitiveIndex != 0xFFFFFFFF)
			{
				triangleIdx = primitiveIndex;
				uint4 triIdx = s_warpScratch->trimeshTriIndices[triangleIdx];
				// Bring the triangle into shape space before the particle test.
				PxVec3 triV0, triV1, triV2;
				triV0 = vertex2Shape(PxLoad3(s_warpScratch->trimeshVerts[triIdx.x]), trimeshScale.scale, trimeshScale.rotation);
				triV1 = vertex2Shape(PxLoad3(s_warpScratch->trimeshVerts[triIdx.y]), trimeshScale.scale, trimeshScale.rotation);
				triV2 = vertex2Shape(PxLoad3(s_warpScratch->trimeshVerts[triIdx.z]), trimeshScale.scale, trimeshScale.rotation);
				// A negative-determinant scale flips the winding; swap to keep the normal consistent.
				MeshScaling meshScale(s_warpScratch->trimeshScale.scale, s_warpScratch->trimeshScale.rotation);
				if (meshScale.flipNormal)
					PxSwap(triV1, triV2);
				PxVec3 normalShapeSpace;
				intersect = particleTriangleTest(normalShapeSpace, distance,
					sParticlePosShapeSpace, sCVolumePosShapeSpace, sCVolumeRadius, triV0, triV1, triV2, s_warpScratch->enableCCD);
				if (intersect)
				{
					normal = meshToWorld.rotate(normalShapeSpace);
				}
			}
			//push the triangle into stack ptr
			PxU32 resultWarp = __ballot_sync(FULL_MASK, intersect);
			PxU32 offset = warpScanExclusive(resultWarp, idxInWarp);
			PxU32 validCount = __popc(resultWarp);
			//However many contacts the particle already had...
			PxU32 contactStartIndex = __shfl_sync(FULL_MASK, contactCount, pIndexInWarp);
			// Only the owning lane advances its particle's contact count.
			if (idxInWarp == pIndexInWarp)
				contactCount += validCount;
			bool failedWrite = false;
			if (intersect)
			{
				const PxU32 index = contactStartIndex + offset;
				if (index < PxgParticleContactInfo::MaxStaticContactsPerMesh)
				{
					//If we have space for this contact (otherwise, we drop it!)
					s_warpScratch->shLocalContacts[pIndexInWarp][index] = normal;
					s_warpScratch->shLocalContactDepths[pIndexInWarp][index] = distance;
				}
				else
					failedWrite = true;
			}
			// Overflow handling: for each contact that did not fit, replace the
			// stored contact with the largest separation if the new one is deeper.
			PxU32 failMask = __ballot_sync(FULL_MASK, failedWrite);
			if (failMask)
			{
				// Lane j holds stored depth j of this particle's buffer (if in range).
				PxReal outSep = -PX_MAX_F32;
				if (idxInWarp < PxgParticleContactInfo::MaxStaticContactsPerMesh)
					outSep = s_warpScratch->shLocalContactDepths[pIndexInWarp][idxInWarp];
				for (PxU32 i = failMask; i != 0; i = clearLowestSetBit(i))
				{
					PxU32 index = lowestSetIndex(i);
					PxReal depth = __shfl_sync(FULL_MASK, distance, index);
					PxU32 candidateSwapMask = __ballot_sync(FULL_MASK, depth < outSep);
					if (candidateSwapMask)
					{
						// Evict the slot with the maximum (shallowest) separation.
						PxReal bestVal;
						PxU32 bestIndex = maxIndex(outSep, FULL_MASK, bestVal);
						if (idxInWarp == index)
						{
							s_warpScratch->shLocalContacts[pIndexInWarp][bestIndex] = normal;
							s_warpScratch->shLocalContactDepths[pIndexInWarp][bestIndex] = distance;
						}
						if (idxInWarp == bestIndex)
							outSep = depth;
					}
				}
				// Buffer is saturated from now on.
				if (idxInWarp == pIndexInWarp)
					contactCount = PxgParticleContactInfo::MaxStaticContactsPerMesh;
			}
		}
	}

	// Flushes the staged contacts. Contacts against a static body go to the
	// per-particle one-way buffers (capacity MaxStaticContactsPerParticle);
	// whatever does not fit there — and all contacts against non-static
	// bodies — is appended through the global contact writer.
	// NOTE(review): the atomicAdd below can leave oneWayContactCounts above
	// MaxStaticContactsPerParticle; presumably consumers clamp — verify.
	PX_FORCE_INLINE __device__ void finalizeFullWarp(
		PxU32 idxInWarp,
		const PxU32 particleSystemId,
		const PxNodeIndex rigidId,
		PxgParticleContactInfo* PX_RESTRICT oneWayContactInfos, //output
		PxNodeIndex* PX_RESTRICT oneWayContactNodeIndices, //output
		PxU32* PX_RESTRICT oneWayContactCounts, //output
		PxgParticleContactWriter& writer,
		const bool isDiffuseParticlesThread,
		const PxReal restDist
	)
	{
		//__syncwarp();
		PxU32 id = 0; // number of contacts consumed by the one-way (static) path
		if (globalParticleIndex < numParticles)
		{
			if (rigidId.isStaticBody())
			{
				if (contactCount != 0)
				{
					PxU32 startIndex = PxU32(atomicAdd((PxI32*)&oneWayContactCounts[globalParticleIndex], PxI32(contactCount)));
					PxU32 endIndex = PxMin(PxgParticleContactInfo::MaxStaticContactsPerParticle, startIndex + contactCount);
					// One-way buffers are laid out SoA-style: slot i of particle p is at p + i*numParticles.
					for (PxU32 i = startIndex; i < endIndex; i++, id++)
					{
						const PxVec3& normal = s_warpScratch->shLocalContacts[threadIdx.x][id];
						PxReal penetration = s_warpScratch->shLocalContactDepths[threadIdx.x][id] - restDist;
						const PxU32 outputIndex = globalParticleIndex + i * numParticles;
						oneWayContactInfos[outputIndex].mNormal_PenW = make_float4(normal.x, normal.y, normal.z, penetration);
						oneWayContactNodeIndices[outputIndex] = rigidId;
					}
				}
			}
		}
		// Contacts not stored above are written to the global contact stream.
		PxU32 remaining = contactCount - id;
		if (__any_sync(FULL_MASK, remaining))
		{
			// Warp prefix sum of 'remaining'; lane 31 holds the warp total and
			// reserves the output range with a single atomic.
			PxU32 count = warpScan<AddOpPxU32, PxU32>(FULL_MASK, remaining);
			PxU32 startIndex = 0xFFFFFFFF;
			if (idxInWarp == 31)
			{
				startIndex = atomicAdd(writer.numTotalContacts, count);
			}
			startIndex = __shfl_sync(FULL_MASK, startIndex, 31);
			if (globalParticleIndex < numParticles)
			{
				// count - remaining is this lane's exclusive prefix.
				startIndex = startIndex + count - remaining;
				if (remaining && !isDiffuseParticlesThread)
				{
					PxU64 compressedParticleIndex = PxEncodeParticleIndex(particleSystemId, globalParticleIndex);
					for (PxU32 i = id, index = startIndex; i < contactCount; ++i, ++index)
					{
						const PxVec3& normal = s_warpScratch->shLocalContacts[idxInWarp][i];
						PxReal penetration = s_warpScratch->shLocalContactDepths[idxInWarp][i] - restDist;
						writer.writeContact(index, PxVec4(normal, penetration), compressedParticleIndex, rigidId);
					}
				}
			}
		}
	}
};
// SDF-based midphase: collides the particles of one particle system against
// one triangle mesh using the mesh's (sparse) SDF. One block handles one
// contact-manager pair; blockIdx.y == 1 runs the same pair for the system's
// diffuse particles. Contacts against static bodies go to the per-particle
// one-way buffers; the rest are appended through the global contact writer.
__device__ static inline void psSdfMeshMidphaseCollision(
	const PxReal toleranceLength,
	const PxgContactManagerInput* PX_RESTRICT cmInputs,
	const PxsCachedTransform* PX_RESTRICT transformCache,
	const PxReal* PX_RESTRICT contactDistance,
	const PxReal* PX_RESTRICT restDistances,
	const PxgShape* PX_RESTRICT gpuShapes,
	PxgParticleSystem* particleSystems,
	PxNodeIndex* PX_RESTRICT shapeToRigidRemapTable,
	PxgParticleContactWriter& writer
)
{
	// Block-shared copies of the mesh header and its SDF texture descriptor.
	__shared__ __align__(16) char sMesh[sizeof(PxgTriangleMesh)];
	PxgTriangleMesh& triangleMesh = reinterpret_cast<PxgTriangleMesh&>(*sMesh);
	__shared__ __align__(16) char sSdfTexture[sizeof(SparseSDFTexture)];
	SparseSDFTexture& sdfTexture = reinterpret_cast<SparseSDFTexture&>(*sSdfTexture);
	__shared__ PxU32 triangleIndicesS[1024]; // batch of candidate particle indices near the SDF surface
	unsigned int cmIdx = blockIdx.x;
	const bool isDiffuseParticlesThread = blockIdx.y == 1;
	PxgShape trimeshShape, shape;
	PxU32 trimeshTransformId, particleTransformId;
	LoadShapePair<PxGeometryType::eTRIANGLEMESH, PxGeometryType::ePARTICLESYSTEM>(cmInputs, cmIdx, gpuShapes,
		trimeshShape, trimeshTransformId, shape, particleTransformId);
	PxsCachedTransform trimeshTransformCached;
	PxsCachedTransform_ReadWarp(trimeshTransformCached, transformCache + trimeshTransformId);
	const PxReal contactDist = contactDistance[trimeshTransformId] + contactDistance[particleTransformId];
	const PxReal restDist = restDistances[cmIdx];
	const PxNodeIndex rigidId = shapeToRigidRemapTable[trimeshTransformId];
	const PxU32 particleSystemId = shape.particleOrSoftbodyId;
	PxgParticleSystem& particleSystem = particleSystems[particleSystemId];
	// If there is no diffuse particles or is not using PBD, skip this function
	if (isDiffuseParticlesThread && (particleSystem.mCommonData.mMaxDiffuseParticles == 0))
		return;
	// First warp loads the mesh header into shared memory.
	if (threadIdx.x < 32)
	{
		readTriangleMesh(trimeshShape, triangleMesh);
		__syncwarp();
	}
	float4* positions;
	float4* positionsPrev;
	if (isDiffuseParticlesThread)
	{
		positions = reinterpret_cast<float4*>(particleSystem.mDiffuseSortedPos_LifeTime);
		positionsPrev = reinterpret_cast<float4*>(particleSystem.mDiffuseSortedOriginPos_LifeTime);
	}
	else
	{
		positions = reinterpret_cast<float4*>(particleSystem.mSortedPositions_InvMass);
		positionsPrev = reinterpret_cast<float4*>(particleSystem.mSortedOriginPos_InvMass);
	}
	if (threadIdx.x == 0)
		sdfTexture.initialize(triangleMesh); //The texture is stored in shared memory - only one threads needs to initialize it
	__syncthreads();
	PxU32 numParticles;
	PxgParticleContactInfo* PX_RESTRICT oneWayContactInfos;
	PxNodeIndex* PX_RESTRICT oneWayNodeIndices;
	PxU32* PX_RESTRICT oneWayContactCounts;
	if (isDiffuseParticlesThread)
	{
		numParticles = *particleSystem.mNumDiffuseParticles;
		oneWayContactInfos = particleSystem.mDiffuseOneWayContactInfos;
		oneWayNodeIndices = particleSystem.mDiffuseOneWayNodeIndex;
		oneWayContactCounts = particleSystem.mDiffuseOneWayContactCount;
	}
	else
	{
		numParticles = particleSystem.mCommonData.mNumParticles;
		oneWayContactInfos = particleSystem.mOneWayContactInfos;
		oneWayNodeIndices = particleSystem.mOneWayNodeIndex;
		oneWayContactCounts = particleSystem.mOneWayContactCount;
	}
	const PxU32* PX_RESTRICT gridParticleIndex = particleSystem.mSortedToUnsortedMapping;
	// Cull distance expressed in vertex space (divide by the smallest scale component).
	PxReal cullScale = contactDist;
	cullScale /= trimeshShape.scale.scale.abs().minElement();
	const PxTransform& trimeshToWorld = trimeshTransformCached.transform;
	const PxTransform particleToTrimeshTransform = trimeshToWorld.getInverse(); // world -> mesh shape space
	bool enableCCD = (particleSystem.mData.mFlags & PxParticleFlag::eENABLE_SPECULATIVE_CCD) > 0;
	// Batched loop: findInterestingVertices advances i and fills triangleIndicesS
	// with up to 1024 particle indices whose SDF distance is within cullScale.
	for (PxU32 i = 0; i < numParticles;)
	{
		PxU32 nbFoundVertices = findInterestingVertices<32, 1024>(numParticles, positions,
			trimeshShape.scale, cullScale, sdfTexture, particleToTrimeshTransform, i, triangleIndicesS);
		bool candidateContact = false;
		PxVec3 pos;
		PxVec3 normal;
		PxReal distance;
		PxReal finalSep;
		PxVec3 posPrev;
		PxReal startDist; //distPrev
		PxU32 ind = threadIdx.x < nbFoundVertices ? triangleIndicesS[threadIdx.x] : numParticles;
		if (ind < numParticles)
		{
			// Evaluate the SDF at the current and previous particle positions.
			pos = particleToTrimeshTransform.transform(PxLoad3(positions[ind]));
			PxVec3 v = shape2Vertex(pos, trimeshShape.scale.scale, trimeshShape.scale.rotation);
			posPrev = particleToTrimeshTransform.transform(PxLoad3(positionsPrev[ind]));
			PxVec3 dirPrev;
			startDist = doVertexSDFCollision(sdfTexture, shape2Vertex(posPrev, trimeshShape.scale.scale, trimeshShape.scale.rotation), dirPrev, cullScale);
			PxVec3 dir;
			distance = doVertexSDFCollision(sdfTexture, v, dir, cullScale);
			if (distance < cullScale || startDist < cullScale)
			{
				// Bring gradients back to shape space and renormalize; distances
				// are rescaled by the same factor.
				dir = vertex2ShapeNormalVector(dir, trimeshShape.scale.scale, trimeshShape.scale.rotation);
				dirPrev = vertex2ShapeNormalVector(dirPrev, trimeshShape.scale.scale, trimeshShape.scale.rotation);
				PxReal m = dir.magnitudeSquared();
				if (m > 0.0f)
				{
					m = 1.0f / PxSqrt(m);
					distance = distance * m;
					startDist = startDist * m;
					dir = dir * m;
					dirPrev = dirPrev * m;
				}
				/*candidateContact = distance < contactDist;
				if (candidateContact)
				{
					sep = sep;
					normal = dir;
				}*/
				// Prefer a contact at the previous position (stable normal)...
				if (startDist < contactDist)
				{
					normal = dirPrev;
					finalSep = startDist - restDist;
					candidateContact = true;
				}
				// ...otherwise at the current position, if still outside the surface.
				if (!candidateContact && distance > 0.f)
				{
					if (distance < contactDist)
					{
						normal = dir;
						finalSep = (distance - restDist) + normal.dot(posPrev - pos);
						candidateContact = true;
					}
				}
				// CCD fallback: the particle tunneled inside; re-sample at the point
				// where the motion ray crosses the rest surface.
				if (enableCCD && !candidateContact)
				{
					if (distance < 0.f)
					{
						PxVec3 d = (pos - posPrev).getNormalized();
						PxReal displacement = -PxMax(0.f, startDist - restDist) / (dir.dot(d));
						PxVec3 pt = posPrev + d * displacement;
						distance = doVertexSDFCollision(sdfTexture, shape2Vertex(particleToTrimeshTransform.transform(pt), trimeshShape.scale.scale, trimeshShape.scale.rotation), dir, cullScale);
						dir = vertex2ShapeNormalVector(dir, trimeshShape.scale.scale, trimeshShape.scale.rotation);
						// NOTE(review): 'm' here is the inverse magnitude of the ORIGINAL
						// dir, reused to rescale the re-sampled dir/distance — looks like
						// an approximation (or a bug); confirm it should not be recomputed.
						if (m > 0.0f)
						{
							distance = distance * m;
							dir = dir * m;
						}
						if (distance < contactDist)
						{
							normal = dir;
							finalSep = (distance - restDist) + normal.dot(posPrev - pt);
							candidateContact = true;
						}
					}
				}
			}
			// Drop contacts suppressed by the particle/rigid element filter pairs.
			const PxU64 particleMask = PxEncodeParticleIndex(particleSystemId, gridParticleIndex[ind]);
			if (!isDiffuseParticlesThread && find(particleSystem, rigidId.getInd(), particleMask))
				candidateContact = false;
		}
		// Static bodies: store into the per-particle one-way buffers when space allows.
		if (rigidId.isStaticBody())
		{
			if (candidateContact)
			{
				PxU32 startIndex = PxU32(atomicAdd((PxI32*)&oneWayContactCounts[ind], 1));
				const PxU32 outputIndex = ind + (startIndex) * numParticles;
				if (startIndex < PxgParticleContactInfo::MaxStaticContactsPerParticle)
				{
					oneWayContactInfos[outputIndex].mNormal_PenW = make_float4(normal.x, normal.y, normal.z, finalSep/*sep - restDist + normal.dot(posPrev - pos)*/);
					oneWayNodeIndices[outputIndex] = rigidId;
					candidateContact = false; //Contact is stored, so it is not a candidate contact anymore
				}
			}
		}
		// Remaining candidates are appended to the global contact stream.
		// NOTE(review): diffuse candidates are counted into numTotalContacts here
		// but skipped by the write below — verify those slots are handled elsewhere.
		PxU32 contactIndex = globalScanExclusive<32>(candidateContact, writer.numTotalContacts);
		if (candidateContact && !isDiffuseParticlesThread)
		{
			normal = trimeshToWorld.rotate(normal);
			PxVec4 normalPen(normal.x, normal.y, normal.z, finalSep/*sep - restDist + normal.dot(posPrev - pos)*/);
			writer.writeContact(contactIndex, normalPen,
				PxEncodeParticleIndex(particleSystemId, ind), rigidId);
		}
	}
}
// BV32 midphase: collides the particles (or diffuse particles, blockIdx.z == 1)
// of one particle system against one triangle mesh. Work is distributed one
// warp per group of 32 particles; each warp runs a cooperative BV32 traversal
// via PsTreeContactGenTraverser using the shared-memory scratch s_warpScratch.
template<unsigned int WarpsPerBlock>
__device__ static inline void psMeshMidphaseCollision(
	const bool isTGS,
	const PxReal toleranceLength,
	const PxgContactManagerInput* PX_RESTRICT cmInputs,
	const PxsCachedTransform* PX_RESTRICT transformCache,
	const PxReal* PX_RESTRICT contactDistance,
	const PxReal* PX_RESTRICT restDistances,
	const PxgShape* PX_RESTRICT gpuShapes,
	PsMidphaseScratch* s_warpScratch,
	PxgParticleSystem* particleSystems,
	PxNodeIndex* PX_RESTRICT shapeToRigidRemapTable,
	PxgParticleContactWriter& writer
)
{
	//thread index in warp
	const unsigned int idxInWarp = threadIdx.x;
	//warp index
	const unsigned int warpIdx = threadIdx.y;//(idx >> LOG2_WARP_SIZE);
	//warp index in block
	//const unsigned int idx = idxInWarp + warpIdx * WARP_SIZE;
	unsigned int cmIdx = blockIdx.y;
	const bool isDiffuseParticlesThread = blockIdx.z == 1;
	PxgShape trimeshShape, shape;
	PxU32 trimeshTransformId, particleTransformId;
	LoadShapePair<PxGeometryType::eTRIANGLEMESH, PxGeometryType::ePARTICLESYSTEM>(cmInputs, cmIdx, gpuShapes,
		trimeshShape, trimeshTransformId, shape, particleTransformId);
	PxsCachedTransform trimeshTransformCached;
	PxsCachedTransform_ReadWarp(trimeshTransformCached, transformCache + trimeshTransformId);
	const PxReal contactDist = contactDistance[trimeshTransformId] + contactDistance[particleTransformId];
	const PxReal restDist = restDistances[cmIdx];
	const PxNodeIndex rigidId = shapeToRigidRemapTable[trimeshTransformId];
	const PxU32 particleSystemId = shape.particleOrSoftbodyId;
	PxgParticleSystem& particleSystem = particleSystems[particleSystemId];
	// If there is no diffuse particles or is not using PBD, skip this function
	if (isDiffuseParticlesThread && (particleSystem.mCommonData.mMaxDiffuseParticles == 0))
		return;
	const PxU8 * trimeshGeomPtr = reinterpret_cast<const PxU8 *>(trimeshShape.hullOrMeshPtr);
	// Lane 0 of each warp fills the warp's shared scratch.
	if (idxInWarp == 0)
	{
		s_warpScratch->meshToWorld = trimeshTransformCached.transform;
		s_warpScratch->trimeshScale = trimeshShape.scale;
		readTriangleMesh(trimeshGeomPtr, s_warpScratch->bv32PackedNodes, s_warpScratch->trimeshVerts, s_warpScratch->trimeshTriIndices);
		if (isDiffuseParticlesThread)
		{
			s_warpScratch->currentPositions = reinterpret_cast<float4*>(particleSystem.mDiffuseSortedOriginPos_LifeTime);
			s_warpScratch->predictedPositions = reinterpret_cast<float4*>(particleSystem.mDiffuseSortedPos_LifeTime);
		}
		else
		{
			s_warpScratch->currentPositions = reinterpret_cast<float4*>(particleSystem.mSortedOriginPos_InvMass);
			s_warpScratch->predictedPositions = reinterpret_cast<float4*>(particleSystem.mSortedPositions_InvMass);
		}
		s_warpScratch->enableCCD = (particleSystem.mData.mFlags & PxParticleFlag::eENABLE_SPECULATIVE_CCD) > 0;
	}
	__syncwarp();
	PxU32 numParticles;
	PxgParticleContactInfo* PX_RESTRICT oneWayContactInfos;
	PxNodeIndex* PX_RESTRICT oneWayNodeIndices;
	PxU32* PX_RESTRICT oneWayContactCounts;
	if (isDiffuseParticlesThread)
	{
		numParticles = *particleSystem.mNumDiffuseParticles;
		oneWayContactInfos = particleSystem.mDiffuseOneWayContactInfos;
		oneWayNodeIndices = particleSystem.mDiffuseOneWayNodeIndex;
		oneWayContactCounts = particleSystem.mDiffuseOneWayContactCount;
	}
	else
	{
		numParticles = particleSystem.mCommonData.mNumParticles;
		oneWayContactInfos = particleSystem.mOneWayContactInfos;
		oneWayNodeIndices = particleSystem.mOneWayNodeIndex;
		oneWayContactCounts = particleSystem.mOneWayContactCount;
	}
	// Distribute nbWarpsRequired particle groups over the fixed launch grid.
	const PxU32 nbWarpsRequired = (numParticles + WARP_SIZE - 1) / WARP_SIZE;
	const PxU32 totalNumWarps = PxgParticleSystemKernelGridDim::PS_MESH_COLLISION * PS_MIDPHASE_COLLISION_WAPRS_PER_BLOCK;
	const PxU32 nbIterationsPerWarps = (nbWarpsRequired + totalNumWarps - 1) / totalNumWarps;
	const PxU32* PX_RESTRICT gridParticleIndex = particleSystem.mSortedToUnsortedMapping;
	for (PxU32 i = 0; i < nbIterationsPerWarps; ++i)
	{
		const PxU32 workIndex = i + (warpIdx + WarpsPerBlock * blockIdx.x) * nbIterationsPerWarps;
		if (workIndex < nbWarpsRequired)
		{
			const PxU32 particleIndex = idxInWarp + workIndex * WARP_SIZE;
			// NOTE(review): particleIndex can be >= numParticles in the last group and
			// the mapping lookup below is not guarded — verify the sorted mapping
			// buffer is padded. Also, the per-lane 'return' on a filter hit diverges
			// from the warp-cooperative traversal below (which uses FULL_MASK
			// warp intrinsics) — confirm this is safe.
			const PxU64 particleMask = PxEncodeParticleIndex(particleSystemId, gridParticleIndex[particleIndex]);
			if (!isDiffuseParticlesThread && find(particleSystem, rigidId.getInd(), particleMask))
				return;
			PsTreeContactGenTraverser traverser(
				s_warpScratch,
				numParticles,
				particleIndex,
				isDiffuseParticlesThread,
				isTGS,
				contactDist
			);
			bv32TreeTraversal<PsTreeContactGenTraverser, WarpsPerBlock>(s_warpScratch->bv32PackedNodes, s_warpScratch->sBv32Nodes, traverser);
			traverser.finalizeFullWarp(idxInWarp, particleSystemId,
				rigidId,
				oneWayContactInfos,
				oneWayNodeIndices,
				oneWayContactCounts,
				writer,
				isDiffuseParticlesThread,
				restDist
			);
		}
	}
}
// Kernel entry point for SDF-based particle-vs-mesh collision. One block per
// contact-manager pair (blockIdx.x); blockIdx.y == 1 processes the pair's
// diffuse particles. All work is delegated to psSdfMeshMidphaseCollision.
extern "C" __global__
__launch_bounds__(1024, 1)
void ps_sdfMeshCollisonLaunch(
	const PxReal tolerenceLength,
	const PxgContactManagerInput* PX_RESTRICT cmInputs,
	const PxsCachedTransform* PX_RESTRICT transformCache,
	const PxReal* PX_RESTRICT contactDistance,
	const PxReal* PX_RESTRICT restDistances,
	const PxgShape* PX_RESTRICT gpuShapes,
	PxgParticleSystem* PX_RESTRICT particleSystems, //output
	PxNodeIndex* PX_RESTRICT shapeToRigidRemapTable,
	PxgParticleContactWriter writer
)
{
	psSdfMeshMidphaseCollision(
		tolerenceLength,
		cmInputs,
		transformCache,
		contactDistance,
		restDistances,
		gpuShapes,
		particleSystems,
		shapeToRigidRemapTable,
		writer
	);
}
// Kernel entry point for BV32-midphase particle-vs-mesh collision. Carves one
// PsMidphaseScratch per warp out of shared memory (threadIdx.y is the warp
// index within the block) and forwards everything to psMeshMidphaseCollision.
extern "C" __global__
//__launch_bounds__(MIDPHASE_WARPS_PER_BLOCK * WARP_SIZE, 4)
void ps_meshCollisonLaunch(
	const bool isTGS,
	const PxReal tolerenceLength,
	const PxgContactManagerInput* PX_RESTRICT cmInputs,
	const PxsCachedTransform* PX_RESTRICT transformCache,
	const PxReal* PX_RESTRICT contactDistance,
	const PxReal* PX_RESTRICT restDistances,
	const PxgShape* PX_RESTRICT gpuShapes,
	PxgParticleSystem* PX_RESTRICT particleSystems, //output
	PxNodeIndex* PX_RESTRICT shapeToRigidRemapTable,
	PxgParticleContactWriter writer
)
{
	// Raw byte storage for one scratch structure per warp in this block.
	__shared__ __align__(16) PxU8 sScratchBuffers[PS_MIDPHASE_COLLISION_WAPRS_PER_BLOCK][sizeof(PsMidphaseScratch)];
	PsMidphaseScratch* warpScratch = reinterpret_cast<PsMidphaseScratch*>(sScratchBuffers[threadIdx.y]);
	psMeshMidphaseCollision<PS_MIDPHASE_COLLISION_WAPRS_PER_BLOCK>(
		isTGS, tolerenceLength, cmInputs, transformCache,
		contactDistance, restDistances, gpuShapes,
		warpScratch, particleSystems, shapeToRigidRemapTable, writer);
}

View File

@@ -0,0 +1,765 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __SDF_COLLISION_CUH__
#define __SDF_COLLISION_CUH__
#include "cudaNpCommon.h"
#include "foundation/PxVec3.h"
#include "geometry/PxMeshScale.h"
#include "GuDistancePointTetrahedron.h"
#include "GuTriangleRefinement.h"
#include "triangleMesh.cuh"
#include "utils.cuh"
namespace physx
{
// Shared-memory-resident descriptor for sampling a mesh SDF stored in CUDA
// 3D textures. Supports either a single dense grid or a two-level sparse
// layout: a coarse background grid plus high-resolution subgrids allocated
// only in the narrow band around the zero isolevel.
struct PX_ALIGN_PREFIX(16) SparseSDFTexture
{
	//Dense info - for coarse resolution background grid
	PxVec3 sdfBoxLower;     // lower corner of the SDF support box (mesh vertex space)
	PxReal sdfDx;           // grid cell size
	PxVec3 sdfBoxHigher;    // upper corner of the SDF support box
	PxReal invSdfDx;        // 1 / sdfDx
	CUtexObject texture;    // dense/background SDF texture
	//Sparse info - for high resolution in narrow band around isolevel 0
	CUtexObject textureSubgrids;      // 3D texture atlas holding the subgrid blocks
	uint3 coarseSize;                 // background grid dimensions
	PxU32 subgridSize;                // samples per subgrid edge; 0 == fully dense SDF
	const PxU32* subgridStartSlots;   // packed atlas coordinates per coarse cell (0xFFFFFFFF == no subgrid)
	PxReal fineToCoarse; //Inverse of subgridSize (only meaningful when subgridSize != 0; Sample() takes the dense path otherwise)
	uint4 sdfDims;                    // fine-resolution SDF dimensions
	PxReal subgridsMinSdfValue;       // dequantization offset for subgrid samples
	PxReal subgridsSdfValueRange;     // dequantization scale for subgrid samples
	// Populates this descriptor from a mesh loaded into shared memory.
	// Only one thread should call this (the struct itself lives in shared memory).
	PX_INLINE __device__ void initialize(const PxgTriangleMesh& mesh)
	{
		texture = mesh.mTexObject;
		// Dense SDFs shrink the support box by half a cell so sampling stays
		// within texel centers; sparse SDFs use the full box.
		const PxReal margin = mesh.subgridSize == 0 ? 0.5f : 0.0f;
		sdfBoxLower = mesh.meshLower + margin * PxVec3(mesh.spacing);
		sdfBoxHigher = mesh.meshLower + PxVec3((mesh.sdfDims.x - margin) * mesh.spacing, (mesh.sdfDims.y - margin) * mesh.spacing, (mesh.sdfDims.z - margin) * mesh.spacing);
		invSdfDx = 1.f / mesh.spacing;
		sdfDx = mesh.spacing;
		textureSubgrids = mesh.mTexObjectSparse;
		subgridStartSlots = mesh.subgridStarts;
		sdfDims = mesh.sdfDims;
		coarseSize.x = mesh.sdfDims.x / PxMax(1u, mesh.subgridSize);
		coarseSize.y = mesh.sdfDims.y / PxMax(1u, mesh.subgridSize);
		coarseSize.z = mesh.sdfDims.z / PxMax(1u, mesh.subgridSize);
		//sdfDx = 1.0f / invSdfDx; // dx;
		subgridSize = mesh.subgridSize;
		fineToCoarse = 1.0f / mesh.subgridSize;
		subgridsMinSdfValue = mesh.subgridsMinSdfValue;
		subgridsSdfValueRange = mesh.subgridsMaxSdfValue - mesh.subgridsMinSdfValue;
		//if (threadIdx.x == 0)
		//	printf("%f, %f\n", mesh.subgridsMinSdfValue, mesh.subgridsMaxSdfValue);
	}
private:
	// Linear index into the coarse grid.
	static PX_INLINE __device__ PxU32 idx(PxU32 x, PxU32 y, PxU32 z, PxU32 width, PxU32 height)
	{
		return z * width * height + y * width + x;
	}
	// Decodes a packed (10|10|10-bit) atlas slot and offsets f to the subgrid's
	// storage location inside the 3D atlas texture. Each subgrid block occupies
	// (subgridSize + 1)^3 texels so neighboring blocks share boundary samples.
	static PX_INLINE __device__ void applySubgridStart(PxU32 triple, PxVec3& f, const PxU32 subgridSize)
	{
		f.x += (triple & 1023) * (subgridSize + 1);
		triple = triple >> 10;
		f.y += (triple & 1023) * (subgridSize + 1);
		triple = triple >> 10;
		f.z += (triple & 1023) * (subgridSize + 1);
	}
	// Subgrid texels are stored normalized; map back to real SDF values.
	PX_INLINE __device__ PxReal applySubgridSdfScale(PxReal rawSdfValue) const
	{
		return rawSdfValue * subgridsSdfValueRange + subgridsMinSdfValue;
	}
public:
	// Trilinearly samples the SDF at fine-grid coordinate f (in cells,
	// relative to sdfBoxLower). Falls back to the coarse background grid
	// where no subgrid is allocated.
	PX_INLINE __device__ PxReal Sample(PxVec3 f) const
	{
		if (subgridSize == 0)
		{
			//Dense sdf
			return tex3D<float>(texture, f.x + 0.5f, f.y + 0.5f, f.z + 0.5f);
		}
		else
		{
			// Locate the coarse cell containing f.
			const PxU32 xBase = PxClamp(PxU32(f.x * fineToCoarse), 0u, coarseSize.x - 1);
			const PxU32 yBase = PxClamp(PxU32(f.y * fineToCoarse), 0u, coarseSize.y - 1);
			const PxU32 zBase = PxClamp(PxU32(f.z * fineToCoarse), 0u, coarseSize.z - 1);
			const PxU32 i = idx(xBase, yBase, zBase, coarseSize.x, coarseSize.y);
			const PxU32 startSlot = subgridStartSlots[i];
			if (startSlot == 0xFFFFFFFF)
			{
				//Evaluate coarse
				f *= fineToCoarse;
				//+0.5: Align with texel center
				//https://forums.developer.nvidia.com/t/understanding-cuda-texture-2d-linear-interpolation/213924
				return tex3D<float>(texture, f.x + 0.5f, f.y + 0.5f, f.z + 0.5f);
			}
			else
			{
				// Local coordinate within the subgrid block.
				// NOTE(review): the upper clamp bound subgridSize + 1 looks one past the
				// last local sample index (subgridSize) — presumably covered by the
				// shared-boundary texel; confirm.
				f.x = PxClamp(f.x - xBase * subgridSize, 0.0f, PxReal(subgridSize + 1));
				f.y = PxClamp(f.y - yBase * subgridSize, 0.0f, PxReal(subgridSize + 1));
				f.z = PxClamp(f.z - zBase * subgridSize, 0.0f, PxReal(subgridSize + 1));
				//Transform to subgrid storage location in 3D texture
				applySubgridStart(startSlot, f, subgridSize);
				//+0.5: Align with texel center
				//https://forums.developer.nvidia.com/t/understanding-cuda-texture-2d-linear-interpolation/213924
				PxReal v = tex3D<float>(textureSubgrids, f.x + 0.5f, f.y + 0.5f, f.z + 0.5f);
				return applySubgridSdfScale(v);
			}
		}
	}
}
PX_ALIGN_SUFFIX(16);
// Evaluates the SDF at localPos (mesh vertex space). Query points outside the
// SDF support box are projected onto the box and the projection distance is
// added to the sampled value, giving a conservative estimate outside the grid.
template<typename T>
PX_INLINE __device__ PxReal PxSdfDistance(const T& texture, const PxVec3& localPos)
{
	// Project the query point onto the SDF support box.
	const PxVec3 insidePt = localPos.maximum(texture.sdfBoxLower).minimum(texture.sdfBoxHigher);
	// How far the original point lies outside the box (zero when inside).
	const PxReal outsideDist = (localPos - insidePt).magnitude();
	// Convert to grid-cell coordinates and sample the texture.
	const PxVec3 gridCoord = (insidePt - texture.sdfBoxLower) * texture.invSdfDx;
	return texture.Sample(gridCoord) + outsideDist;
}
// Central-difference gradient of the SDF at localPos (unnormalized).
// In the grid interior the differences are taken directly on texture samples
// one cell apart; near the border (or outside the box) it falls back to
// differencing the clamped distance function PxSdfDistance.
template<typename T>
PX_INLINE __device__ PxVec3 PxVolumeGrad(const T& texture, const PxVec3& localPos)
{
	const PxVec3 insidePt = localPos.maximum(texture.sdfBoxLower).minimum(texture.sdfBoxHigher);
	const PxVec3 gridCoord = (insidePt - texture.sdfBoxLower) * texture.invSdfDx;
	PxVec3 gradient;
	const bool interior =
		gridCoord.x >= 1.0f && gridCoord.x <= texture.sdfDims.x - 2.0f &&
		gridCoord.y >= 1.0f && gridCoord.y <= texture.sdfDims.y - 2.0f &&
		gridCoord.z >= 1.0f && gridCoord.z <= texture.sdfDims.z - 2.0f;
	if (interior)
	{
		// Fast path: neighbors one cell away are guaranteed inside the grid.
		gradient.x = texture.Sample(PxVec3(gridCoord.x + 1.0f, gridCoord.y, gridCoord.z)) -
			texture.Sample(PxVec3(gridCoord.x - 1.0f, gridCoord.y, gridCoord.z));
		gradient.y = texture.Sample(PxVec3(gridCoord.x, gridCoord.y + 1.0f, gridCoord.z)) -
			texture.Sample(PxVec3(gridCoord.x, gridCoord.y - 1.0f, gridCoord.z));
		gradient.z = texture.Sample(PxVec3(gridCoord.x, gridCoord.y, gridCoord.z + 1.0f)) -
			texture.Sample(PxVec3(gridCoord.x, gridCoord.y, gridCoord.z - 1.0f));
		return gradient;
	}
	// Border/outside path: difference the clamped distance field instead.
	gradient.x = PxSdfDistance(texture, localPos + PxVec3(texture.sdfDx, 0.f, 0.f)) -
		PxSdfDistance(texture, localPos - PxVec3(texture.sdfDx, 0.f, 0.f));
	gradient.y = PxSdfDistance(texture, localPos + PxVec3(0.f, texture.sdfDx, 0.f)) -
		PxSdfDistance(texture, localPos - PxVec3(0.f, texture.sdfDx, 0.f));
	gradient.z = PxSdfDistance(texture, localPos + PxVec3(0.f, 0.f, texture.sdfDx)) -
		PxSdfDistance(texture, localPos - PxVec3(0.f, 0.f, texture.sdfDx));
	return gradient;
}
// Samples the SDF at localPos and derives a contact direction from the given
// (unnormalized) SDF gradient. For points outside the support box the box
// offset is folded into both the direction and the returned distance.
// Returns PX_MAX_F32 when the sampled distance is not below tolerance.
template<typename T>
PX_INLINE __device__ PxReal PxSdfSampleWithGrad(const T& texture, const PxVec3& localPos,
	const PxVec3& sdfGradient, PxVec3& dir, PxReal tolerance = PX_MAX_F32)
{
	const PxVec3 insidePt = localPos.maximum(texture.sdfBoxLower).minimum(texture.sdfBoxHigher);
	const PxVec3 outsideOffset = localPos - insidePt; // zero when inside the box
	const PxVec3 gridCoord = (insidePt - texture.sdfBoxLower) * texture.invSdfDx;
	const PxReal dist = texture.Sample(gridCoord);
	if (dist >= tolerance)
		return PX_MAX_F32;
	// Blend the gradient direction with the out-of-box offset so the
	// direction points away from the surface even beyond the SDF support.
	dir = (sdfGradient.getNormalized() * PxAbs(dist) + outsideOffset).getNormalized();
	return dist + dir.dot(outsideOffset);
}
// Point-vs-SDF query: evaluates the distance and contact direction at vertex v.
// Returns PX_MAX_F32 when v is farther than tolerance from the surface.
static __device__ PxReal doVertexSDFCollision(SparseSDFTexture& texture, const PxVec3& v,
	PxVec3& dir, PxReal tolerance)
{
	PxVec3 grad = PxVolumeGrad(texture, v);
	const PxReal gradLen = grad.normalize();
	if (gradLen == 0.0f)
	{
		// The gradient vanishes at SDF discontinuities (e.g. the exact center of
		// a cube); substitute a fixed arbitrary unit direction to escape it.
		grad = PxVec3(0.5448762f, 0.672269f, 0.398825f).getNormalized();
	}
	return PxSdfSampleWithGrad(texture, v, grad, dir, tolerance);
}
//#define gr 1.6180339887498948482045868343656381177203091798057628621354486227f
#define invGr 0.6180339887498948482045868343656381177203091798057628621354486227f
// Segment-vs-SDF query: golden-section search for the parameter t in [0,1]
// that minimizes the SDF distance along v0..v1, then evaluates distance and
// contact direction at that point. Outputs the closest point and t.
static __device__ PxReal doSegmentSDFCollision(SparseSDFTexture& texture, const PxVec3& v0, const PxVec3& v1,
	PxVec3& point, PxVec3& dir, PxReal tolerance, PxReal& t)
{
	// Bracket [lo, hi] with the two interior golden-ratio probes m1 < m2.
	PxReal lo = 0.0f;
	PxReal hi = 1.0f;
	PxReal m1 = hi - (hi - lo) * invGr;
	PxReal m2 = lo + (hi - lo) * invGr;
	for (PxU32 k = 0; k < 16; ++k)
	{
		const PxVec3 probe1 = (1.0f - m1) * v0 + m1 * v1;
		const PxReal dist1 = PxSdfDistance(texture, probe1);
		const PxVec3 probe2 = (1.0f - m2) * v0 + m2 * v1;
		const PxReal dist2 = PxSdfDistance(texture, probe2);
		// Keep the sub-interval that contains the smaller sample.
		if (dist1 < dist2)
			hi = m2;
		else
			lo = m1;
		m1 = hi - (hi - lo) * invGr;
		m2 = lo + (hi - lo) * invGr;
	}
	t = 0.5f * (lo + hi);
	const PxVec3 closest = (1.0f - t) * v0 + t * v1;
	PxVec3 grad = PxVolumeGrad(texture, closest);
	if (grad.normalize() == 0.0f)
	{
		// Vanishing gradient at a discontinuity (e.g. the exact center of a
		// cube): fall back to a fixed arbitrary unit direction.
		grad = PxVec3(0.5448762f, 0.672269f, 0.398825f).getNormalized();
	}
	point = closest;
	return PxSdfSampleWithGrad(texture, closest, grad, dir, tolerance);
}
// Tetrahedron-vs-SDF query: finds (approximately) the point of the
// tetrahedron v0..v3 closest to the SDF surface via projected gradient
// descent in barycentric coordinates, then evaluates distance and contact
// direction there. Outputs the closest point.
static __device__ PxReal doTetrahedronSDFCollision(SparseSDFTexture& texture, const PxVec3& v0, const PxVec3& v1, const PxVec3& v2, const PxVec3& v3,
	PxVec3& point, PxVec3& dir, PxReal tolerance/*, PxVec4& uvw*/)
{
	// Choose the starting iterate among the centroid and the four vertices,
	// whichever has the smallest SDF distance.
	PxVec3 center = (v0 + v1 + v2 + v3) * 0.25f;
	PxVec3 p = center;
	PxReal dist = PxSdfDistance(texture, p);
	PxReal d0 = PxSdfDistance(texture, v0);
	PxReal d1 = PxSdfDistance(texture, v1);
	PxReal d2 = PxSdfDistance(texture, v2);
	PxReal d3 = PxSdfDistance(texture, v3);
	PxVec4 uvw = PxVec4(0.f); // barycentric coordinates of the iterate
	if (d0 < d1 && d0 < d2 && d0 < d3 && d0 < dist)
	{
		dist = d0;
		p = v0;
		uvw.x = 1.f;
	}
	else if (d1 < d2 && d1 < d3 && d1 < dist)
	{
		dist = d1;
		p = v1;
		uvw.y = 1.f;
	}
	else if (d2 < d3 && d2 < dist)
	{
		dist = d2;
		p = v2;
		uvw.z = 1.f;
	}
	else if (d3 < dist)
	{
		dist = d3;
		p = v3;
		uvw.w = 1.f;
	}
	else
		uvw = PxVec4(0.25f); // centroid wins
	// Largest vertex-to-iterate distance; used to scale the initial step size.
	PxReal difference = PxSqrt(
		PxMax(
			PxMax((v0 - p).magnitudeSquared(), (v1 - p).magnitudeSquared()),
			PxMax((v2 - p).magnitudeSquared(), (v3 - p).magnitudeSquared())));
	const PxReal eps = 1e-4;
	const PxReal toleranceSq = 1e-3f*1e-3f; // convergence threshold on the barycentric update
	PxVec3 sdfGradient;
	PxReal step = 1.f / (2.f*difference);
	// Projected gradient descent with a geometrically shrinking step.
	for (PxU32 iter = 0; iter < 16; ++iter)
	{
		sdfGradient = PxVolumeGrad(texture, p);
		// NOTE(review): normalizing here makes the step magnitude depend only on
		// geometry, not on the (unnormalized) gradient scale — confirm intended.
		if (sdfGradient.normalize() == 0.0f)
		{
			//We ran into a discontinuity e. g. the exact center of a cube
			//Just pick an arbitrary gradient of unit length to move out of the discontinuity
			sdfGradient = PxVec3(0.5448762f, 0.672269f, 0.398825f).getNormalized();
		}
		// Directional derivatives of the SDF w.r.t. each barycentric coordinate.
		PxReal dfdu = sdfGradient.dot(v0 - p);
		PxReal dfdv = sdfGradient.dot(v1 - p);
		PxReal dfdw = sdfGradient.dot(v2 - p);
		PxReal dfd4 = sdfGradient.dot(v3 - p);
		PxVec4 newUVW;
		{
			newUVW = uvw;
			newUVW.x -= step * dfdu;
			newUVW.y -= step * dfdv;
			newUVW.z -= step * dfdw;
			newUVW.w -= step * dfd4;
			step = step * 0.8f;
			// If the update left the simplex, project it back by finding the
			// closest point on the canonical tetrahedron in barycentric space.
			if (((newUVW.x <= -eps || newUVW.x >= 1.f + eps) || (newUVW.y <= -eps || newUVW.y >= 1.f + eps) ||
				(newUVW.z <= -eps || newUVW.z >= 1.f + eps) || (newUVW.w <= -eps || newUVW.w >= 1.f + eps)))
			{
				PxVec3 cp = Gu::closestPtPointTetrahedron(newUVW.getXYZ(), PxVec3(1.0f, 0.0f, 0.0f), PxVec3(0.0f, 1.0f, 0.0f), PxVec3(0.f, 0.f, 1.f), PxVec3(0.0f));
				newUVW.x = cp.x;
				newUVW.y = cp.y;
				newUVW.z = cp.z;
				newUVW.w = 1.0f - cp.x - cp.y - cp.z; //TODO: Verify this logic
				assert(newUVW.w >= -eps && newUVW.w <= 1.f + eps);
			}
			p = v0 * newUVW.x + v1 * newUVW.y + v2 * newUVW.z + v3 * newUVW.w;
			// Converged when the barycentric update becomes negligible.
			if ((uvw - newUVW).magnitudeSquared() < toleranceSq)
			{
				/*if(iter != 0)
					printf("Iter = %i\n", iter);*/
				break;
			}
			uvw = newUVW;
		}
	}
	// Final distance/direction evaluation at the converged point.
	sdfGradient = PxVolumeGrad(texture, p);
	point = p;
	dist = PxSdfSampleWithGrad(texture, p, sdfGradient, dir, tolerance);
	return dist;
}
//Computes a contact between a triangle (v0, v1, v2) and a signed distance field.
//The point on the triangle with the (locally) smallest SDF value is found by a projected
//gradient descent on the barycentric coordinates uvw, then the SDF is sampled there.
//point (out): location of the detected minimum, dir (out): contact direction from
//PxSdfSampleWithGrad, tolerance: distance threshold forwarded to the SDF sampler.
//Returns the SDF distance at 'point'.
static __device__ PxReal doTriangleSDFCollision(SparseSDFTexture& texture, const PxVec3& v0, const PxVec3& v1, const PxVec3& v2,
	PxVec3& point, PxVec3& dir, PxReal tolerance)
{
	const PxReal third = 1.f / 3.f;
	PxVec3 center = (v0 + v1 + v2) * third;
	PxVec3 p = center;
	PxReal dist = PxSdfDistance(texture, p);
	PxReal d0 = PxSdfDistance(texture, v0);
	PxReal d1 = PxSdfDistance(texture, v1);
	PxReal d2 = PxSdfDistance(texture, v2);
	//PxVec3 nor = (v1 - v0).cross(v2 - v0);
	PxVec3 uvw(0.f); //Barycentric coordinates of p w.r.t. the triangle
	// choose starting iterate among centroid and triangle vertices
	if (d0 < d1 && d0 < d2 && d0 < dist)
	{
		p = v0;
		uvw.x = 1.f;
	}
	else if (d1 < d2 && d1 < dist)
	{
		p = v1;
		uvw.y = 1.f;
	}
	else if (d2 < dist)
	{
		p = v2;
		uvw.z = 1.f;
	}
	else
		uvw = PxVec3(third); //Centroid
	//Radius of a sphere around p that contains all three vertices; used to scale the step size
	PxReal difference = PxSqrt(PxMax((v0 - p).magnitudeSquared(), PxMax((v1 - p).magnitudeSquared(),
		(v2 - p).magnitudeSquared())));
	const PxReal toleranceSq = 1e-3f*1e-3f; //Convergence threshold on the barycentric step (squared)
	PxVec3 sdfGradient;
	PxReal step = 1.f / (2.f*difference); //Initial step size, inversely proportional to the triangle's extent
	for (PxU32 iter = 0; iter < 16; ++iter)
	{
		sdfGradient = PxVolumeGrad(texture, p);
		//Normalized so the directional derivatives below depend only on the gradient's
		//direction, keeping the step size heuristic independent of its magnitude
		if (sdfGradient.normalize() == 0.0f)
		{
			//We ran into a discontinuity e. g. the exact center of a cube
			//Just pick an arbitrary gradient of unit length to move out of the discontinuity
			sdfGradient = PxVec3(0.571846586f, 0.705545099f, 0.418566116f);
		}
		//Directional derivatives of the SDF w.r.t. each barycentric coordinate
		PxReal dfdu = sdfGradient.dot(v0 - p);
		PxReal dfdv = sdfGradient.dot(v1 - p);
		PxReal dfdw = sdfGradient.dot(v2 - p);
		PxVec3 newUVW;
		{
			//Gradient descent step in barycentric space with a geometrically shrinking step size
			newUVW = uvw;
			newUVW.x -= step * dfdu;
			newUVW.y -= step * dfdv;
			newUVW.z -= step * dfdw;
			step = step * 0.8f;
			//Project back onto the valid barycentric domain of the triangle
			newUVW = Gu::closestPtPointBaryTriangle(newUVW);
			p = v0 * newUVW.x + v1 * newUVW.y + v2 * newUVW.z;
			//Stop once the barycentric update is below the tolerance
			if ((uvw - newUVW).magnitudeSquared() < toleranceSq)
			{
				/*if(iter != 0)
					printf("Iter = %i\n", iter);*/
				break;
			}
			uvw = newUVW;
		}
	}
	sdfGradient = PxVolumeGrad(texture, p);
	point = p;
	dist = PxSdfSampleWithGrad(texture, p, sdfGradient, dir, tolerance);
	return dist;
}
//Block-cooperatively appends one element per emitting thread to a shared memory buffer
//of capacity TargetCount; elements that do not fit are dropped.
//Returns the new number of elements in the buffer (clamped to TargetCount).
//T should be of type PxU32 or a multiple of it (uint2, uint3, uint4); its first PxU32
//holds the primitive index (see the resume-offset logic below).
//When the buffer gets filled to exactly TargetCount elements, 'offset' is set to the
//index right after the last accepted element so the caller can resume from there.
template<typename T, PxU32 NbWarps, const PxU32 TargetCount>
__device__ PxU32 addIndexToBuffer(bool threadEmitsElement, T index, PxU32 nbElementsInBuffer, T* sharedMemoryBuffer, PxU32& offset)
{
	PxU32 total;
	//Exclusive scan over the whole block yields each emitting thread's write slot
	PxU32 exclusiveSum = threadBlockScanExclusive<NbWarps>(threadEmitsElement, total);
	if (threadEmitsElement)
	{
		PxU32 writeOffset = nbElementsInBuffer + exclusiveSum;
		if (writeOffset < TargetCount)
		{
			sharedMemoryBuffer[writeOffset] = index;
		}
		if (writeOffset == (TargetCount - 1))
		{
			//Get the first element
			offset = ((PxU32*)(&index))[0] + 1; //The thread that added the last element to the sharedIndices writes the new offset
		}
	}
	return min(total + nbElementsInBuffer, TargetCount);
}
//Fetches the three vertices of 'triangle', applies the mesh scale and the transform t,
//and optionally replaces the result with the encoded sub-triangle.
PX_FORCE_INLINE __device__ void getTriangleVertices(const uint4& triangle, const float4* meshVerts, const PxMeshScale& scale, const PxTransform t,
	PxU32 encodedSubIndex, PxVec3& a, PxVec3& b, PxVec3& c)
{
	const PxVec3 shapeA = vertex2Shape(PxLoad3(meshVerts[triangle.x]), scale.scale, scale.rotation);
	const PxVec3 shapeB = vertex2Shape(PxLoad3(meshVerts[triangle.y]), scale.scale, scale.rotation);
	const PxVec3 shapeC = vertex2Shape(PxLoad3(meshVerts[triangle.z]), scale.scale, scale.rotation);

	a = t.transform(shapeA);
	b = t.transform(shapeB);
	c = t.transform(shapeC);

	if (encodedSubIndex != 0)
		Gu::getSubTriangleEncoded(a, b, c, encodedSubIndex);
}
//Overload that first looks up the triangle's vertex indices in the index buffer.
PX_FORCE_INLINE __device__ void getTriangleVertices(const uint4* meshInd, const float4* meshVerts, const PxMeshScale& scale, const PxTransform t,
	PxU32 triangleIndex, PxU32 encodedSubIndex, PxVec3& a, PxVec3& b, PxVec3& c)
{
	const uint4 triangle = meshInd[triangleIndex];
	getTriangleVertices(triangle, meshVerts, scale, t, encodedSubIndex, a, b, c);
}
//Overload reading indices and vertices directly from a PxgTriangleMesh.
PX_FORCE_INLINE __device__ void getTriangleVertices(const PxgTriangleMesh& mesh, const PxMeshScale& scale, const PxTransform t,
	PxU32 triangleIndex, PxU32 encodedSubIndex, PxVec3& a, PxVec3& b, PxVec3& c)
{
	const uint4 triangle = mesh.indices[triangleIndex];

	const PxVec3 shapeA = vertex2Shape(PxLoad3(mesh.trimeshVerts[triangle.x]), scale.scale, scale.rotation);
	const PxVec3 shapeB = vertex2Shape(PxLoad3(mesh.trimeshVerts[triangle.y]), scale.scale, scale.rotation);
	const PxVec3 shapeC = vertex2Shape(PxLoad3(mesh.trimeshVerts[triangle.z]), scale.scale, scale.rotation);

	a = t.transform(shapeA);
	b = t.transform(shapeB);
	c = t.transform(shapeC);

	if (encodedSubIndex != 0)
		Gu::getSubTriangleEncoded(a, b, c, encodedSubIndex);
}
//Loads a single mesh vertex, applies the mesh scale and transforms it by t.
PX_FORCE_INLINE __device__ PxVec3 getVertex(const float4* meshVerts, const PxMeshScale& scale, const PxTransform t, PxU32 vertexIndex)
{
	const PxVec3 shapeSpaceVertex = vertex2Shape(PxLoad3(meshVerts[vertexIndex]), scale.scale, scale.rotation);
	return t.transform(shapeSpaceVertex);
}
//Overload reading the vertex directly from a PxgTriangleMesh.
PX_FORCE_INLINE __device__ PxVec3 getVertex(const PxgTriangleMesh& mesh, const PxMeshScale& scale, const PxTransform t, PxU32 vertexIndex)
{
	const PxVec3 shapeSpaceVertex = vertex2Shape(PxLoad3(mesh.trimeshVerts[vertexIndex]), scale.scale, scale.rotation);
	return t.transform(shapeSpaceVertex);
}
//Input of surface hint is optional
//Block-cooperatively collects (into sharedIndices) up to TargetCount indices of tetrahedra
//whose conservative bounding sphere, measured in mesh 1's SDF space, comes within
//'tolerance' of the SDF surface. 'startIndex' is updated so a follow-up call resumes where
//this one stopped filling the buffer (or is set to nbTets once all tets were visited).
//Returns the number of indices written to sharedIndices.
template<PxU32 NbWarps, const PxU32 TargetCount>
__device__ PxU32 findInterestingTets(PxU32 nbTets, const uint4* tetIndices, const float4* tetVerts,
	const PxMeshScale& scale1, const PxReal tolerance, const SparseSDFTexture& sdfMesh1,
	const PxTransform& aToB, PxU32& startIndex, PxU32* sharedIndices, const PxU8* surfaceHint = NULL)
{
	__shared__ PxU32 offset; //Resume point; written inside addIndexToBuffer when the buffer fills up
	if (threadIdx.x == 0)
		offset = startIndex;
	PxU32 sharedIndicesCount = 0;
	for (PxU32 i = startIndex; (i < nbTets); )
	{
		PxU32 ind = i + threadIdx.x; //One candidate tetrahedron per thread and iteration
		i += blockDim.x;
		bool addToBuffer = false;
		if (ind < nbTets)
		{
			//If a surface hint is provided, only surface tetrahedra are considered
			if (!surfaceHint || surfaceHint[ind])
			{
				const uint4 tet = tetIndices[ind];
				const PxVec3 a = aToB.transform(PxLoad3(tetVerts[tet.x]));
				const PxVec3 b = aToB.transform(PxLoad3(tetVerts[tet.y]));
				const PxVec3 c = aToB.transform(PxLoad3(tetVerts[tet.z]));
				const PxVec3 d = aToB.transform(PxLoad3(tetVerts[tet.w]));
				PxVec3 centroid = (a + b + c + d) * 0.25f;
				PxReal distInSDFSpace1 = PxSdfDistance(sdfMesh1, shape2Vertex(centroid, scale1.scale, scale1.rotation));
				//PxReal boundingSphereRadius = PxSqrt(PxMax((v0 - centroid).magnitudeSquared(), (v1 - centroid).magnitudeSquared()));
				//Radius of a sphere around the centroid that contains all four corners
				PxReal boundingSphereRadius = PxSqrt(PxMax(PxMax((a - centroid).magnitudeSquared(), (b - centroid).magnitudeSquared()),
					PxMax((c - centroid).magnitudeSquared(), (d - centroid).magnitudeSquared())));
				//Divide by the smallest scale component to get a conservative radius in SDF space
				boundingSphereRadius /= PxMin(scale1.scale.x, PxMin(scale1.scale.y, scale1.scale.z));
				addToBuffer = distInSDFSpace1 < (boundingSphereRadius + tolerance);
			}
		}
		__syncthreads(); //addIndexToBuffer performs a block-wide scan; all threads must participate
		sharedIndicesCount = addIndexToBuffer<PxU32, NbWarps, TargetCount>(addToBuffer, ind, sharedIndicesCount, sharedIndices, offset);
	}
	__syncthreads();
	//If the buffer filled up, resume from there next time; otherwise all tets were processed
	if (startIndex != offset)
		startIndex = offset;
	else
		startIndex = nbTets;
	return sharedIndicesCount;
}
//Block-cooperatively collects (into sharedIndices) up to TargetCount
//(primitiveIndex, encodedSubdivisionId) pairs of mesh 0 primitives whose conservative
//bounds, measured in mesh 1's SDF space, come within 'tolerance' of the SDF.
//With useVertices == true the primitives are vertices, otherwise (sub-)triangles.
//Entries queued in sharedSubdivisionIndices are consumed first; each expands into its
//four child sub-triangles. 'startIndex' is updated so a follow-up call resumes where
//this one stopped. Returns the number of pairs written to sharedIndices.
template<PxU32 NbWarps, const PxU32 TargetCount, bool useVertices = false>
__device__ PxU32 findInterestingTrianglesA(PxU32 mesh0NbPrimitives, const uint4* mesh0Ind, const float4* mesh0Verts,
	const PxMeshScale& scale0, const PxMeshScale& scale1, const PxReal tolerance, const SparseSDFTexture& sdfMesh1,
	const PxTransform& aToB, PxU32& startIndex, uint2* sharedIndices, PxU32 nbSubdivisionIndices, const uint2* sharedSubdivisionIndices)
{
	__shared__ PxU32 offset; //Resume point; written inside addIndexToBuffer when the buffer fills up
	if (threadIdx.x == 0)
		offset = startIndex;
	const PxU32 nbPrimitives = mesh0NbPrimitives; //Can be the number of triangles or the number of vertices depending on the template boolean useVertices
	PxU32 sharedIndicesCount = 0;
	for (PxU32 i = startIndex; (i < nbPrimitives || nbSubdivisionIndices>0) && sharedIndicesCount < TargetCount; )
	{
		PxU32 ind;
		PxU32 subdivisionLevel = 0;
		//The first 4 * nbSubdivisionIndices threads each take one of the four children of a
		//queued subdivision entry; the remaining threads consume fresh primitives
		if (threadIdx.x < 4 * nbSubdivisionIndices)
		{
			uint2 v = sharedSubdivisionIndices[threadIdx.x >> 2];
			ind = v.x;
			subdivisionLevel = v.y;
			subdivisionLevel = Gu::elevateSubdivisionId(subdivisionLevel, threadIdx.x & 3);
		}
		else
		{
			ind = i + threadIdx.x - PxMin(TargetCount, 4 * nbSubdivisionIndices);
		}
		i += blockDim.x - PxMin(TargetCount, 4 * nbSubdivisionIndices);
		nbSubdivisionIndices = 0; //It is guaranteed that all subdivision triangles get consumed in the first run because of the limited size of sharedSubdivisionIndices
		bool addToBuffer = false;
		if (ind < nbPrimitives)
		{
			if (useVertices)
			{
				//Vertex test: keep the vertex if it is within 'tolerance' of the SDF surface
				PxVec3 p = getVertex(mesh0Verts, scale0, aToB, ind);
				PxReal distInSDFSpace1 = PxSdfDistance(sdfMesh1, shape2Vertex(p, scale1.scale, scale1.rotation));
				addToBuffer = distInSDFSpace1 < tolerance;
			}
			else
			{
				//Triangle test: conservative bounding sphere around the centroid vs. SDF distance
				PxVec3 v0, v1, v2;
				getTriangleVertices(mesh0Ind, mesh0Verts, scale0, aToB, ind, subdivisionLevel, v0, v1, v2);
				const PxReal third = 1.f / 3.f;
				PxVec3 centroid = (v0 + v1 + v2) * third;
				PxReal distInSDFSpace1 = PxSdfDistance(sdfMesh1, shape2Vertex(centroid, scale1.scale, scale1.rotation));
				PxReal boundingSphereRadius = PxSqrt(PxMax((v0 - centroid).magnitudeSquared(), PxMax((v1 - centroid).magnitudeSquared(),
					(v2 - centroid).magnitudeSquared())));
				//Divide by the smallest scale component to get a conservative radius in SDF space
				boundingSphereRadius /= PxMin(scale1.scale.x, PxMin(scale1.scale.y, scale1.scale.z));
				addToBuffer = distInSDFSpace1 < (boundingSphereRadius + tolerance);
			}
		}
		__syncthreads(); //addIndexToBuffer performs a block-wide scan; all threads must participate
		sharedIndicesCount = addIndexToBuffer<uint2, NbWarps, TargetCount>(addToBuffer, make_uint2(ind, subdivisionLevel), sharedIndicesCount, sharedIndices, offset);
	}
	__syncthreads();
	//If the buffer filled up, resume from there next time; otherwise all primitives were processed
	if (startIndex != offset)
		startIndex = offset;
	else
		startIndex = nbPrimitives;
	return sharedIndicesCount;
}
//Block-cooperatively collects (into sharedIndices) up to TargetCount indices of vertices
//whose distance to the SDF surface of mesh 1 (measured in SDF space) is below 'tolerance'.
//'startIndex' is updated so a follow-up call resumes where this one stopped filling the
//buffer (or is set to nbVertices once all vertices were visited).
//Returns the number of indices written to sharedIndices.
template<PxU32 NbWarps, const PxU32 TargetCount>
__device__ PxU32 findInterestingVertices(PxU32 nbVertices, const float4* positions,
	const PxMeshScale& scale1, const PxReal tolerance, const SparseSDFTexture& sdfMesh1,
	const PxTransform& aToB, PxU32& startIndex, PxU32* sharedIndices)
{
	__shared__ PxU32 offset; //Resume point; written inside addIndexToBuffer when the buffer fills up
	if (threadIdx.x == 0)
		offset = startIndex;
	PxU32 sharedIndicesCount = 0;
	for (PxU32 i = startIndex; (i < nbVertices); )
	{
		PxU32 ind = i + threadIdx.x; //One candidate vertex per thread and iteration
		i += blockDim.x;
		bool addToBuffer = false;
		if (ind < nbVertices)
		{
			PxVec3 v = aToB.transform(PxLoad3(positions[ind]));
			PxReal distInSDFSpace1 = PxSdfDistance(sdfMesh1, shape2Vertex(v, scale1.scale, scale1.rotation));
			addToBuffer = distInSDFSpace1 < tolerance;
		}
		__syncthreads(); //addIndexToBuffer performs a block-wide scan; all threads must participate
		sharedIndicesCount = addIndexToBuffer<PxU32, NbWarps, TargetCount>(addToBuffer, ind, sharedIndicesCount, sharedIndices, offset);
	}
	__syncthreads();
	//If the buffer filled up, resume from there next time; otherwise all vertices were processed
	if (startIndex != offset)
		startIndex = offset;
	else
		startIndex = nbVertices;
	return sharedIndicesCount;
}
//Returns the squared length of the longest edge of triangle (v0, v1, v2).
//Since the longest edge is the diameter of a triangle, this is a conservative squared
//bounding-sphere radius for a sphere centered at any point inside the triangle.
//Note: the previous version also computed the (unused) centroid; that dead code is removed.
PX_FORCE_INLINE __device__ PxReal triangleRadiusSquared(const PxVec3& v0, const PxVec3& v1, const PxVec3& v2)
{
	return PxMax(PxMax((v1 - v0).magnitudeSquared(), (v2 - v1).magnitudeSquared()), (v0 - v2).magnitudeSquared());
}
//Returns the squared length of the diagonal of the SDF's bounding box.
//The texture is only read, so it is now taken by const reference (const-correctness fix;
//backward compatible for all callers).
PX_FORCE_INLINE __device__ PxReal sdfRadiusSquared(const SparseSDFTexture& sdfTexture)
{
	return (sdfTexture.sdfBoxHigher - sdfTexture.sdfBoxLower).magnitudeSquared();
}
//TargetCount is actually the maximal size of sharedSubdivisionIndices. The naming could be improved...
//Block-cooperatively appends (triangleIndex, encodedSubdivisionId) pairs to a shared memory
//refinement buffer; entries whose decoded subdivision level already reached
//maxRefinementLevel are dropped. Pairs that do not fit are dropped as well.
//Returns the new number of elements in the buffer (clamped to TargetCount).
template<PxU32 NbWarps, const PxU32 TargetCount>
__device__ PxU32 addToRefinementBuffer(bool addToBuffer, PxU32 ind, PxU32 subInd, PxU32 nbElementsInBuffer, uint2* sharedMemoryBuffer, PxU32 maxRefinementLevel)
{
	PxU32 subdivisionLevel, subTriangleIndex;
	Gu::decodeSubdivisionId(subInd, subdivisionLevel, subTriangleIndex);
	addToBuffer = addToBuffer && subdivisionLevel < maxRefinementLevel; //Make sure an upper limit of refinement is never exceeded
	PxU32 total;
	//Exclusive scan over the whole block yields each emitting thread's write slot
	PxU32 exclusiveSum = threadBlockScanExclusive<NbWarps>(addToBuffer, total);
	if (addToBuffer)
	{
		PxU32 writeOffset = nbElementsInBuffer + exclusiveSum;
		if (writeOffset < TargetCount)
		{
			sharedMemoryBuffer[writeOffset] = make_uint2(ind, subInd);
		}
	}
	return min(total + nbElementsInBuffer, TargetCount);
}
}
#endif

View File

@@ -0,0 +1,418 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "PxsTransformCache.h"
#include "PxsContactManagerState.h"
#include "foundation/PxBounds3.h"
#include "PxgContactManager.h"
#include "PxgSoftBody.h"
#include "geometry/PxGeometry.h"
#include "PxsMaterialCore.h"
#include "cudaNpCommon.h"
#include "PxgCommonDefines.h"
#include "heightfieldUtil.cuh"
#include "utils.cuh"
#include "sphereTriangle.cuh"
#include "deformableCollision.cuh"
#include "assert.h"
#include "reduction.cuh"
#include <stdio.h>
#include "PxgSoftBodyCoreKernelIndices.h"
#include "PxgNpKernelIndices.h"
using namespace physx;
//Empty host-side entry point. NOTE(review): presumably referenced from host code so the
//linker keeps this translation unit (and the kernels it contains); appears to mirror the
//other initNarrowphaseKernelsN() functions - confirm against the module registration code.
extern "C" __host__ void initNarrowphaseKernels16() {}
//Warp-shared scratch data for the softbody-vs-heightfield midphase; filled by one thread
//and read by the whole warp (see sb_heightfieldMidphaseGeneratePairsLaunch).
struct sbHeightfieldScratch
{
	const float4 * PX_RESTRICT tetmeshVerts; //Softbody vertices (xyz position; w presumably inverse mass - sourced from mPosition_InvMass)
	const uint4 * PX_RESTRICT tetmeshTetIndices; //Four vertex indices per tetrahedron
	const PxU8* PX_RESTRICT tetmeshTetSurfaceHint; //Nonzero per tetrahedron if it lies on the surface
	PxU32 numTets;
	PxTransform heightfieldTransform; //World transform of the heightfield shape
	PxgShape heightfieldShape;
}; PX_COMPILE_TIME_ASSERT(sizeof(sbHeightfieldScratch) <= 1024); //Must fit the shared memory area reserved by the kernel
//__device__ static inline PxU32 sbHeightfieldComputePairs(
// const PxU32 minColumn,
// const PxU32 maxColumn,
// const PxU32 minRow,
// const PxU32 maxRow,
// const PxU32 nbRows,
// const PxU32 nbCols,
// PxHeightFieldSample* samples,
// const PxU32 miny,
// const PxU32 maxy
//)
//{
//
// PxU32 nbPairs = 0;
//
// const PxU32 columnSpan = maxColumn - minColumn;
//
// //we have two materials corresponding to one vertexIndex, so each thread will deal with one of the materials
// const PxU32 totalNumProcessed = (maxRow - minRow) * columnSpan * 2;
//
// for (PxU32 i = 0; i < totalNumProcessed; ++i)
// {
//
// const PxU32 index = i / 2;
// const PxU32 vertexIndex = (minRow + index / columnSpan) * nbCols + (minColumn + index % columnSpan);
// assert(isValidVertex(vertexIndex, nbRows, nbCols));
// PxReal h0 = getHeight(vertexIndex, samples);
// PxReal h1 = getHeight(vertexIndex + 1, samples);
// PxReal h2 = getHeight(vertexIndex + nbCols, samples);
// PxReal h3 = getHeight(vertexIndex + nbCols + 1, samples);
// const bool con0 = maxy < h0 && maxy < h1 && maxy < h2 && maxy < h3;
// const bool con1 = miny > h0 && miny > h1 && miny > h2 && miny > h3;
//
// if (!(con0 || con1))
// {
// const PxHeightFieldSample& sample = getSample(vertexIndex, samples);
//
// const bool isMaterial1 = (i & 1) ? 1 : 0;
// PxU32 material = isMaterial1 ? sample.materialIndex1 : sample.materialIndex0;
// if (material != PxHeightFieldMaterial::eHOLE)
// nbPairs++;
// }
// }//end of totalNumProcessed
//
// return nbPairs;
//}
//__device__ static inline void sbHeightfieldOutputPairs(
// const PxU32 minColumn,
// const PxU32 maxColumn,
// const PxU32 minRow,
// const PxU32 maxRow,
// const PxU32 nbRows,
// const PxU32 nbCols,
// PxHeightFieldSample* samples,
// const PxU32 miny,
// const PxU32 maxy,
//
// const PxU32 cmInd,
// const PxU32 tetrahedronInd,
// const PxU32 startOffset,
//
// uint4* stackPtr //output
//
//)
//{
// PxU32 pairCount = 0;
// const PxU32 columnSpan = maxColumn - minColumn;
//
// //we have two materials corresponding to one vertexIndex, so each thread will deal with one of the materials
// const PxU32 totalNumProcessed = (maxRow - minRow) * columnSpan * 2;
//
// for (PxU32 i = 0; i < totalNumProcessed; ++i)
// {
//
// PxU32 triangleIdx = 0xFFffFFff;
//
// const PxU32 index = i / 2;
// const PxU32 vertexIndex = (minRow + index / columnSpan) * nbCols + (minColumn + index % columnSpan);
// assert(isValidVertex(vertexIndex, nbRows, nbCols));
// PxReal h0 = getHeight(vertexIndex, samples);
// PxReal h1 = getHeight(vertexIndex + 1, samples);
// PxReal h2 = getHeight(vertexIndex + nbCols, samples);
// PxReal h3 = getHeight(vertexIndex + nbCols + 1, samples);
// const bool con0 = maxy < h0 && maxy < h1 && maxy < h2 && maxy < h3;
// const bool con1 = miny > h0 && miny > h1 && miny > h2 && miny > h3;
//
// if (!(con0 || con1))
// {
// const PxHeightFieldSample& sample = getSample(vertexIndex, samples);
//
// const bool isMaterial1 = (i & 1) ? 1 : 0;
// PxU32 material = isMaterial1 ? sample.materialIndex1 : sample.materialIndex0;
// if (material != PxHeightFieldMaterial::eHOLE)
// {
// triangleIdx = isMaterial1 ? ((vertexIndex << 1) + 1) : (vertexIndex << 1);
//
// stackPtr[startOffset + pairCount] = make_uint4(cmInd, tetrahedronInd, triangleIdx, 0);
//
// pairCount++;
// }
// }
// }//end of totalNumProcessed
//}
//Generates midphase pairs between one tetrahedron (per thread) and the triangles of a
//heightfield. Per-thread pair counts are combined via an exclusive warp scan, one
//atomicAdd per warp reserves space in the global pair stack, and heightfieldOutputPairs
//writes the (cmIdx, tetrahedronIdx, triangleIdx) entries.
//scratch: warp-shared heightfield + tet mesh data
//tetBound: world-space tetrahedron bounds (already fattened by the contact distance)
//hint: nonzero if the tetrahedron is a surface tet (only those generate pairs)
//stackPtr / midphasePairsNum: global outputs
__device__ static inline void sbHeightfieldMidphaseCore(
	sbHeightfieldScratch* scratch,
	const PxU32 cmIdx,
	const PxBounds3 tetBound,
	const PxU32 tetrahedronIdx,
	const PxU32 workIndex,
	const PxU8 hint,
	const PxU32 numTets,
	const PxU32 stackSize,
	uint4* PX_RESTRICT stackPtr, //output
	PxU32* PX_RESTRICT midphasePairsNum //output
)
{
	PxU32 nbPairs = 0;
	PxBounds3 localBound;
	PxU32 nbRows = 0;
	PxU32 nbCols = 0;
	PxHeightFieldSample* samples = NULL;
	PxU32 minRow = 0;
	PxU32 maxRow = 0;
	PxU32 minColumn = 0;
	PxU32 maxColumn = 0;
	if (hint && tetrahedronIdx < numTets)
	{
		PxTransform heightfieldTransform = scratch->heightfieldTransform;
		PxgShape heightfieldShape = scratch->heightfieldShape;
		const PxReal oneOverHeightScale = 1.f / heightfieldShape.scale.scale.y;
		const PxReal oneOverRowScale = 1.f / PxAbs(heightfieldShape.scale.scale.x);
		const PxReal oneOverColScale = 1.f / PxAbs(heightfieldShape.scale.scale.z);
		//bound is in world space, we need to transform the bound to the local space of height field
		localBound = PxBounds3::transformFast(heightfieldTransform.getInverse(), tetBound);
		//Undo the heightfield scale so the bound is expressed in sample (row/column/height) units
		localBound.minimum.x *= oneOverRowScale;
		localBound.minimum.y *= oneOverHeightScale;
		localBound.minimum.z *= oneOverColScale;
		localBound.maximum.x *= oneOverRowScale;
		localBound.maximum.y *= oneOverHeightScale;
		localBound.maximum.z *= oneOverColScale;
		//A negative row scale mirrors the heightfield, so min and max swap
		if (heightfieldShape.scale.scale.x < 0.f)
		{
			const PxReal temp = localBound.minimum.x;
			localBound.minimum.x = localBound.maximum.x;
			localBound.maximum.x = temp;
		}
		//Same for a negative column scale
		if (heightfieldShape.scale.scale.z < 0.f)
		{
			PxReal swap = localBound.minimum.z;
			localBound.minimum.z = localBound.maximum.z;
			localBound.maximum.z = swap;
		}
		PxU32* heightfieldData = reinterpret_cast<PxU32*>(heightfieldShape.hullOrMeshPtr);
		nbRows = heightfieldData[0];
		nbCols = heightfieldData[1];
		samples = reinterpret_cast<PxHeightFieldSample*>(&heightfieldData[2]);
		//Skip tetrahedra whose bounds lie entirely outside the sample grid.
		//NOTE(review): the original condition only negated the first test
		//(!(A) || B || C || D), so fully out-of-range bounds could still enter this branch
		//(harmless but wasted work, since the clamped row/column span ends up empty);
		//the negation now covers the whole disjunction.
		if (!((localBound.minimum.x > nbRows - 1) || (localBound.minimum.z > nbCols - 1)
			|| (localBound.maximum.x < 0) || (localBound.maximum.z < 0)))
		{
			minRow = getMinRow(localBound.minimum.x, nbRows);
			maxRow = getMaxRow(localBound.maximum.x, nbRows);
			minColumn = getMinColumn(localBound.minimum.z, nbCols);
			maxColumn = getMaxColumn(localBound.maximum.z, nbCols);
			if ((2 * (maxColumn - minColumn) * (maxRow - minRow)) > 0)
			{
				nbPairs = heightfieldComputePairs(minColumn, maxColumn, minRow, maxRow, nbRows, nbCols,
					samples, localBound.minimum.y, localBound.maximum.y);
			}
		}
	}
	//Exclusive warp scan converts per-thread pair counts into per-thread write offsets
	PxU32 pairOffset = warpScan<AddOpPxU32, PxU32>(FULL_MASK, nbPairs) - nbPairs;
	PxU32 validCount = 0;
	if (threadIdx.x == (WARP_SIZE - 1))
	{
		validCount = pairOffset + nbPairs; //Last lane holds the warp's total pair count
	}
	validCount = __shfl_sync(FULL_MASK, validCount, WARP_SIZE - 1);
	PxU32 startIndex = 0xffffffff;
	if (threadIdx.x == 0 && validCount > 0)
	{
		//One atomic per warp reserves contiguous space in the global pair stack
		startIndex = atomicAdd(midphasePairsNum, validCount);
	}
	startIndex = __shfl_sync(FULL_MASK, startIndex, 0);
	if (startIndex != 0xffffffff)
	{
		//Threads with nbPairs == 0 write nothing (their row/column span is empty)
		heightfieldOutputPairs(minColumn, maxColumn, minRow, maxRow, nbRows, nbCols,
			samples, localBound.minimum.y, localBound.maximum.y, cmIdx, tetrahedronIdx, startIndex + pairOffset, stackSize, stackPtr);
	}
}
//Midphase kernel for softbody (tetrahedron mesh) vs. heightfield pairs.
//Each contact manager is processed by nbWarpsPerCm warps; every thread tests one
//tetrahedron per inner iteration and sbHeightfieldMidphaseCore emits the
//(cmIdx, tetIdx, triangleIdx) pairs into the global stack.
extern "C" __global__ void sb_heightfieldMidphaseGeneratePairsLaunch(
	const PxU32 numNPWorkItems,
	const PxReal toleranceLength,
	const PxgContactManagerInput* PX_RESTRICT cmInputs,
	const PxsCachedTransform* PX_RESTRICT transformCache,
	const PxReal* PX_RESTRICT contactDistance,
	const PxgShape* PX_RESTRICT gpuShapes,
	PxgSoftBody* PX_RESTRICT softbodies,
	const PxU32 stackSizeBytes,
	PxU8* stackPtr, //output
	PxU32* midphasePairsNum //output
)
{
	//NOTE(review): all warps of a block share this single scratch instance; warps of the
	//same block working on different contact managers would overwrite each other's data -
	//verify the per-warp partitioning / MIDPHASE_WARPS_PER_BLOCK assumptions here.
	__shared__ PxU32 scratchMem[1024];
	sbHeightfieldScratch* s_warpScratch = reinterpret_cast<sbHeightfieldScratch*>(scratchMem);
	const PxU32 nbWarpsPerCm = 64; // 64 warp(2048 threads) to deal with one pair
	const PxU32 nbWarpsRequired = nbWarpsPerCm * numNPWorkItems;
	const PxU32 nbThreadsPerCm = 64 * WARP_SIZE;
	const PxU32 numWarpPerBlock = MIDPHASE_WARPS_PER_BLOCK;
	const PxU32 totalNumWarps = PxgSoftBodyKernelGridDim::SB_SBMIDPHASE * numWarpPerBlock;
	const PxU32 warpIndex = threadIdx.y;
	//Work is distributed evenly: each launched warp runs nbIterationsPerWarps virtual warps
	const PxU32 nbIterationsPerWarps = (nbWarpsRequired + totalNumWarps - 1) / totalNumWarps;
	/*if (warpIndex == 0 && threadIdx.x == 0 && blockIdx.x == 0)
	{
		printf("nbIterationsPerWarps %i nbWarpsRequired %i\n", nbIterationsPerWarps, nbWarpsRequired);
	}*/
	for (PxU32 i = 0; i < nbIterationsPerWarps; ++i)
	{
		const PxU32 workIndex = i + (warpIndex + numWarpPerBlock * blockIdx.x) * nbIterationsPerWarps;
		if (workIndex < nbWarpsRequired)
		{
			unsigned int cmIdx = workIndex / nbWarpsPerCm; //Contact manager handled by this virtual warp
			PxgShape softbodyShape, heightfieldShape;
			PxU32 softbodyCacheRef, heightfieldCacheRef;
			LoadShapePairWarp<PxGeometryType::eTETRAHEDRONMESH, PxGeometryType::eHEIGHTFIELD>(cmInputs, cmIdx, gpuShapes,
				softbodyShape, softbodyCacheRef, heightfieldShape, heightfieldCacheRef);
			PxsCachedTransform heightfieldTransformCache;
			PxsCachedTransform_ReadWarp(heightfieldTransformCache, transformCache + heightfieldCacheRef);
			const PxgSoftBody& softbody = softbodies[softbodyShape.particleOrSoftbodyId];
			PxU8* meshGeomPtr;
			meshGeomPtr = reinterpret_cast<PxU8 *>(softbody.mTetMeshData);
			const PxU32 idx = threadIdx.x;
			//One thread per warp fills the shared scratch from the softbody's mesh data
			if (idx == 0)
			{
				s_warpScratch->tetmeshVerts = softbody.mPosition_InvMass;
				s_warpScratch->tetmeshTetIndices = softbody.mTetIndices;
				s_warpScratch->tetmeshTetSurfaceHint = softbody.mTetMeshSurfaceHint;
				//The mesh blob starts with a uint4 header: (nbVerts, nbTets, maxDepth, nbBv32TreeNodes)
				const uint4 nbVerts_nbPrimitives_maxDepth_nbBv32TreeNodes = *reinterpret_cast<const uint4 *>(meshGeomPtr);
				s_warpScratch->numTets = nbVerts_nbPrimitives_maxDepth_nbBv32TreeNodes.y;
				//Skip over the BV32 tree data that follows the header
				meshGeomPtr += sizeof(uint4) + sizeof(const Gu::BV32DataPacked)* nbVerts_nbPrimitives_maxDepth_nbBv32TreeNodes.w
					+sizeof(const Gu::BV32DataDepthInfo) * nbVerts_nbPrimitives_maxDepth_nbBv32TreeNodes.z
					+ sizeof(PxU32) * nbVerts_nbPrimitives_maxDepth_nbBv32TreeNodes.w;
				/*const PxU8* surfaceHint = reinterpret_cast<const PxU8*>(meshGeomPtr);
				s_warpScratch->tetmeshTetSurfaceHint = surfaceHint;*/
				s_warpScratch->heightfieldTransform = heightfieldTransformCache.transform;
				s_warpScratch->heightfieldShape = heightfieldShape;
			}
			__syncwarp(); //Scratch must be visible to the whole warp before it is read
			const PxReal contactDist = contactDistance[softbodyCacheRef] + contactDistance[heightfieldCacheRef];
			const PxU32 nbTets = s_warpScratch->numTets;
			const PxU32 nbIter = (nbTets + nbThreadsPerCm - 1) / nbThreadsPerCm;
			const uint4* tetIndices = s_warpScratch->tetmeshTetIndices;
			const PxU8* surfaceHint = s_warpScratch->tetmeshTetSurfaceHint;
			for (PxU32 j = 0; j < nbIter; ++j)
			{
				//Each of the nbThreadsPerCm threads of this contact manager owns one tetrahedron per iteration
				const PxU32 wrappedThreadIndex = (workIndex * WARP_SIZE + threadIdx.x) % nbThreadsPerCm;
				const PxU32 tetrahedronIdx = wrappedThreadIndex + nbThreadsPerCm * j;
				PxBounds3 tetBound = PxBounds3::empty();
				PxU8 hint = 1;
				if (tetrahedronIdx < nbTets)
				{
					//avoid generate contacts if the tetrahedron isn't a surface tetrahedron
					hint = surfaceHint[tetrahedronIdx];
					if (hint)
					{
						const uint4 tetIdx = tetIndices[tetrahedronIdx];
						const PxVec3 worldV0 = PxLoad3(s_warpScratch->tetmeshVerts[tetIdx.x]);
						const PxVec3 worldV1 = PxLoad3(s_warpScratch->tetmeshVerts[tetIdx.y]);
						const PxVec3 worldV2 = PxLoad3(s_warpScratch->tetmeshVerts[tetIdx.z]);
						const PxVec3 worldV3 = PxLoad3(s_warpScratch->tetmeshVerts[tetIdx.w]);
						tetBound = tetBoundingBox(worldV0, worldV1, worldV2, worldV3);
						tetBound.fattenFast(contactDist); //Inflate by the contact distance
					}
				}
				uint4* tStackPtr = reinterpret_cast<uint4*>(stackPtr);
				const PxU32 stackSize = stackSizeBytes / sizeof(uint4);
				sbHeightfieldMidphaseCore(
					s_warpScratch,
					cmIdx,
					tetBound,
					tetrahedronIdx,
					workIndex,
					hint,
					nbTets,
					stackSize,
					tStackPtr,
					midphasePairsNum
				);
			}
		}
	}
}

View File

@@ -0,0 +1,526 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "foundation/PxBasicTemplates.h"
#include "foundation/PxBounds3.h"
#include "foundation/PxMath.h"
#include "foundation/PxSimpleTypes.h"
#include "foundation/PxVec3.h"
#include "geometry/PxGeometry.h"
#include "GuBV32.h"
#include "cudaNpCommon.h"
#include "PxgContactManager.h"
#include "PxgConvexConvexShape.h"
#include "PxgFEMCloth.h"
#include "PxgNpKernelIndices.h"
#include "PxgParticleSystem.h"
#include "PxgSoftBody.h"
#include "PxsTransformCache.h"
#include <vector_types.h>
#include "copy.cuh"
#include "dataReadWriteHelper.cuh"
#include "femMidphaseScratch.cuh"
#include "utils.cuh"
#include "sbMidphaseScratch.cuh"
#include "deformableCollision.cuh"
#include "bv32Traversal.cuh"
using namespace physx;
extern "C" __host__ void initNarrowphaseKernels13() {}
//Callback object for bv32TreeTraversal: collects all surface tetrahedra of a softbody
//whose bounding box overlaps mBox and pushes (cmIdx, tetIndex) pairs onto a global stack.
struct SoftbodyTreeTraverser
{
	sbMidphaseScratch* s_warpScratch; //Warp-shared tet mesh data and traversal stack
	const PxBounds3& mBox; //Query AABB (rigid shape bounds, already fattened by the contact distance)
	const PxU32 stackSize;
	uint4* PX_RESTRICT stackPtr; //Global output pair stack
	PxU32* PX_RESTRICT midphasePairsNum; //Global pair counter (atomically advanced in pushOntoStackFullWarp)
	unsigned int cmIdx; //Contact manager index written into every emitted pair
	PX_FORCE_INLINE __device__ SoftbodyTreeTraverser(
		sbMidphaseScratch* s_warpScratch,
		PxBounds3& aabb,
		const PxU32 stackSize,
		uint4* PX_RESTRICT stackPtr,
		PxU32* PX_RESTRICT midphasePairsNum,
		unsigned int cmIdx)
		: s_warpScratch(s_warpScratch), mBox(aabb), stackSize(stackSize), stackPtr(stackPtr), midphasePairsNum(midphasePairsNum), cmIdx(cmIdx)
	{ }
	//Leaf test, executed by the full warp; primitiveIndex is 0xFFFFFFFF for lanes without work.
	//Only surface tetrahedra (per the surface hint) with a valid (non-degenerate) shape are tested.
	PX_FORCE_INLINE __device__ void intersectPrimitiveFullWarp(PxU32 primitiveIndex, PxU32 idxInWarp) const
	{
		bool intersect = false;
		if (primitiveIndex != 0xFFFFFFFF)
		{
			const PxU8 hint = s_warpScratch->tetmeshSurfaceHint[primitiveIndex];
			if (hint)
			{
				uint4 tetIdx = s_warpScratch->tetmeshTetIndices[primitiveIndex];
				const PxVec3 worldV0 = PxLoad3(s_warpScratch->tetmeshVerts[tetIdx.x]);
				const PxVec3 worldV1 = PxLoad3(s_warpScratch->tetmeshVerts[tetIdx.y]);
				const PxVec3 worldV2 = PxLoad3(s_warpScratch->tetmeshVerts[tetIdx.z]);
				const PxVec3 worldV3 = PxLoad3(s_warpScratch->tetmeshVerts[tetIdx.w]);
				if (isValidTet(worldV0, worldV1, worldV2, worldV3))
				{
					intersect = tetBoundingBox(worldV0, worldV1, worldV2, worldV3).intersects(mBox);
				}
			}
		}
		//All lanes participate; intersecting lanes append (cmIdx, tetIndex) to the global stack
		pushOntoStackFullWarp(intersect, midphasePairsNum, stackPtr, stackSize, make_uint4(cmIdx, primitiveIndex, 0, 0));
	}
	//Internal node test: descend into children whose AABB overlaps the query box.
	PX_FORCE_INLINE __device__ bool intersectBoxFullWarp(bool hasBox, const PxVec3& min, const PxVec3& max) const
	{
		if (hasBox)
			return mBox.intersects(PxBounds3(min, max));
		else
			return false;
	}
};
//Midphase for softbody vs. rigid shape: fattens the rigid shape's world bounds by the
//contact distance and traverses the softbody's BV32 tree with SoftbodyTreeTraverser,
//pushing (cmIdx, tetIndex) pairs onto the global stack. One warp handles one contact
//manager. toleranceLength is currently unused here.
template<unsigned int WarpsPerBlock>
__device__ static inline void sbMidphaseCore(
	PxU32 numNPWorkItems,
	const PxReal toleranceLength,
	const PxgContactManagerInput* PX_RESTRICT cmInputs,
	const PxsCachedTransform* PX_RESTRICT transformCache,
	const PxBounds3* PX_RESTRICT bounds,
	const PxReal* PX_RESTRICT contactDistance,
	const PxgShape* PX_RESTRICT gpuShapes,
	const PxgSoftBody* PX_RESTRICT softbodies,
	const PxU32 stackSizeBytes,
	PxU8* PX_RESTRICT stackPtr, //output
	PxU32* PX_RESTRICT midphasePairsNum, //output
	sbMidphaseScratch* s_warpScratch //Warp-local scratch: tet mesh pointers + BV32 traversal stack
)
{
	//warp index
	const unsigned int warpIdx = threadIdx.y;
	unsigned int cmIdx = warpIdx + blockIdx.x * blockDim.y; //One contact manager per warp
	//unsigned mask_cmIdx = __ballot_sync(FULL_MASK, cmIdx < numNPWorkItems);
	if (cmIdx < numNPWorkItems)
	{
		PxgShape softbodyShape, rigidShape;
		PxU32 softbodyCacheRef, rigidCacheRef;
		LoadShapePairWarp<PxGeometryType::eTETRAHEDRONMESH>(cmInputs, cmIdx, gpuShapes,
			softbodyShape, softbodyCacheRef, rigidShape, rigidCacheRef);
		const PxU32 softbodyId = softbodyShape.particleOrSoftbodyId;
		const PxgSoftBody& softbody = softbodies[softbodyId];
		PxU8 * tetmeshGeomPtr = reinterpret_cast<PxU8 *>(softbody.mTetMeshData);
		//The mesh blob starts with a uint4 header: (nbVerts, nbTets, maxDepth, nbBv32TreeNodes)
		const uint4 nbVerts_nbTets_maxDepth_nbBv32TreeNodes = *reinterpret_cast<const uint4 *>(tetmeshGeomPtr);
		tetmeshGeomPtr += sizeof(uint4);
		//One thread per warp fills the shared scratch from the softbody's mesh data
		if (threadIdx.x == 0)
		{
			s_warpScratch->tetmeshVerts = softbody.mPosition_InvMass;
			s_warpScratch->tetmeshTetIndices = softbody.mTetIndices;
			s_warpScratch->tetmeshSurfaceHint = softbody.mTetMeshSurfaceHint;
			//const PxU32 & numVerts = nbVerts_nbTets_maxDepth_nbBv32TreeNodes.x;
			//const PxU32 & numTets = nbVerts_nbTets_maxDepth_nbBv32TreeNodes.y;
			//const PxU32 & maxDepth = nbVerts_nbTets_maxDepth_nbBv32TreeNodes.z;
			const PxU32 & nbBv32PackedNodes = nbVerts_nbTets_maxDepth_nbBv32TreeNodes.w;
			//printf("maxDepth %i numVerts %i numTets %i nbBv32TreeNodes %i\n", maxDepth, numVerts, numTets, nbBv32TreeNodes);
			Gu::BV32DataPacked* bv32PackedNodes = reinterpret_cast<Gu::BV32DataPacked*>(tetmeshGeomPtr);
			s_warpScratch->bv32PackedNodes = bv32PackedNodes;
			tetmeshGeomPtr += sizeof(const Gu::BV32DataPacked)* nbBv32PackedNodes
				+ sizeof(const Gu::BV32DataDepthInfo) * nbVerts_nbTets_maxDepth_nbBv32TreeNodes.z
				+ sizeof(PxU32) * nbVerts_nbTets_maxDepth_nbBv32TreeNodes.w;
		}
		__syncwarp(); //Scratch must be visible to the whole warp before the traversal starts
		//ML: this is the world bound AABB
		PxBounds3 aabb;
		PxBounds3_ReadWarp(aabb, bounds + rigidCacheRef);
		//PxReal contactDist = convexShape.contactOffset + trimeshShape.contactOffset;
		const PxReal contactDist = contactDistance[softbodyCacheRef] + contactDistance[rigidCacheRef];
		aabb.fattenFast(contactDist); //Inflate the query box by the contact distance
		uint4* tStackPtr = reinterpret_cast<uint4*>(stackPtr);
		const PxU32 stackSize = stackSizeBytes / sizeof(uint4);
		bv32TreeTraversal<const SoftbodyTreeTraverser, WarpsPerBlock>(s_warpScratch->bv32PackedNodes, s_warpScratch->sBv32Nodes, SoftbodyTreeTraverser(s_warpScratch, aabb, stackSize, tStackPtr, midphasePairsNum, cmIdx));
		/*if (threadIdx.x == 0 && nbPairsPerCM > 0)
			printf("nbPairsPerCM %i\n", nbPairsPerCM);*/
	}
}
// Kernel entry point for softbody-vs-rigid midphase pair generation.
// Allocates per-warp shared scratch (WARP_SIZE * 7 PxU32 per warp, viewed as
// sbMidphaseScratch) and forwards all arguments to sbMidphaseCore.
extern "C" __global__
//__launch_bounds__(MIDPHASE_WARPS_PER_BLOCK * WARP_SIZE, 4)
void sb_midphaseGeneratePairsLaunch(
PxU32 numWorkItems,
const PxReal toleranceLength,
const PxgContactManagerInput* PX_RESTRICT cmInputs,
const PxsCachedTransform* PX_RESTRICT transformCache,
const PxBounds3* PX_RESTRICT bounds,
const PxReal* PX_RESTRICT contactDistance,
const PxgShape* PX_RESTRICT gpuShapes,
const PxgSoftBody* PX_RESTRICT softbodies,
const PxU32 stackSizeBytes,
PxU8* PX_RESTRICT stackPtr, //output
PxU32* PX_RESTRICT midphasePairsNum //output
)
{
// One scratch region per warp; row selected by the warp's threadIdx.y.
__shared__ PxU32 scratchMem[MIDPHASE_WARPS_PER_BLOCK][WARP_SIZE * 7];
sbMidphaseCore<MIDPHASE_WARPS_PER_BLOCK>(
numWorkItems,
toleranceLength,
cmInputs,
transformCache,
bounds,
contactDistance,
gpuShapes,
softbodies,
stackSizeBytes,
stackPtr,
midphasePairsNum,
(sbMidphaseScratch*)scratchMem[threadIdx.y]
);
}
//struct sbParticleMidphaseScratch
//{
// const float4 * PX_RESTRICT tetmeshVerts;
// const uint4 * PX_RESTRICT tetmeshTetIndices;
// const PxU8* PX_RESTRICT tetmeshSurfaceHint;
//
// const Gu::BV32DataDepthInfo* PX_RESTRICT bv32DepthInfo;
// const PxU32* PX_RESTRICT bv32RemapPackedNodeIndex;
// //bv32 tree
// Gu::BV32DataPacked* bv32PackedNodes;
// int nbPackedNodes;
// //stack for traversal
// int sBv32Nodes[192]; //6 depth of the bv32 tree
// PxU32 sBv32ActiveParticles[192];
//};
//PX_COMPILE_TIME_ASSERT(sizeof(sbParticleMidphaseScratch) <= WARP_SIZE * 15 * sizeof(PxU32));
// Full-warp BV32 traversal callback that tests softbody tetrahedra against a
// single query AABB (mBox) and records overlapping (cmIdx, queryIndex, tetIdx)
// triples on the shared midphase stack.
struct SoftbodyBoxTraverser
{
femMidphaseScratch* PX_RESTRICT s_warpScratch;	// per-warp mesh/tree pointers
const PxBounds3 mBox;							// query AABB in mesh space
const PxU32 queryIndex;							// e.g. particle or cloth-vertex index
const PxU32 cmIdx;								// contact-manager pair index
const PxU32 stackSize;							// output stack capacity, in uint4 entries
uint4* PX_RESTRICT stackPtr; //output
PxU32* midphasePairs; //output
PX_FORCE_INLINE __device__ SoftbodyBoxTraverser(
femMidphaseScratch* PX_RESTRICT s_warpScratch,
const PxVec3 boxMin,
const PxVec3 boxMax,
const PxU32 queryIndex,
const PxU32 cmIdx,
const PxU32 stackSize,
uint4* PX_RESTRICT stackPtr, //output
PxU32* midphasePairs) //output
: s_warpScratch(s_warpScratch), mBox(boxMin, boxMax), queryIndex(queryIndex), cmIdx(cmIdx),
stackSize(stackSize), stackPtr(stackPtr), midphasePairs(midphasePairs)
{ }
// Called by every lane of the warp; 0xFFFFFFFF marks an empty lane. Each
// valid lane box-tests its tetrahedron, then all lanes cooperate in the
// full-warp stack push (which compacts the hits).
PX_FORCE_INLINE __device__ void intersectPrimitiveFullWarp(PxU32 primitiveIndex, PxU32 idxInWarp) const
{
bool intersect = false;
if (primitiveIndex != 0xFFFFFFFF)
{
uint4 tetIdx = s_warpScratch->meshVertsIndices[primitiveIndex];
const PxVec3 worldV0 = PxLoad3(s_warpScratch->meshVerts[tetIdx.x]);
const PxVec3 worldV1 = PxLoad3(s_warpScratch->meshVerts[tetIdx.y]);
const PxVec3 worldV2 = PxLoad3(s_warpScratch->meshVerts[tetIdx.z]);
const PxVec3 worldV3 = PxLoad3(s_warpScratch->meshVerts[tetIdx.w]);
// Skip degenerate tetrahedra before the bounds test.
if (isValidTet(worldV0, worldV1, worldV2, worldV3))
{
intersect = tetBoundingBox(worldV0, worldV1, worldV2, worldV3).intersects(mBox);
}
}
pushOntoStackFullWarp(intersect, midphasePairs, stackPtr, stackSize, make_uint4(cmIdx, queryIndex, primitiveIndex, 0));
}
// Node-level rejection test: lanes without a box report no overlap.
PX_FORCE_INLINE __device__ bool intersectBoxFullWarp(bool hasBox, const PxVec3& min, const PxVec3& max) const
{
if (hasBox)
return mBox.intersects(PxBounds3(min, max));
else
return false;
}
};
// Particle-system-vs-softbody midphase kernel: blockIdx.y selects the contact
// pair, the particle system is staged into shared memory, and each warp walks
// the softbody BV32 tree once per particle it owns (grid of warps strided over
// the particle list), pushing overlapping (pair, particle, tet) candidates.
extern "C" __global__
//__launch_bounds__(MIDPHASE_WARPS_PER_BLOCK * WARP_SIZE, 4)
void sb_psMidphaseGeneratePairsLaunch(
PxU32 numWorkItems,
const PxReal toleranceLength,
const PxgContactManagerInput* PX_RESTRICT cmInputs,
const PxgShape* PX_RESTRICT gpuShapes,
const PxsCachedTransform* PX_RESTRICT transformCache,
const PxBounds3* PX_RESTRICT bounds,
const PxReal* PX_RESTRICT contactDistance,
const PxgSoftBody* PX_RESTRICT softbodies,
PxgParticleSystem* PX_RESTRICT particleSystems,
const PxU32 stackSizeBytes,
PxU8* PX_RESTRICT stackPtr, //output
PxU32* PX_RESTRICT midphasePairsNum //output
)
{
// Block-shared staging copy of the particle system (avoids repeated global reads).
__shared__ __align__(16) PxU8 particleSystemMemory[sizeof(PxgParticleSystem)];
PxgParticleSystem& sParticleSystem = *(reinterpret_cast<PxgParticleSystem*>(particleSystemMemory));
__shared__ femMidphaseScratch scratchMem[MIDPHASE_WARPS_PER_BLOCK];
femMidphaseScratch* s_warpScratch = &scratchMem[threadIdx.y];
// blockIdx.y is uniform across the block, so the __syncthreads() below is safe.
if (blockIdx.y < numWorkItems)
{
//const PxU32 globalThreadIdx = threadIdx.y * WARP_SIZE + threadIdx.x + blockIdx.x*blockDim.x*blockDim.y;
const PxU32 globalWarpIdx = threadIdx.y + blockIdx.x*blockDim.y;
PxgShape particleShape, softbodyShape;
PxU32 particleCacheRef, softbodyCacheRef;
LoadShapePairWarp<PxGeometryType::ePARTICLESYSTEM, PxGeometryType::eTETRAHEDRONMESH>(cmInputs, blockIdx.y, gpuShapes,
particleShape, particleCacheRef, softbodyShape, softbodyCacheRef);
const PxU32 particleSystemId = particleShape.particleOrSoftbodyId;
//each thread reads 16 bytes; NOTE(review): every warp performs this same copy
//into the shared staging buffer — redundant but identical data; confirm intent.
uint4* srcParticleSystem = reinterpret_cast<uint4*>(&particleSystems[particleSystemId]);
uint4* dstParticleSystem = reinterpret_cast<uint4*>(&sParticleSystem);
warpCopy<uint4>(dstParticleSystem, srcParticleSystem, sizeof(PxgParticleSystem));
const PxU32 softbodyId = softbodyShape.particleOrSoftbodyId;
const PxgSoftBody& softbody = softbodies[softbodyId];
// Serialized tet-mesh blob: uint4 header, then packed BV32 nodes.
PxU8 * tetmeshGeomPtr = reinterpret_cast<PxU8 *>(softbody.mTetMeshData);
//const uint4 nbVerts_nbTets_maxDepth_nbBv32TreeNodes = *reinterpret_cast<const uint4 *>(tetmeshGeomPtr);
tetmeshGeomPtr += sizeof(uint4);
/*if (workIndex == 0 && threadIdx.x == 0)
printf("particleSystemId %i softbodyId %i\n", particleSystemId, softbodyId);*/
// Lane 0 of each warp publishes mesh/tree pointers to its scratch.
if (threadIdx.x == 0)
{
s_warpScratch->meshVerts = softbody.mPosition_InvMass;
s_warpScratch->meshVertsIndices = softbody.mTetIndices;
Gu::BV32DataPacked* bv32PackedNodes = reinterpret_cast<Gu::BV32DataPacked*>(tetmeshGeomPtr);
s_warpScratch->bv32PackedNodes = bv32PackedNodes;
}
// Wait for the staged particle system and all scratch writes.
__syncthreads();
PxReal cDistance = contactDistance[particleCacheRef] + contactDistance[softbodyCacheRef];
//const PxU32 threadBaseAddress = globalThreadIdx & (~31);
const PxU32 nbParticles = sParticleSystem.mCommonData.mNumParticles;
//const PxU32 NbThreads = blockDim.x*blockDim.y*gridDim.x;
const PxU32 NbWarps = blockDim.y*gridDim.x;
float4* sortedPose = reinterpret_cast<float4*>(sParticleSystem.mSortedPositions_InvMass);
uint4* tStackPtr = reinterpret_cast<uint4*>(stackPtr);
const PxU32 stackSize = stackSizeBytes / sizeof(uint4);
//for (PxU32 i = threadBaseAddress, particleIndex = globalThreadIdx; i < nbParticles; i+= NbThreads, particleIndex += NbThreads)
// Warp-strided loop: each warp traverses the tree for one particle at a time.
for (PxU32 i = globalWarpIdx; i < nbParticles; i += NbWarps)
{
float4 pos = make_float4(0.f);
//PxU32 particleIndex = 0xFFFFFFFF;
//if(threadIdx.x == 0)
PxU32 particleIndex = i;
pos = sortedPose[particleIndex];
/*if (particleIndex < nbParticles)
{
pos = sortedPose[particleIndex];
}*/
const PxVec3 particlePos(pos.x, pos.y, pos.z);
// Query box = particle position inflated by the summed contact offsets.
bv32TreeTraversal<const SoftbodyBoxTraverser, MIDPHASE_WARPS_PER_BLOCK>(s_warpScratch->bv32PackedNodes, s_warpScratch->sBv32Nodes, SoftbodyBoxTraverser(s_warpScratch, particlePos - PxVec3(cDistance), particlePos + PxVec3(cDistance), particleIndex,
blockIdx.y, stackSize, tStackPtr, midphasePairsNum));
}
}
}
// Softbody-vs-FEM-cloth-vertex midphase: blockIdx.y selects the contact pair;
// each warp queries the softbody's BV32 tree with one cloth vertex at a time
// (inflated by the summed contact offsets) and pushes candidates on the stack.
template<unsigned int WarpsPerBlock>
__device__ static inline void sb_clothVertMidphaseCore(
const PxReal toleranceLength,
const PxgContactManagerInput* PX_RESTRICT cmInputs,
const PxsCachedTransform* PX_RESTRICT transformCache,
const PxBounds3* PX_RESTRICT bounds,
const PxReal* PX_RESTRICT contactDistance,
const PxgShape* PX_RESTRICT gpuShapes,
const PxgSoftBody* PX_RESTRICT softbodies,
const PxgFEMCloth* PX_RESTRICT clothes,
const PxU32 stackSizeBytes,
const PxReal nbCollisionPairUpdatesPerTimestep,
PxU8* PX_RESTRICT stackPtr, //output
PxU32* PX_RESTRICT midphasePairsNum, //output
femMidphaseScratch* s_warpScratch
)
{
const PxU32 cmIdx = blockIdx.y;
// each block deals with one pair
{
const PxU32 globalWarpIdx = threadIdx.y + blockIdx.x*blockDim.y;
PxgShape softbodyShape, clothShape;
PxU32 softbodyCacheRef, clothCacheRef;
LoadShapePairWarp<PxGeometryType::eTETRAHEDRONMESH, PxGeometryType::eTRIANGLEMESH>(cmInputs, cmIdx, gpuShapes,
softbodyShape, softbodyCacheRef, clothShape, clothCacheRef);
const PxgSoftBody& softbody = softbodies[softbodyShape.particleOrSoftbodyId];
const PxgFEMCloth& cloth = clothes[clothShape.particleOrSoftbodyId];
// Serialized tet-mesh blob: uint4 header, then packed BV32 nodes.
PxU8 * tetmeshGeomPtr = reinterpret_cast<PxU8 *>(softbody.mTetMeshData);
tetmeshGeomPtr += sizeof(uint4);
// Lane 0 of each warp publishes mesh/tree pointers to its scratch.
if (threadIdx.x == 0)
{
s_warpScratch->meshVerts = softbody.mPosition_InvMass;
s_warpScratch->meshVertsIndices = softbody.mTetIndices;
Gu::BV32DataPacked* bv32PackedNodes = reinterpret_cast<Gu::BV32DataPacked*>(tetmeshGeomPtr);
s_warpScratch->bv32PackedNodes = bv32PackedNodes;
}
__syncthreads();
// if cloth collision pair is updated more than once per time step, early-out the influence of extended
// cloth bounding box in broad phase: shrink the cloth AABB by the extra
// fattening before the pair-level overlap test below.
if (nbCollisionPairUpdatesPerTimestep > 1.f)
{
PxBounds3 clothBB, softbodyBB;
PxBounds3_ReadWarp(clothBB, bounds + clothCacheRef);
PxBounds3_ReadWarp(softbodyBB, bounds + softbodyCacheRef);
clothBB.minimum += (nbCollisionPairUpdatesPerTimestep - 1.f) * PxVec3(contactDistance[clothCacheRef]);
clothBB.maximum -= (nbCollisionPairUpdatesPerTimestep - 1.f) * PxVec3(contactDistance[clothCacheRef]);
if (!clothBB.intersects(softbodyBB)) return;
}
const PxReal cDistance = contactDistance[softbodyCacheRef] + contactDistance[clothCacheRef];
const PxU32 NbWarps = blockDim.y*gridDim.x;
const PxU32 nbVerts = cloth.mNbVerts;
const float4* positions = cloth.mPosition_InvMass;
uint4* tStackPtr = reinterpret_cast<uint4*>(stackPtr);
const PxU32 stackSize = stackSizeBytes / sizeof(uint4);
// Warp-strided loop over cloth vertices; one tree traversal per vertex.
for (PxU32 i = globalWarpIdx; i < nbVerts; i += NbWarps)
{
const float4 tPos = positions[i];
const PxVec3 pos(tPos.x, tPos.y, tPos.z);
bv32TreeTraversal<const SoftbodyBoxTraverser, WarpsPerBlock>(s_warpScratch->bv32PackedNodes, s_warpScratch->sBv32Nodes, SoftbodyBoxTraverser(s_warpScratch, pos - PxVec3(cDistance), pos + PxVec3(cDistance), i,
blockIdx.y, stackSize, tStackPtr, midphasePairsNum));
}
}
}
// Kernel entry point for softbody-vs-cloth-vertex midphase pair generation.
// Provides per-warp shared scratch (WARP_SIZE * 18 PxU32 per warp, viewed as
// femMidphaseScratch) and forwards all arguments to sb_clothVertMidphaseCore.
extern "C" __global__
//__launch_bounds__(MIDPHASE_WARPS_PER_BLOCK * WARP_SIZE, 4)
void sb_clothVertMidphaseGeneratePairsLaunch(
const PxReal toleranceLength,
const PxgContactManagerInput* PX_RESTRICT cmInputs,
const PxsCachedTransform* PX_RESTRICT transformCache,
const PxBounds3* PX_RESTRICT bounds,
const PxReal* PX_RESTRICT contactDistance,
const PxgShape* PX_RESTRICT gpuShapes,
const PxgSoftBody* PX_RESTRICT softbodies,
const PxgFEMCloth* PX_RESTRICT clothes,
const PxU32 stackSizeBytes,
const PxReal nbCollisionPairUpdatesPerTimestep,
PxU8* PX_RESTRICT stackPtr, //output
PxU32* PX_RESTRICT midphasePairsNum //output
)
{
// One scratch region per warp; row selected by the warp's threadIdx.y.
__shared__ PxU32 scratchMem[MIDPHASE_WARPS_PER_BLOCK][WARP_SIZE * 18];
sb_clothVertMidphaseCore<MIDPHASE_WARPS_PER_BLOCK>(
toleranceLength,
cmInputs,
transformCache,
bounds,
contactDistance,
gpuShapes,
softbodies,
clothes,
stackSizeBytes,
nbCollisionPairUpdatesPerTimestep,
stackPtr,
midphasePairsNum,
(femMidphaseScratch*)scratchMem[threadIdx.y]
);
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,840 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __SPHERE_COLLISION_CUH__
#define __SPHERE_COLLISION_CUH__
#include "foundation/PxVec3.h"
#include "foundation/PxTransform.h"
#include "distanceSegmentSegment.cuh"
//0 is sphere, 1 is sphere, point = [x, y, z, pen];
// Sphere-vs-sphere contact: returns 1 and fills outPointPen/outNormal when the
// centers are closer than r0 + r1 + contactDist, otherwise returns 0.
__device__ __forceinline__ static
PxU32 spheresphere(const PxTransform& transform0, const PxTransform& transform1, const PxReal r0,
const PxReal r1, const PxReal contactDist, float4& outPointPen, PxVec3& outNormal)
{
	const PxVec3 centerDelta = transform0.p - transform1.p;
	const PxReal sqDist = centerDelta.dot(centerDelta);
	const PxReal radiusSum = r0 + r1;
	const PxReal inflatedSum = radiusSum + contactDist;
	if (sqDist >= inflatedSum * inflatedSum)
		return 0;	// further apart than the inflated radii: no contact

	const PxReal dist = PxSqrt(sqDist);
	// Guard against (near-)coincident centers: fall back to a fixed axis.
	PxVec3 normal(1.0f, 0.f, 0.f);
	if (dist > 1e-4f)
		normal = centerDelta * (1.f / dist);
	// Contact point on the surface of sphere 1; penetration is negative when overlapping.
	const PxVec3 point = transform1.p + (normal * r1);
	outPointPen = make_float4(point.x, point.y, point.z, dist - radiusSum);
	outNormal = normal;
	return 1;
}
// Sphere-vs-sphere contact (positions only): returns 1 and writes the contact
// normal plus penetration into outNormalPen when the centers are closer than
// r0 + r1 + contactDist, otherwise returns 0.
__device__ __forceinline__ static
PxU32 spheresphere(const PxVec3& p0, const PxVec3& p1, const PxReal r0,
const PxReal r1, const PxReal contactDist, float4& outNormalPen)
{
	const PxVec3 centerDelta = p0 - p1;
	const PxReal sqDist = centerDelta.dot(centerDelta);
	const PxReal radiusSum = r0 + r1;
	const PxReal inflatedSum = radiusSum + contactDist;
	if (sqDist >= inflatedSum * inflatedSum)
		return 0;	// further apart than the inflated radii: no contact

	const PxReal dist = PxSqrt(sqDist);
	// Guard against (near-)coincident centers: fall back to a fixed axis.
	PxVec3 normal(1.0f, 0.f, 0.f);
	if (dist > 1e-4f)
		normal = centerDelta * (1.f / dist);
	// Penetration is negative when the spheres overlap.
	outNormalPen = make_float4(normal.x, normal.y, normal.z, dist - radiusSum);
	return 1;
}
// Squared distance from point p to segment [a, b].
// param receives the clamped parametric position t in [0, 1] of the closest
// point on the segment (t = 0 for a degenerate segment where a == b).
__device__ __forceinline__ static
PxReal distancePointSegmentSquared(const PxVec3 a, const PxVec3 b, const PxVec3 p, PxReal& param)
{
	const PxVec3 ap = p - a;
	const PxVec3 ab = b - a;
	const PxReal nom = ap.dot(ab);
	const PxReal denom = ab.dot(ab);
	// Only divide when the segment has nonzero length. The previous code
	// computed nom / denom unconditionally, producing a 0/0 NaN for a
	// degenerate segment that was clamped and then discarded.
	const PxReal t = denom > 0.f ? PxClamp(nom / denom, 0.f, 1.f) : 0.f;
	const PxVec3 v = ap - (ab * t);
	param = t;
	return v.dot(v);
}
//0 is sphere, 1 is capsule
// Sphere-vs-capsule contact. The capsule axis runs along transform1's local x
// axis with endpoints at +/- halfHeight. Returns 1 and fills the world-space
// contact point (w = penetration) and normal when within contactDist, else 0.
__device__ __forceinline__ static
PxU32 spherecapsule(const PxTransform& transform0, const PxTransform& transform1, const PxReal sphereRadius,
const PxReal capsuleRadius, const PxReal halfHeight, const PxReal contactDist, float4& outPointPen, PxVec3& outNormal)
{
//Sphere in world space
const PxVec3& sphereCenter = transform0.p;
const PxQuat& q1 = transform1.q;
const PxVec3& p1 = transform1.p;
// Capsule segment endpoints in world space.
const PxVec3 basisVector0 = q1.getBasisVector0();
const PxVec3 tmp0 = basisVector0 * halfHeight;
const PxVec3 s = p1 + tmp0;
const PxVec3 e = p1 - tmp0;
const PxReal radiusSum = sphereRadius + capsuleRadius;
const PxReal inflatedSum = radiusSum + contactDist;
// Closest point on the segment to the sphere center (t in [0, 1]).
PxReal t;
const PxReal squareDist = distancePointSegmentSquared(s, e, sphereCenter, t);
const PxReal sqInflatedSum = inflatedSum * inflatedSum;
// Collision detection
if (sqInflatedSum > squareDist)
{
const PxVec3 p = s + (e - s) * t;//V3ScaleAdd(V3Sub(e, s), t, s);
const PxVec3 dir = sphereCenter - p;// V3Sub(sphereCenter, p);
const PxReal length = PxSqrt(dir.magnitudeSquared());
// Degenerate (center on the axis): fall back to a fixed normal.
const PxVec3 normal = length > PX_EPS_REAL ? (dir * (1.f / length)) : PxVec3(1.0f, 0.f, 0.f);
// Contact point on the sphere surface; penetration negative when overlapping.
const PxVec3 point = sphereCenter - normal * sphereRadius;
const PxReal pen = PxSqrt(squareDist) - radiusSum;
outPointPen = make_float4(point.x, point.y, point.z, pen);
outNormal = normal;
/*printf("outPoint(%f, %f, %f), pen %f\n", outPointPen.x, outPointPen.y, outPointPen.z, outPointPen.w);
printf("outNormal(%f, %f, %f)\n", outNormal.x, outNormal.y, outNormal.z, 0.f);*/
return 1;
}
return 0;
}
//0 is sphere, 1 is capsule
// Sphere-vs-capsule contact (normal/penetration variant). Same geometry as the
// point variant above, but writes only the normal (w = penetration). If outT is
// supplied, it always receives 1 - t (the closest-point parameter measured from
// endpoint e towards s), even when no contact is generated.
__device__ __forceinline__ static
PxU32 spherecapsule(const PxTransform& transform0, const PxTransform& transform1, const PxReal sphereRadius,
const PxReal capsuleRadius, const PxReal halfHeight, const PxReal contactDist, float4& outNormalPen, PxReal* outT = NULL)
{
//Sphere in world space
const PxVec3& sphereCenter = transform0.p;
const PxQuat& q1 = transform1.q;
const PxVec3& p1 = transform1.p;
// Capsule segment endpoints in world space (axis = local x of transform1).
const PxVec3 basisVector0 = q1.getBasisVector0();
const PxVec3 tmp0 = basisVector0 * halfHeight;
const PxVec3 s = p1 + tmp0;
const PxVec3 e = p1 - tmp0;
const PxReal radiusSum = sphereRadius + capsuleRadius;
const PxReal inflatedSum = radiusSum + contactDist;
PxReal t;
const PxReal squareDist = distancePointSegmentSquared(s, e, sphereCenter, t);
const PxReal sqInflatedSum = inflatedSum * inflatedSum;
// Report the (flipped) segment parameter regardless of contact.
if (outT != NULL)
{
*outT = 1.0f - t;
}
// Collision detection
if (sqInflatedSum > squareDist)
{
const PxVec3 p = s + (e - s) * t;//V3ScaleAdd(V3Sub(e, s), t, s);
const PxVec3 dir = sphereCenter - p;// V3Sub(sphereCenter, p);
const PxReal length = PxSqrt(dir.magnitudeSquared());
// Degenerate (center on the axis): fall back to a fixed normal.
const PxVec3 normal = length > PX_EPS_REAL ? (dir * (1.f / length)) : PxVec3(1.0f, 0.f, 0.f);
const PxReal pen = PxSqrt(squareDist) - radiusSum;
outNormalPen = make_float4(normal.x, normal.y, normal.z, pen);
/*printf("outPoint(%f, %f, %f), pen %f\n", outPointPen.x, outPointPen.y, outPointPen.z, outPointPen.w);
printf("outNormal(%f, %f, %f)\n", outNormal.x, outNormal.y, outNormal.z, 0.f);*/
return 1;
}
return 0;
}
//0 is sphere, 1 is plane
// Sphere-vs-plane contact. The plane passes through transform1's origin with
// its normal along transform1's local +x axis; "separation" is the signed gap
// between the sphere surface and the plane (negative = penetrating).
__device__ __forceinline__ static
PxU32 sphereplane(const PxTransform& transform0, const PxTransform& transform1, const PxReal sphereRadius, const PxReal contactDist,
float4& outPointPen, PxVec3& outNormal)
{
	// Bring the sphere center into plane space; its x coordinate is the
	// signed distance of the center from the plane.
	const PxVec3 centerInPlaneSpace = transform1.transformInv(transform0.p);
	const PxReal separation = centerInPlaneSpace.x - sphereRadius;
	if (separation > contactDist)
		return 0;	// beyond the contact offset: no touch

	// World-space plane normal and the deepest point on the sphere surface.
	const PxVec3 worldNormal = transform1.q.getBasisVector0();
	const PxVec3 worldPoint = transform0.p - worldNormal * sphereRadius;
	outPointPen = make_float4(worldPoint.x, worldPoint.y, worldPoint.z, separation);
	outNormal = worldNormal;
	return 1;
}
//0 is sphere, 1 is plane
// Sphere-vs-plane contact (normal/penetration variant). Same geometry as the
// point variant: the plane normal is transform1's local +x axis; w receives
// the signed separation of the sphere surface from the plane.
__device__ __forceinline__ static
PxU32 sphereplane(const PxTransform& transform0, const PxTransform& transform1, const PxReal sphereRadius, const PxReal contactDist,
float4& outNormalPen)
{
	// Signed distance of the sphere surface above the plane: x coordinate of
	// the center in plane space minus the radius.
	const PxVec3 centerInPlaneSpace = transform1.transformInv(transform0.p);
	const PxReal separation = centerInPlaneSpace.x - sphereRadius;
	if (separation > contactDist)
		return 0;	// beyond the contact offset: no touch

	const PxVec3 worldNormal = transform1.q.getBasisVector0();
	outNormalPen = make_float4(worldNormal.x, worldNormal.y, worldNormal.z, separation);
	return 1;
}
//0 is sphere, 1 is box
// Sphere-vs-box contact (world-space point variant). The sphere center is
// clamped into the box to find the closest feature; a separate deepest-face
// path handles a center embedded inside the box. Returns 1 and fills the
// world-space contact point (w = penetration) and normal on contact, else 0.
__device__ __forceinline__ static
PxU32 spherebox(const PxTransform& transform0, const PxTransform& transform1, const PxReal sphereRadius,
const PxVec3 boxExtents, const PxReal contactDist, float4& outPointPen, PxVec3& outNormal)
{
const PxVec3& sphereOrigin = transform0.p;
//const PxQuat& q1 = transform1.q;
//const PxVec3& p1 = transform1.p;
//const PxTransform transf1(p1, q1);
//translate sphere center into the box space
const PxVec3 sphereCenter = transform1.transformInv(sphereOrigin);
const PxVec3 nBoxExtents = -boxExtents;
const PxReal inflatedSum = sphereRadius + contactDist;
const PxReal sqInflatedSum = inflatedSum * inflatedSum;
// Closest point p on (or in) the box to the sphere center, in box space.
const PxReal x = PxClamp(sphereCenter.x, nBoxExtents.x, boxExtents.x);
const PxReal y = PxClamp(sphereCenter.y, nBoxExtents.y, boxExtents.y);
const PxReal z = PxClamp(sphereCenter.z, nBoxExtents.z, boxExtents.z);
const PxVec3 p(x, y, z);
const PxVec3 v = sphereCenter - p;
const PxReal lengthSq = v.dot(v);
if (sqInflatedSum > lengthSq)//intersect
{
//check whether the sphereCenter is inside the box
const bool bX = boxExtents.x >= PxAbs(sphereCenter.x);
const bool bY = boxExtents.y >= PxAbs(sphereCenter.y);
const bool bZ = boxExtents.z >= PxAbs(sphereCenter.z);
const bool bInsideBox = bX && bY && bZ;// V3IsGrtrOrEq(boxExtents, V3Abs(sphereCenter));
if (bInsideBox)//sphere center inside the box
{
//Pick directions and sign
const PxVec3 absP(PxAbs(p.x), PxAbs(p.y), PxAbs(p.z));
const PxVec3 distToSurface = boxExtents - absP;//dist from embedded center to box surface along 3 dimensions.
//assume z is the smallest element of distToSurface
PxVec3 localNorm = p.z >= 0.f ? PxVec3(0.f, 0.f, 1.f) : PxVec3(0.f, 0.f, -1.f);
PxReal dist = -distToSurface.z;
//find smallest element of distToSurface
if (distToSurface.x <= distToSurface.y && distToSurface.x <= distToSurface.z)
{
localNorm = p.x >= 0.f ? PxVec3(1.f, 0.f, 0.f) : PxVec3(-1.0f, 0.f, 0.f);
dist = -distToSurface.x;
}
else if (distToSurface.y <= distToSurface.z)
{
localNorm = p.y >= 0.f ? PxVec3(0.f, 1.f, 0.f) : PxVec3(0.f, -1.f, 0.f);
dist = -distToSurface.y;
}
/* printf("p(%f, %f, %f)\n", p.x, p.y, p.z);
printf("distToSurface(%f, %f, %f)\n", distToSurface.x, distToSurface.y, distToSurface.z);
printf("dist %f\n", dist);*/
//separation so far is just the embedding of the center point; we still have to push out all of the radius.
const PxVec3 normal = transform1.rotate(localNorm);
const PxReal penetration = dist - sphereRadius;
const PxVec3 point = sphereOrigin - normal * dist;
outPointPen = make_float4(point.x, point.y, point.z, penetration);
outNormal = normal;
//printf("Inside the box\n");
/*printf("pentration %f\n", penetration);
printf("tP(%f, %f, %f), tQ(%f, %f, %f, %f)\n", transf1.p.x, transf1.p.y, transf1.p.z, transf1.q.x, transf1.q.y, transf1.q.z, transf1.q.w);
printf("outPoint(%f, %f, %f), pen %f\n", outPointPen.x, outPointPen.y, outPointPen.z, outPointPen.w);
printf("outNormal(%f, %f, %f)\n", outNormal.x, outNormal.y, outNormal.z, 0.f);*/
}
else
{
//get the closest point from the center to the box surface
const PxReal recipLength = PxRecipSqrt(lengthSq);
const PxReal length = 1 / recipLength;
const PxVec3 locNorm = v * recipLength;
const PxReal penetration = length - sphereRadius;
// Transform closest point and normal back to world space.
const PxVec3 normal = transform1.rotate(locNorm);
const PxVec3 point = transform1.transform(p);
outPointPen = make_float4(point.x, point.y, point.z, penetration);
outNormal = normal;
/*printf("recipLength %f, length %f \n", recipLength, length);
printf("locNorm(%f, %f, %f)\n", locNorm.x, locNorm.y, locNorm.z);
printf("pentration %f\n", penetration);
printf("tP(%f, %f, %f), tQ(%f, %f, %f, %f)\n", transf1.p.x, transf1.p.y, transf1.p.z, transf1.q.x, transf1.q.y, transf1.q.z, transf1.q.w);
printf("outPoint(%f, %f, %f), pen %f\n", outPointPen.x, outPointPen.y, outPointPen.z, outPointPen.w);
printf("outNormal(%f, %f, %f)\n", outNormal.x, outNormal.y, outNormal.z, 0.f);*/
}
return 1;
}
//printf("Not intersect! \n");
return 0;
}
//0 is sphere, 1 is box
// Sphere-vs-box contact (world-space normal variant). Same geometry as the
// point variant, but takes the sphere center directly and writes only the
// world-space normal with w = penetration.
__device__ __forceinline__ static
PxU32 spherebox(const PxVec3& sphereOrigin, const PxTransform& transform1, const PxReal sphereRadius,
const PxVec3 boxExtents, const PxReal contactDist, float4& outNormalPen)
{
/*const PxVec3& sphereOrigin = transform0.p;
const PxQuat& q1 = transform1.q;
const PxVec3& p1 = transform1.p;
const PxTransform transf1(p1, q1);*/
//translate sphere center into the box space
const PxVec3 sphereCenter = transform1.transformInv(sphereOrigin);
const PxVec3 nBoxExtents = -boxExtents;
const PxReal inflatedSum = sphereRadius + contactDist;
const PxReal sqInflatedSum = inflatedSum * inflatedSum;
// Closest point p on (or in) the box to the sphere center, in box space.
const PxReal x = PxClamp(sphereCenter.x, nBoxExtents.x, boxExtents.x);
const PxReal y = PxClamp(sphereCenter.y, nBoxExtents.y, boxExtents.y);
const PxReal z = PxClamp(sphereCenter.z, nBoxExtents.z, boxExtents.z);
const PxVec3 p(x, y, z);
const PxVec3 v = sphereCenter - p;
const PxReal lengthSq = v.dot(v);
if (sqInflatedSum > lengthSq)//intersect
{
//check whether the sphereCenter is inside the box
const bool bX = boxExtents.x >= PxAbs(sphereCenter.x);
const bool bY = boxExtents.y >= PxAbs(sphereCenter.y);
const bool bZ = boxExtents.z >= PxAbs(sphereCenter.z);
const bool bInsideBox = bX && bY && bZ;// V3IsGrtrOrEq(boxExtents, V3Abs(sphereCenter));
if (bInsideBox)//sphere center inside the box
{
//Pick directions and sign
const PxVec3 absP(PxAbs(p.x), PxAbs(p.y), PxAbs(p.z));
const PxVec3 distToSurface = boxExtents - absP;//dist from embedded center to box surface along 3 dimensions.
//assume z is the smallest element of distToSurface
PxVec3 localNorm = p.z >= 0.f ? PxVec3(0.f, 0.f, 1.f) : PxVec3(0.f, 0.f, -1.f);
PxReal dist = -distToSurface.z;
//find smallest element of distToSurface
if (distToSurface.x <= distToSurface.y && distToSurface.x <= distToSurface.z)
{
localNorm = p.x >= 0.f ? PxVec3(1.f, 0.f, 0.f) : PxVec3(-1.0f, 0.f, 0.f);
dist = -distToSurface.x;
}
else if (distToSurface.y <= distToSurface.z)
{
localNorm = p.y >= 0.f ? PxVec3(0.f, 1.f, 0.f) : PxVec3(0.f, -1.f, 0.f);
dist = -distToSurface.y;
}
//separation so far is just the embedding of the center point; we still have to push out all of the radius.
const PxVec3 normal = transform1.rotate(localNorm);
const PxReal penetration = dist - sphereRadius;
outNormalPen = make_float4(normal.x, normal.y, normal.z, penetration);
}
else
{
//get the closest point from the center to the box surface
const PxReal recipLength = PxRecipSqrt(lengthSq);
const PxReal length = 1 / recipLength;
const PxVec3 locNorm = v * recipLength;
const PxReal penetration = length - sphereRadius;
// Rotate the local normal back to world space.
const PxVec3 normal = transform1.rotate(locNorm);
outNormalPen = make_float4(normal.x, normal.y, normal.z, penetration);
}
return 1;
}
return 0;
}
//0 is sphere, 1 is box(sphere is in the local space of box), normal is in the local space of box
// Sphere-vs-box contact, fully in box-local space: no transforms are applied,
// so both the input center and the output normal are local to the box.
__device__ __forceinline__ static
PxU32 spherebox(const PxVec3& sphereCenter, const PxReal sphereRadius,
const PxVec3 boxExtents, const PxReal contactDist, float4& outNormalPen)
{
const PxVec3 nBoxExtents = -boxExtents;
const PxReal inflatedSum = sphereRadius + contactDist;
const PxReal sqInflatedSum = inflatedSum * inflatedSum;
// Closest point p on (or in) the box to the sphere center.
const PxReal x = PxClamp(sphereCenter.x, nBoxExtents.x, boxExtents.x);
const PxReal y = PxClamp(sphereCenter.y, nBoxExtents.y, boxExtents.y);
const PxReal z = PxClamp(sphereCenter.z, nBoxExtents.z, boxExtents.z);
const PxVec3 p(x, y, z);
const PxVec3 v = sphereCenter - p;
const PxReal lengthSq = v.dot(v);
if (sqInflatedSum > lengthSq)//intersect
{
//check whether the sphereCenter is inside the box
const bool bX = boxExtents.x >= PxAbs(sphereCenter.x);
const bool bY = boxExtents.y >= PxAbs(sphereCenter.y);
const bool bZ = boxExtents.z >= PxAbs(sphereCenter.z);
const bool bInsideBox = bX && bY && bZ;// V3IsGrtrOrEq(boxExtents, V3Abs(sphereCenter));
if (bInsideBox)//sphere center inside the box
{
//Pick directions and sign
const PxVec3 absP(PxAbs(p.x), PxAbs(p.y), PxAbs(p.z));
const PxVec3 distToSurface = boxExtents - absP;//dist from embedded center to box surface along 3 dimensions.
//assume z is the smallest element of distToSurface
PxVec3 localNorm = p.z >= 0.f ? PxVec3(0.f, 0.f, 1.f) : PxVec3(0.f, 0.f, -1.f);
PxReal dist = -distToSurface.z;
//find smallest element of distToSurface
if (distToSurface.x <= distToSurface.y && distToSurface.x <= distToSurface.z)
{
localNorm = p.x >= 0.f ? PxVec3(1.f, 0.f, 0.f) : PxVec3(-1.0f, 0.f, 0.f);
dist = -distToSurface.x;
}
else if (distToSurface.y <= distToSurface.z)
{
localNorm = p.y >= 0.f ? PxVec3(0.f, 1.f, 0.f) : PxVec3(0.f, -1.f, 0.f);
dist = -distToSurface.y;
}
//separation so far is just the embedding of the center point; we still have to push out all of the radius.
const PxReal penetration = dist - sphereRadius;
outNormalPen = make_float4(localNorm.x, localNorm.y, localNorm.z, penetration);
}
else
{
//get the closest point from the center to the box surface
const PxReal recipLength = PxRecipSqrt(lengthSq);
const PxReal length = 1 / recipLength;
const PxVec3 locNorm = v * recipLength;
const PxReal penetration = length - sphereRadius;
outNormalPen = make_float4(locNorm.x, locNorm.y, locNorm.z, penetration);
}
return 1;
}
return 0;
}
//0 is plane, 1 is capsule
// Plane-vs-capsule contact: tests both capsule segment endpoints against the
// plane (local normal = +x of transform0) and returns 0, 1 or 2 contacts in
// outPointPen. outNormal is always set to minus the world plane normal. If t
// is supplied, it receives a blend weight in [0, 1] derived from the relative
// endpoint depths (smoothed via tanh when contactDist > 0).
__device__ __forceinline__ static
PxU32 planeCapsule(const PxTransform& transform0, const PxTransform& transform1,
const PxReal radius, const PxReal halfHeight, const PxReal contactDist,
float4 *outPointPen, PxVec3& outNormal, PxReal* t = NULL)
{
//capsule to plane
const PxTransform bToA = transform0.transformInv(transform1);
//in world space
const PxVec3 planeNormal = transform0.q.getBasisVector0().getNormalized();
//const PxVec3 contactNormal =-planeNormal;
outNormal = -planeNormal;
//unit x
const PxVec3 localNormal(1.f, 0.f, 0.f);
//printf("outNormal (%f, %f, %f)\n", outNormal.x, outNormal.y, outNormal.z);
//capsule is in the local space of plane(n = (1.f, 0.f, 0.f), d=0.f)
const PxVec3 basisVector = bToA.q.getBasisVector0();
const PxVec3 tmp = basisVector * halfHeight;
const PxVec3 s = bToA.p + tmp;
const PxVec3 e = bToA.p - tmp;
const PxReal inflatedRadius = radius + contactDist;
PxU32 numContacts = 0;
// First endpoint: its x coordinate in plane space is the signed distance.
const PxReal signDist0 = s.x;//V3Dot(localNormal, s);
if (inflatedRadius >= signDist0)
{
const PxVec3 worldPoint = transform0.transform(s) - planeNormal * radius;
const PxReal pen = signDist0 - radius;
//printf("s localPoint(%f, %f, %f) pen %f\n", localPoint.x, localPoint.y, localPoint.z, pen);
outPointPen[numContacts] = make_float4(worldPoint.x, worldPoint.y, worldPoint.z, pen);
numContacts++;
}
// Second endpoint.
const PxReal signDist1 = e.x;//V3Dot(localNormal, s);
if (inflatedRadius >= signDist1)
{
const PxVec3 worldPoint = transform0.transform(e) - planeNormal * radius;
const PxReal pen = signDist1 - radius;
//printf("e worldPoint(%f, %f, %f) pen %f\n", worldPoint.x, worldPoint.y, worldPoint.z, pen);
//printf("e normal(%f, %f, %f)\n", outNormal.x, outNormal.y, outNormal.z);
outPointPen[numContacts] = make_float4(worldPoint.x, worldPoint.y, worldPoint.z, pen);
numContacts++;
}
if (t != NULL)
{
if (contactDist > 0.0f)
{
// *t = PxClamp(0.5f * (signDist1 - signDist0) / contactDist + 0.5f, 0.0f, 1.0f);
// Smooth, saturating blend of the two endpoint depths.
*t = 0.5f + 0.5f * tanh(5.0f * (signDist1 - signDist0) / contactDist);
}
else
{
// No offset to normalize by: pick the deeper endpoint outright.
*t = signDist0 > signDist1 ? 0.0f : 1.0f;
}
}
return numContacts;
}
__device__ __forceinline__ static
void pcmDistancePointSegmentTValue22(
const PxVec3& a0, const PxVec3& b0,
const PxVec3& a1, const PxVec3& b1,
const PxVec3& p0, const PxVec3& p1,
const PxVec3& p2, const PxVec3& p3,
PxReal& t0, PxReal& t1,
PxReal& t2, PxReal& t3)
{
	// Unclamped parametric projection of four points onto two segments:
	// p0 and p1 are projected onto segment (a0, b0) -> t0, t1;
	// p2 and p3 are projected onto segment (a1, b1) -> t2, t3.
	// A (near) zero-length segment yields t = 0 for its two projections.
	const PxVec3 seg0 = b0 - a0;
	const PxVec3 seg1 = b1 - a1;

	const PxReal len0Sq = seg0.dot(seg0);
	const PxReal len1Sq = seg1.dot(seg1);

	const PxReal tiny = 1e-6f;
	const bool degenerate0 = PxAbs(len0Sq) < tiny;
	const bool degenerate1 = PxAbs(len1Sq) < tiny;

	t0 = degenerate0 ? 0.f : (p0 - a0).dot(seg0) / len0Sq;
	t1 = degenerate0 ? 0.f : (p1 - a0).dot(seg0) / len0Sq;
	t2 = degenerate1 ? 0.f : (p2 - a1).dot(seg1) / len1Sq;
	t3 = degenerate1 ? 0.f : (p3 - a1).dot(seg1) / len1Sq;
}
// Capsule vs. capsule contact generation.
// Each capsule is given by its transform (local x axis = segment axis),
// radius and half height. Writes up to 2 contacts into outPointPen as
// (world position, penetration); outNormal points from capsule1's segment
// toward capsule0's. The optional outT0/outT1 receive 1 - t of the
// closest-point parameters along segment0/segment1 (i.e. measured from the
// opposite endpoint of the s = p + axis*halfHeight convention used here).
// Returns the number of contacts generated (0, 1 or 2).
__device__ __forceinline__ static
PxU32 capsuleCapsule(const PxTransform& transform0, const PxTransform& transform1,
const PxReal radius0, const PxReal halfHeight0, const PxReal radius1, const PxReal halfHeight1,
const PxReal contactDist,
float4 *outPointPen, PxVec3& outNormal, PxReal* outT0 = NULL, PxReal* outT1 = NULL)
{
PxU32 numContacts = 0;
// Work relative to the midpoint of the two bodies; it is added back to the
// contact points at the end.
const PxVec3 positionOffset = (transform0.p + transform1.p) * 0.5f;
const PxVec3 p0 = transform0.p - positionOffset;
const PxVec3 p1 = transform1.p - positionOffset;
// Segment endpoints s/e and direction d of capsule0.
const PxVec3 basisVector0 = transform0.q.getBasisVector0();
const PxVec3 tmp0 = basisVector0 * halfHeight0;
const PxVec3 s0 = p0 + tmp0;
const PxVec3 e0 = p0 - tmp0;
const PxVec3 d0 = e0 - s0;
// Segment endpoints s/e and direction d of capsule1.
const PxVec3 basisVector1 = transform1.q.getBasisVector0();
const PxVec3 tmp1 = basisVector1 * halfHeight1;
const PxVec3 s1 = p1 + tmp1;
const PxVec3 e1 = p1 - tmp1;
const PxVec3 d1 = e1 - s1;
const PxReal sumRadius = radius0 + radius1; //FAdd(r0, r1);
const PxReal inflatedSum = sumRadius + contactDist;
const PxReal inflatedSumSquared = inflatedSum * inflatedSum;
const PxReal a = d0.dot(d0);//squared length of segment1
const PxReal e = d1.dot(d1);//squared length of segment2
const PxReal eps = 1e-6f;//FEps();
// Closest-point parameters between the two segments.
PxReal t0, t1;
distanceSegmentSegmentSquared(s0, d0, s1, d1, t0, t1);
if (outT0 != NULL)
{
*outT0 = (1.0f - t0);
}
if (outT1 != NULL)
{
*outT1 = (1.0f - t1);
}
const PxVec3 closestA = s0 + d0 * t0;
const PxVec3 closestB = s1 + d1 * t1;
const PxVec3 vv = closestA - closestB;
const PxReal sqDist0 = vv.dot(vv);
if (eps > sqDist0)
{
// Segments (nearly) intersect: the inter-point direction is degenerate, so
// fall back to a normal derived from capsule0's orientation.
const PxVec3 _normal = transform0.q.rotateInv(PxVec3(0.f, 1.f, 0.f));
const PxVec3 normal = _normal.getNormalized();
const PxVec3 _point = closestA - normal * radius0;
const PxVec3 p = _point + positionOffset;
const PxReal pen = -radius0;
outPointPen[0] = make_float4(p.x, p.y, p.z, pen);
//printf("This is two segment inline with each other");
outNormal = normal;
numContacts = 1;
}
else
{
if (inflatedSumSquared >= sqDist0)
{
// Check whether the two capsules are (nearly) parallel; if so, generate up
// to two contacts by projecting segment endpoints onto the other segment.
const PxReal parallelTolerance = 0.9998f;
const PxVec3 dir0 = (eps > a) ? PxVec3(0.f) : (d0 * (1.f / PxSqrt(a)));
const PxVec3 dir1 = (eps > e) ? PxVec3(0.f) : (d1 * (1.f / PxSqrt(e)));
const PxReal cos = PxAbs(dir0.dot(dir1));
if (cos > parallelTolerance)// parallel
{
//tV0 is s1 project to s0e0
//tV1 is e1 project to s0e0
//tV2 is s0 project to s1e1
//tV3 is e0 project to s1e1
PxReal tV0, tV1, tV2, tV3;
pcmDistancePointSegmentTValue22(s0, e0, s1, e1,
s1, e1, s0, e0, tV0, tV1, tV2, tV3);
//printf("t(%f %f %f %f)\n", tV0, tV1, tV2, tV3);
// Each projection that falls within the segment contributes one contact,
// capped at two (the later tests check numContacts < 2).
if (tV0 >= 0.f && tV0 <= 1.f)
{
const PxVec3 projS1 = s0 + d0 * tV0;
const PxVec3 v = projS1 - s1;
const PxReal sqDist = v.dot(v);
const PxReal dist = PxSqrt(sqDist);
const PxReal pen = dist - sumRadius;
const PxVec3 normal = v / dist;// V3ScaleInv(v, dist);
//assert(isFiniteVec3V(normal));
const PxVec3 _p = projS1 - normal * radius0;// V3NegScaleSub(normal, r0, projS1);
const PxVec3 p = _p + positionOffset;// V3Add(_p, positionOffset);
outPointPen[numContacts] = make_float4(p.x, p.y, p.z, pen);
outNormal = normal;
//printf("_p(%f, %f, %f)\n", _p.x, _p.y, _p.z);
//printf("p(%f, %f, %f pen %f)\n", p.x, p.y, p.z, pen);
//printf("normal(%f, %f, %f)\n", normal.x, normal.y, normal.z);
//storeContact(p, normal, pen, contactBuffer);
numContacts++;
}
if (tV1 >= 0.f && tV1 <= 1.f)
{
const PxVec3 projE1 = s0 + d0 * tV1;
const PxVec3 v = projE1 - e1;
const PxReal sqDist = v.dot(v);
const PxReal dist = PxSqrt(sqDist);
const PxReal pen = dist - sumRadius;
const PxVec3 normal = v / dist; // V3ScaleInv(v, dist);
//assert(isFiniteVec3V(normal));
const PxVec3 _p = projE1 - normal * radius0;// V3NegScaleSub(normal, r0, projE1);
const PxVec3 p = _p + positionOffset;
//storeContact(p, normal, pen, contactBuffer);
outPointPen[numContacts] = make_float4(p.x, p.y, p.z, pen);
outNormal = normal;
//printf("p(%f, %f, %f pen %f)\n", p.x, p.y, p.z, pen);
//printf("normal(%f, %f, %f)\n", normal.x, normal.y, normal.z);
numContacts++;
}
if (numContacts < 2 && (tV2 >= 0.f && tV2 <= 1.f))
{
const PxVec3 projS0 = s1 + d1 * tV2;
const PxVec3 v = s0 - projS0;
const PxReal sqDist = v.dot(v);
const PxReal dist = PxSqrt(sqDist);
const PxReal pen = dist - sumRadius;
const PxVec3 normal = v / dist;// V3ScaleInv(v, dist);
//assert(isFiniteVec3V(normal));
const PxVec3 _p = s0 - normal * radius0;// V3NegScaleSub(normal, r0, s0);
const PxVec3 p = _p + positionOffset;
outPointPen[numContacts] = make_float4(p.x, p.y, p.z, pen);
outNormal = normal;
//printf("p(%f, %f, %f pen %f)\n", p.x, p.y, p.z, pen);
//printf("normal(%f, %f, %f)\n", normal.x, normal.y, normal.z);
numContacts++;
}
if (numContacts < 2 && (tV3 >= 0.f && tV3 <= 1.f))
{
const PxVec3 projE0 = s1 + d1 * tV3;// V3ScaleAdd(d1, V4GetW(t), s1);
const PxVec3 v = e0 - projE0;// V3Sub(e0, projE0);
const PxReal sqDist = v.dot(v);// V3Dot(v, v);
const PxReal dist = PxSqrt(sqDist);
const PxReal pen = dist - sumRadius;
const PxVec3 normal = v / dist;
//assert(isFiniteVec3V(normal));
const PxVec3 _p = e0 - normal * radius0;// V3NegScaleSub(normal, r0, e0);
const PxVec3 p = _p + positionOffset;// V3Add(_p, positionOffset);
outPointPen[numContacts] = make_float4(p.x, p.y, p.z, pen);
outNormal = normal;
//printf("p(%f, %f, %f pen %f)\n", p.x, p.y, p.z, pen);
//printf("normal(%f, %f, %f)\n", normal.x, normal.y, normal.z);
numContacts++;
}
}
// Non-parallel capsules (or all endpoint projections out of range):
// single contact at the segment closest points.
if(numContacts == 0)
{
const PxVec3 normal = vv.getNormalized();
const PxVec3 _point = closestA - normal * radius0;
const PxVec3 p = _point + positionOffset;
const PxReal dist = PxSqrt(sqDist0);
const PxReal pen = dist - sumRadius;
outPointPen[0] = make_float4(p.x, p.y, p.z, pen);
outNormal = normal;
//printf("one point p(%f, %f, %f pen %f)\n", p.x, p.y, p.z, pen);
//printf("one point normal(%f, %f, %f)\n", normal.x, normal.y, normal.z);
numContacts = 1;
}
}
return numContacts;
}
#endif

View File

@@ -0,0 +1,163 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __SPHERE_TRIANGLE_CUH__
#define __SPHERE_TRIANGLE_CUH__
//#include "shuffle.cuh"
#include "nputils.cuh"
#include "dataReadWriteHelper.cuh"
#include "convexNpCommon.h"
#include "cudaNpCommon.h"
#include "triangle.cuh"
#include "stdio.h"
// Squared distance from point p to triangle (a, b, c) together with the
// closest point, using the Voronoi-region classification of Ericson,
// "Real-Time Collision Detection", 5.1.5 (closestPtPointTriangle).
//
// adjIdxs carries the triangle's per-edge adjacency slots
// (x: edge a-b, y: edge b-c, z: edge c-a; see isEdgeNonconvex()). Contacts on
// vertex/edge features are only flagged for generation when at least one
// touching edge is not marked non-convex, or (for edges) when the point lies
// numerically on the face boundary; this suppresses contacts against
// interior/concave mesh features.
//
// Outputs:
//  closestP        - closest point on the triangle to p
//  generateContact - whether the caller should emit a contact for this feature
//  faceContact     - true iff the closest feature is the face interior
//  mask            - optional feature id of the closest Voronoi region
// Returns |closestP - p|^2.
__device__ static PxReal distancePointTriangleSquared(const PxVec3 p,
const PxVec3 a,
const PxVec3 b,
const PxVec3 c,
const uint4 adjIdxs,
PxVec3& closestP,
bool& generateContact,
bool& faceContact,
PxU32* mask=NULL)
{
faceContact = false;
// Check if P in vertex region outside A
const PxVec3 ab = b - a;
const PxVec3 ac = c - a;
const PxVec3 ap = p - a;
const PxReal d1 = ab.dot(ap);
const PxReal d2 = ac.dot(ap);
if (d1 < 0.0f && d2 < 0.0f)
{
if(mask)
*mask = PxU32(ConvexTriIntermediateData::eV0);
generateContact = (!isEdgeNonconvex(adjIdxs.x)) || (!isEdgeNonconvex(adjIdxs.z));
closestP = a;
return ap.magnitudeSquared(); // Barycentric coords 1,0,0
}
// Check if P in vertex region outside B
const PxVec3 bp = p - b;
const PxReal d3 = ab.dot(bp);
const PxReal d4 = ac.dot(bp);
if (d3 > 0.0f && d4 < d3)
{
if(mask)
*mask = PxU32(ConvexTriIntermediateData::eV1);
generateContact = (!isEdgeNonconvex(adjIdxs.x)) || (!isEdgeNonconvex(adjIdxs.y));
closestP = b;
return bp.magnitudeSquared(); // Barycentric coords 0,1,0
}
// Check if P in vertex region outside C
const PxVec3 cp = p - c;
const PxReal d5 = ab.dot(cp);
const PxReal d6 = ac.dot(cp);
if (d6 > 0.0f && d5 < d6)
{
if (mask)
*mask = PxU32(ConvexTriIntermediateData::eV2);
generateContact = (!isEdgeNonconvex(adjIdxs.y)) || (!isEdgeNonconvex(adjIdxs.z));
closestP = c;
return cp.magnitudeSquared(); // Barycentric coords 0,0,1
}
// Unnormalized barycentric coordinates of p's projection onto the plane.
PxReal vc = d1 * d4 - d3 * d2;
PxReal vb = d5 * d2 - d1 * d6;
PxReal va = d3 * d6 - d5 * d4;
// Relative tolerance so that points sitting (numerically) on the face
// boundary still generate contacts on non-convex edges.
// Float literal (1e-5f): a plain 1e-5 is a double and would force a
// float->double->float round trip, which is expensive in device code.
const PxReal edgeTol = (va + vb + vc) * 1e-5f;
// Check if P in edge region of AB, if so return projection of P onto AB
if (vc < 0.0f && d1 >= 0.0f && d3 <= 0.0f)
{
if(mask)
*mask = PxU32(ConvexTriIntermediateData::eE01);
generateContact = !isEdgeNonconvex(adjIdxs.x) || vc > -edgeTol;
const PxReal v = d1 / (d1 - d3);
closestP = a + v * ab; // barycentric coords (1-v, v, 0)
return (closestP - p).magnitudeSquared();
}
// Check if P in edge region of AC, if so return projection of P onto AC
if (vb < 0.0f && d2 >= 0.0f && d6 <= 0.0f)
{
if(mask)
*mask = PxU32(ConvexTriIntermediateData::eE02);
generateContact = !isEdgeNonconvex(adjIdxs.z) || vb > -edgeTol;
const PxReal w = d2 / (d2 - d6);
closestP = a + w * ac; // barycentric coords (1-w, 0, w)
return (closestP - p).magnitudeSquared();
}
// Check if P in edge region of BC, if so return projection of P onto BC
if (va < 0.0f && d4 >= d3 && d5 >= d6)
{
if(mask)
*mask = PxU32(ConvexTriIntermediateData::eE12);
generateContact = !isEdgeNonconvex(adjIdxs.y) || va > -edgeTol;
const PxReal w = (d4 - d3) / ((d4 - d3) + (d5 - d6));
closestP = b + w * (c - b); // barycentric coords (0, 1-w, w)
return (closestP - p).magnitudeSquared();
}
// Face contacts are always generated.
generateContact = true;
faceContact = true;
if(mask)
*mask = 0;
// P inside face region. Compute Q through its barycentric coords (u,v,w)
closestP = a + (ab * vb + ac * vc)/ (va + vb + vc);
return (closestP - p).magnitudeSquared();
}
// Scalar triple product a . (b x c): the signed volume of the parallelepiped
// spanned by the three vectors.
__device__ inline PxReal scalarTriple(const PxVec3& a, const PxVec3& b, const PxVec3& c)
{
	const PxVec3 axb = a.cross(b);
	return axb.dot(c);
}
// Line vs. triangle test: the infinite line through p and q against triangle
// (a, b, c). Based on Real-Time Collision Detection p.186 - the line hits the
// triangle iff the three scalar triple products (one per edge) all share the
// same sign; this variant only accepts one winding (all non-negative).
__device__ inline bool intersectLineTri(const PxVec3& p, const PxVec3& q, const PxVec3& a, const PxVec3& b, const PxVec3& c)
{
	const PxVec3 dir = q - p;
	const PxVec3 relA = a - p;
	const PxVec3 relB = b - p;
	const PxVec3 relC = c - p;

	// m = dir x relC is shared by the first two triple products.
	const PxVec3 m = dir.cross(relC);
	if (relB.dot(m) < 0.f)		// u
		return false;
	if (-relA.dot(m) < 0.f)		// v
		return false;
	return scalarTriple(dir, relB, relA) >= 0.f;	// w
}
#endif

View File

@@ -0,0 +1,43 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __TRIANGLE_COMMON_CUH__
#define __TRIANGLE_COMMON_CUH__
// Feature ids stored with contacts (consumers live outside this header):
// NOTE(review): names suggest TRI_FEATURE_IDX marks a face feature and
// EDGE_FEATURE_IDX is an edge flag bit - verify at use sites.
#define TRI_FEATURE_IDX 0x7FffFFff
#define EDGE_FEATURE_IDX 0x80000000
// Adjacency sentinel: this edge has no neighboring triangle (mesh boundary).
static const physx::PxU32 BOUNDARY = 0xffffffff;
// High bit of an adjacency index marks the shared edge as non-convex.
static const physx::PxU32 NONCONVEX_FLAG = 0x80000000;
// True iff the adjacency slot refers to a real neighboring triangle (i.e. not
// a mesh boundary) and that shared edge carries the non-convex flag.
__device__ static bool isEdgeNonconvex(physx::PxU32 adjEdgeIndex)
{
	if (adjEdgeIndex == BOUNDARY)
		return false;
	return (adjEdgeIndex & NONCONVEX_FLAG) != 0;
}
#endif

View File

@@ -0,0 +1,198 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __TRIANGLE_MESH_CUH__
#define __TRIANGLE_MESH_CUH__
#include "GuBV32.h"
#include "PxgConvexConvexShape.h"
#include "foundation/PxVec3.h"
namespace physx
{
// GPU-side view of a cooked triangle mesh, filled in by
// readTriangleMesh(const PxgShape&, PxgTriangleMesh&) below. Every pointer
// aliases into the single cooked-mesh blob referenced by
// PxgShape::hullOrMeshPtr - this struct owns no memory.
struct PxgTriangleMesh
{
PxU32 numVerts; // number of vertices in trimeshVerts
PxU32 numTris; // number of triangles in indices
PxU32 numBv32TreeNodes; // BV32 tree node count (blob header .z)
PxU32 nbPackedNodes; // number of entries in bv32PackedNodes
const Gu::BV32DataPacked* bv32PackedNodes; // packed BV32 midphase tree
const float4* trimeshVerts; // vertex positions
const uint4* indices; // per-triangle vertex indices
const PxU32* PX_RESTRICT trimeshFaceRemap; // face remap table (one PxU32 per triangle)
// --- SDF section; only populated when sdfDims has non-zero volume ---
uint4 sdfDims; // SDF grid dimensions (zero volume => no SDF)
PxVec3 meshLower; // lower corner of the SDF grid
PxReal spacing; // SDF cell spacing
PxU32 subgridSize;
PxReal subgridsMinSdfValue;
PxReal subgridsMaxSdfValue;
PxU32 pad;
//const PxReal* sdf;
CUtexObject mTexObject; // SDF texture object; 0 when the mesh has no SDF
CUtexObject mTexObjectSparse; // sparse-subgrid SDF texture object
const PxU32* subgridStarts; // subgrid start offsets (sizeInfo.w entries, see reader)
};
// Walks the cooked triangle-mesh blob: reads the header uint4
// (x = #verts, y = #tris, z/w = BV32 node counts; w sizes the packed-node
// block) and returns pointers to the packed BV32 nodes and the vertex array.
// On return trimeshGeomPtr has been advanced to the vertex data.
// Returns the header uint4.
static PX_FORCE_INLINE __device__ uint4 readTriangleMesh(const PxU8* PX_RESTRICT & trimeshGeomPtr,
const Gu::BV32DataPacked* PX_RESTRICT & bv32PackedNodes, const float4* PX_RESTRICT & trimeshVerts)
{
const uint4 trimesh_nbVerts_nbTris_nbAdjVertsTotal = *reinterpret_cast<const uint4 *>(trimeshGeomPtr);
trimeshGeomPtr += sizeof(uint4);
bv32PackedNodes = reinterpret_cast<const Gu::BV32DataPacked*>(trimeshGeomPtr);
trimeshGeomPtr += sizeof(const Gu::BV32DataPacked)* trimesh_nbVerts_nbTris_nbAdjVertsTotal.w;
trimeshVerts = reinterpret_cast<const float4 *>(trimeshGeomPtr);
return trimesh_nbVerts_nbTris_nbAdjVertsTotal;
}
// As the overload above, but additionally skips the vertex array
// (result.x float4s) and locates the triangle index array. On return
// trimeshGeomPtr points at the indices. Returns the blob header.
static PX_FORCE_INLINE __device__ uint4 readTriangleMesh(const PxU8 * PX_RESTRICT & trimeshGeomPtr,
const Gu::BV32DataPacked* PX_RESTRICT & bv32PackedNodes, const float4* PX_RESTRICT & trimeshVerts, const uint4* PX_RESTRICT & trimeshTriIndices)
{
uint4 result = readTriangleMesh(trimeshGeomPtr, bv32PackedNodes, trimeshVerts);
trimeshGeomPtr += sizeof(float4) * result.x;
trimeshTriIndices = reinterpret_cast<const uint4 *>(trimeshGeomPtr);
return result;
}
// Locates vertices, triangle indices and the per-triangle adjacency block
// (skipping the index array: result.y uint4s). The BV32 node pointer is
// consumed locally and not exposed. On return trimeshGeomPtr points at the
// adjacency data. Returns the blob header.
static PX_FORCE_INLINE __device__ uint4 readTriangleMesh(const PxU8 * PX_RESTRICT & trimeshGeomPtr, const float4* PX_RESTRICT & trimeshVerts, const uint4* PX_RESTRICT & trimeshTriIndices,
const uint4* PX_RESTRICT & trimeshTriAdjacencies)
{
const Gu::BV32DataPacked* bv32PackedNodes;
uint4 result = readTriangleMesh(trimeshGeomPtr, bv32PackedNodes, trimeshVerts, trimeshTriIndices);
trimeshGeomPtr += sizeof(uint4)* result.y;
trimeshTriAdjacencies = reinterpret_cast<const uint4 *>(trimeshGeomPtr);
return result;
}
// Locates vertices, indices, adjacency and the face remap table (skipping the
// adjacency block: result.y uint4s). On return trimeshGeomPtr points at the
// remap table. Returns the blob header.
static PX_FORCE_INLINE __device__ uint4 readTriangleMesh(const PxU8 * PX_RESTRICT & trimeshGeomPtr, const float4* PX_RESTRICT & trimeshVerts, const uint4* PX_RESTRICT & trimeshTriIndices,
const uint4* PX_RESTRICT & trimeshTriAdjacencies, const PxU32* PX_RESTRICT & trimeshFaceRemap)
{
uint4 result = readTriangleMesh(trimeshGeomPtr, trimeshVerts, trimeshTriIndices, trimeshTriAdjacencies);
trimeshGeomPtr += sizeof(uint4)* result.y;
trimeshFaceRemap = reinterpret_cast<const PxU32 *>(trimeshGeomPtr);
return result;
}
// Fully parses the cooked triangle-mesh blob referenced by the shape into a
// PxgTriangleMesh, including the optional SDF section. The pointer walk below
// must mirror the CPU-side serialization layout exactly - keep the skipped
// section sizes in sync with the cooking code.
static __device__ void readTriangleMesh(const PxgShape& trimeshShape, PxgTriangleMesh& triangleMesh)
{
const PxU8 * trimeshGeomPtr = reinterpret_cast<const PxU8 *>(trimeshShape.hullOrMeshPtr);
// Header: x = #verts, y = #tris, z = BV32 tree nodes, w = packed BV32 nodes.
const uint4 nbVerts_nbTris_nbAdjVertsTotal_nbBv32TreeNodes = *reinterpret_cast<const uint4 *>(trimeshGeomPtr);
trimeshGeomPtr += sizeof(uint4);
triangleMesh.numVerts = nbVerts_nbTris_nbAdjVertsTotal_nbBv32TreeNodes.x;
triangleMesh.numTris = nbVerts_nbTris_nbAdjVertsTotal_nbBv32TreeNodes.y;
triangleMesh.numBv32TreeNodes = nbVerts_nbTris_nbAdjVertsTotal_nbBv32TreeNodes.z;
triangleMesh.nbPackedNodes = nbVerts_nbTris_nbAdjVertsTotal_nbBv32TreeNodes.w;
triangleMesh.bv32PackedNodes = reinterpret_cast<const Gu::BV32DataPacked*>(trimeshGeomPtr);
trimeshGeomPtr += sizeof(const Gu::BV32DataPacked)* nbVerts_nbTris_nbAdjVertsTotal_nbBv32TreeNodes.w;
triangleMesh.trimeshVerts = reinterpret_cast<const float4 *>(trimeshGeomPtr);
trimeshGeomPtr += sizeof(float4) * nbVerts_nbTris_nbAdjVertsTotal_nbBv32TreeNodes.x;
triangleMesh.indices = reinterpret_cast<const uint4 *>(trimeshGeomPtr);
trimeshGeomPtr += sizeof(uint4) * nbVerts_nbTris_nbAdjVertsTotal_nbBv32TreeNodes.y;
// Skip the adjacency block (one uint4 per triangle); not exposed here.
//const uint4* PX_RESTRICT trimeshTriAdjacencies = reinterpret_cast<const uint4 *>(trimeshGeomPtr);
trimeshGeomPtr += sizeof(uint4)* nbVerts_nbTris_nbAdjVertsTotal_nbBv32TreeNodes.y;
triangleMesh.trimeshFaceRemap = reinterpret_cast<const PxU32 *>(trimeshGeomPtr);
// Skip the face remap table (one PxU32 per triangle, unpadded).
//trimeshGeomPtr += ((nbVerts_nbTris_nbAdjVertsTotal_nbBv32TreeNodes.y + 3)& (~3)) * sizeof(PxU32);
trimeshGeomPtr += nbVerts_nbTris_nbAdjVertsTotal_nbBv32TreeNodes.y * sizeof(PxU32);
// Skip triMesh->mAccumulatedTrianglesRef (numVerts + 1 entries).
trimeshGeomPtr += (nbVerts_nbTris_nbAdjVertsTotal_nbBv32TreeNodes.x + 1) * sizeof(PxU32);
// Skip triMesh->mTrianglesReferences (3 entries per triangle).
trimeshGeomPtr += (nbVerts_nbTris_nbAdjVertsTotal_nbBv32TreeNodes.y * 3) * sizeof(PxU32);
// Align to 16 bytes before the SDF header.
trimeshGeomPtr = (PxU8*)(((size_t)trimeshGeomPtr + 15) & ~15);
uint4 sdfDims = *reinterpret_cast<const uint4 *>(trimeshGeomPtr);
trimeshGeomPtr += sizeof(uint4);
PxU32 numSdfs = sdfDims.x*sdfDims.y*sdfDims.z;
triangleMesh.sdfDims = sdfDims;
if (numSdfs)
{
// SDF section: grid origin/spacing, subgrid size info, texture objects and
// subgrid start offsets.
float4 meshLower_spacing = *reinterpret_cast<const float4 *>(trimeshGeomPtr);
trimeshGeomPtr += sizeof(float4);
uint4 sizeInfo = *reinterpret_cast<const uint4 *>(trimeshGeomPtr);
trimeshGeomPtr += sizeof(uint4);
triangleMesh.meshLower = PxVec3(meshLower_spacing.x, meshLower_spacing.y, meshLower_spacing.z);
triangleMesh.spacing = meshLower_spacing.w;
/*PxU32 subgridSize;
PxU32 unused0;
PxU32 unused1;
PxU32 numCoarseGridElements;*/
triangleMesh.subgridSize = sizeInfo.x;
// sizeInfo.y/.z are raw float bits stored in a uint4.
triangleMesh.subgridsMinSdfValue = reinterpret_cast<PxReal&>(sizeInfo.y);
triangleMesh.subgridsMaxSdfValue = reinterpret_cast<PxReal&>(sizeInfo.z);
//triangleMesh.numCoarseGridElements = sizeInfo.w;
//triangleMesh.sdf = reinterpret_cast<const PxReal*>(trimeshGeomPtr);
triangleMesh.mTexObject = *reinterpret_cast<const CUtexObject*>(trimeshGeomPtr);
trimeshGeomPtr += sizeof(const CUtexObject);
triangleMesh.mTexObjectSparse = *reinterpret_cast<const CUtexObject*>(trimeshGeomPtr);
trimeshGeomPtr += sizeof(const CUtexObject);
triangleMesh.subgridStarts = reinterpret_cast<const PxU32 *>(trimeshGeomPtr);
trimeshGeomPtr += sizeof(PxU32) * sizeInfo.w;
}
else
// No SDF: a zero texture object marks the SDF section as absent.
triangleMesh.mTexObject = 0;
}
}
#endif

View File

@@ -0,0 +1,121 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __TRIANGLE_TRIANGLE_CUH__
#define __TRIANGLE_TRIANGLE_CUH__
#include "foundation/PxPlane.h"
#include "foundation/PxVec3.h"
// Triangle data used by the SAT helpers below: the three vertices, the
// triangle's plane (n, d) and a reference point.
struct Triangle
{
physx::PxVec3 verts[3];
physx::PxPlane triPlane; // plane through the triangle
physx::PxVec3 centroid; //mShapeCoMInTrimeshVertexSpace (original note; presumably the query shape's CoM in trimesh vertex space - verify at fill site)
};
// Smallest signed distance of the triangle's vertices to 'plane', i.e. the
// lower end of the triangle's projection interval along the plane normal.
__device__ PxReal minProject(const PxPlane& plane, const Triangle& triangle)
{
	const PxReal dist0 = plane.n.dot(triangle.verts[0]);
	const PxReal dist1 = plane.n.dot(triangle.verts[1]);
	const PxReal dist2 = plane.n.dot(triangle.verts[2]);
	return PxMin(dist0, PxMin(dist1, dist2)) + plane.d;
}
// Projects both triangles onto 'axis' and returns the gap between the two
// projection intervals: positive means the axis separates the triangles,
// negative means they overlap along this axis.
__device__ inline PxReal testSeparatingAxis(const PxVec3& axis, const Triangle& triangle0, const Triangle& triangle1)
{
	PxReal lo0 = triangle0.verts[0].dot(axis);
	PxReal hi0 = lo0;
	PxReal lo1 = triangle1.verts[0].dot(axis);
	PxReal hi1 = lo1;

	for (PxU32 i = 1; i < 3; ++i)
	{
		const PxReal proj0 = triangle0.verts[i].dot(axis);
		const PxReal proj1 = triangle1.verts[i].dot(axis);
		lo0 = PxMin(proj0, lo0);
		hi0 = PxMax(proj0, hi0);
		lo1 = PxMin(proj1, lo1);
		hi1 = PxMax(proj1, hi1);
	}

	return PxMax(lo1 - hi0, lo0 - hi1);
}
// SAT (separating axis) test between two triangles. Returns the largest
// separation found over the tested axes; a return value > tolerance means the
// triangles are separated (the test early-outs as soon as any axis exceeds
// tolerance), while a smaller value is the deepest overlap measure over the
// tested axes.
// Axes tested: both triangle plane normals, and (when TDoCross, the default)
// the 9 cross products of the two triangles' edge directions.
template <bool TDoCross = true>
__device__ inline PxReal satIntersect(const Triangle& triangle0, const Triangle& triangle1, const PxReal tolerance)
{
PxReal sep = minProject(triangle0.triPlane, triangle1);
if (sep > tolerance)
return sep;
sep = PxMax(sep, minProject(triangle1.triPlane, triangle0));
if (sep > tolerance)
return sep;
if (TDoCross)
{
// Edge direction vectors of both triangles.
PxVec3 axes0[3];
PxVec3 axes1[3];
axes0[0] = triangle0.verts[1] - triangle0.verts[0];
axes0[1] = triangle0.verts[2] - triangle0.verts[0];
axes0[2] = triangle0.verts[2] - triangle0.verts[1];
axes1[0] = triangle1.verts[1] - triangle1.verts[0];
axes1[1] = triangle1.verts[2] - triangle1.verts[0];
axes1[2] = triangle1.verts[2] - triangle1.verts[1];
for (PxU32 i = 0; i < 3; ++i)
{
const PxVec3 axis0 = axes0[i];
for (PxU32 j = 0; j < 3; ++j)
{
const PxVec3 axis1 = axes1[j];
PxVec3 sepAxis = axis0.cross(axis1);
const PxReal magSq = sepAxis.magnitudeSquared();
// Skip near-parallel edge pairs whose cross product is degenerate.
if (magSq > 1e-5f)
{
sepAxis = sepAxis * (1.f / PxSqrt(magSq));
const PxReal tSep = testSeparatingAxis(sepAxis, triangle0, triangle1);
sep = PxMax(sep, tSep);
if (sep > tolerance)
return sep;
}
}
}
}
return sep;
}
#endif

View File

@@ -0,0 +1,683 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "foundation/PxMath.h"
#include "foundation/PxSimpleTypes.h"
#include "foundation/PxTransform.h"
#include "geometry/PxGeometry.h"
#include "PxgContactManager.h"
#include "PxsContactManagerState.h"
#include "PxgConvexConvexShape.h"
#include "PxsMaterialCore.h"
#include "PxgPersistentContactManifold.h"
#include "PxsTransformCache.h"
#include "PxContact.h"
#include "GuBV32.h"
#include "cudaNpCommon.h"
#include "contactPatchUtils.cuh"
#include "contactReduction.cuh"
#include "utils.cuh"
#include "manifold.cuh"
#include "triangleMesh.cuh"
#define PLANE_TRI_MAX_CONTACTS 6
using namespace physx;
// No-op host-side symbol - presumably referenced to force this module to be
// linked/loaded so its kernels are registered; verify against the kernel
// module loader.
extern "C" __host__ void initNarrowphaseKernels22() {}
//each block deal with one test
extern "C" __global__
__launch_bounds__(1024, 1)
void trimeshPlaneNarrowphase(
const PxReal toleranceLength,
const PxgContactManagerInput* PX_RESTRICT cmInputs,
PxsContactManagerOutput* PX_RESTRICT cmOutputs,
PxgShape* PX_RESTRICT shapes,
const PxsCachedTransform* PX_RESTRICT transformCache,
const PxReal* contactDistance,
const PxsMaterialData* PX_RESTRICT materials,
PxgPersistentContactMultiManifold* PX_RESTRICT multiManifold,
PxU8* PX_RESTRICT contactStream,
PxU8* PX_RESTRICT patchStream,
PxgPatchAndContactCounters* PX_RESTRICT patchAndContactCounters,
PxU32* PX_RESTRICT touchChangeFlags,
PxU32* PX_RESTRICT patchChangeFlags,
PxU8* PX_RESTRICT startContactPatches,
PxU8* PX_RESTRICT startContactPoints,
PxU8* PX_RESTRICT startContactForces,
PxU32 patchBytesLimit,
PxU32 contactBytesLimit,
PxU32 forceBytesLimit,
const PxReal clusterTolerance)
{
__shared__ PxU32 sContacts[(sizeof(PxVec3)/sizeof(PxU32)) * (WARP_SIZE + 1) * PLANE_TRI_MAX_CONTACTS];
PxVec3* contacts = reinterpret_cast<PxVec3*>(sContacts);
__shared__ PxReal separations[(WARP_SIZE+1)*PLANE_TRI_MAX_CONTACTS];
__shared__ PxU32 counters[WARP_SIZE];
__shared__ PxU32 retainedCount;
__shared__ bool doFullContactGen;
if (threadIdx.x == 0)
{
retainedCount = 0;
}
const PxU32 workIndex = blockIdx.x;
const PxU32 threadIndexInWarp = threadIdx.x & 31;
PxgShape trimeshShape, planeShape;
PxU32 trimeshCacheRef, planeCacheRef;
LoadShapePairWarp<PxGeometryType::eTRIANGLEMESH, PxGeometryType::ePLANE>(cmInputs, workIndex, shapes,
trimeshShape, trimeshCacheRef, planeShape, planeCacheRef);
PxsCachedTransform trimeshTransformCache = transformCache[trimeshCacheRef];
PxsCachedTransform planeTransformCache = transformCache[planeCacheRef];
const PxReal cDistance = contactDistance[trimeshCacheRef] + contactDistance[planeCacheRef];
PxTransform planeTransform = planeTransformCache.transform;
PxTransform trimeshTransform = trimeshTransformCache.transform;
// Geometries : Triangle Mesh
const PxU8 * trimeshGeomPtr = reinterpret_cast<const PxU8 *>(trimeshShape.hullOrMeshPtr);
const Gu::BV32DataPacked* nodes;
const float4 * trimeshVerts;
uint4 counts = readTriangleMesh(trimeshGeomPtr, nodes, trimeshVerts);
const PxU32 numVerts = counts.x;
const PxU32 numIterationsRequired = (numVerts + blockDim.x - 1) / blockDim.x;
PxVec3 min(PX_MAX_F32), max(-PX_MAX_F32);
if (threadIndexInWarp < nodes->mNbNodes)
{
const PxVec4 min4 = nodes->mMin[threadIndexInWarp];
const PxVec4 max4 = nodes->mMax[threadIndexInWarp];
min = PxVec3(min4.x, min4.y, min4.z);
max = PxVec3(max4.x, max4.y, max4.z);
}
minIndex(min.x, FULL_MASK, min.x); minIndex(min.y, FULL_MASK, min.y); minIndex(min.z, FULL_MASK, min.z);
maxIndex(max.x, FULL_MASK, max.x); maxIndex(max.y, FULL_MASK, max.y); maxIndex(max.z, FULL_MASK, max.z);
const float4 extents4_f = make_float4(max.x - min.x, max.y - min.y, max.z - min.z, 0.f)*0.5f;
PxReal minMargin = calculatePCMConvexMargin(extents4_f, trimeshShape.scale.scale, toleranceLength) * 0.5f;
const PxReal ratio = 0.1f;
const PxReal breakingThresholdRatio = 0.5f;
bool lostContacts = false;
PxTransform trimeshToPlane = planeTransform.transformInv(trimeshTransform);
const PxU32 warpIndex = threadIdx.x / 32;
if (warpIndex == 0)
{
const bool invalidate = invalidateManifold(trimeshToPlane, multiManifold[workIndex], minMargin, ratio);
if (!invalidate)
{
const PxReal projectBreakingThreshold = minMargin * breakingThresholdRatio;
lostContacts = refreshManifolds(
trimeshToPlane,
projectBreakingThreshold,
multiManifold + workIndex
);
}
bool fullContactGen = invalidate || lostContacts;
if (threadIdx.x == 0)
doFullContactGen = fullContactGen;
}
//const PxU32 numThreadsRequired = numIterationsRequired * blockDim.x;
PxVec3 worldNormal = planeTransform.q.getBasisVector0();
__syncthreads();
if (doFullContactGen)
{
for (PxU32 vertInd = 0; vertInd < numIterationsRequired; vertInd++)
{
PxU32 i = vertInd * blockDim.x + threadIdx.x;
bool hasContact = false;
PxVec3 worldPoint;
PxReal separation = PX_MAX_F32;
if (i < numVerts)
{
float4 point = trimeshVerts[i];
PxVec3 tp = PxLoad3(point);
PxVec3 p = vertex2Shape(tp, trimeshShape.scale.scale, trimeshShape.scale.rotation);
//v in plane space
const PxVec3 pInPlaneSpace = trimeshToPlane.transform(p);
//separation / penetration
//separation = pInPlaneSpace.x;// FSub(V3GetX(sphereCenterInPlaneSpace), radius);
if (cDistance >= pInPlaneSpace.x)
{
//get the plane normal
separation = pInPlaneSpace.x;
worldPoint = trimeshTransform.transform(p);
hasContact = true;
}
}
int mask = __ballot_sync(FULL_MASK, hasContact);
mask = contactReduce<true, true, PLANE_TRI_MAX_CONTACTS, false>(worldPoint, separation, worldNormal, mask, clusterTolerance);
counters[warpIndex] = __popc(mask);
hasContact = mask & (1 << threadIndexInWarp);
__syncthreads();
PxU32 counter = counters[threadIndexInWarp];
PxU32 contactCount = warpScan<AddOpPxU32, PxU32>(FULL_MASK, counter);
//exclusive runsum
PxU32 contactWarpOffset = contactCount - counter;
contactWarpOffset = __shfl_sync(FULL_MASK, contactWarpOffset, warpIndex);
const PxU32 offset = warpScanExclusive(mask, threadIndexInWarp);
if (hasContact)
{
const PxU32 index = offset + contactWarpOffset + retainedCount;
contacts[index] = worldPoint;
separations[index] = separation;
}
const PxU32 totalContacts = __shfl_sync(FULL_MASK, contactCount, 31) + retainedCount;
__syncthreads();
//Now do another loop with warp 0 doing reduction
if (warpIndex == 0)
{
hasContact = false;
for (PxU32 ind = 0; ind < totalContacts;)
{
PxU32 readMask = ~(__ballot_sync(FULL_MASK, hasContact));
if (!hasContact)
{
PxU32 readIndex = warpScanExclusive(readMask, threadIndexInWarp) + ind;
if (readIndex < totalContacts)
{
worldPoint = contacts[readIndex];
separation = separations[readIndex];
hasContact = true;
}
}
mask = __ballot_sync(FULL_MASK, hasContact);
ind += __popc(readMask);
mask = contactReduce<true, true, PLANE_TRI_MAX_CONTACTS, false>(worldPoint, separation, worldNormal, mask, clusterTolerance);
hasContact = mask & (1 << threadIndexInWarp);
}
__syncwarp();
retainedCount = __popc(__ballot_sync(FULL_MASK, hasContact));
if (hasContact)
{
PxU32 offset = warpScanExclusive(mask, threadIndexInWarp);
contacts[offset] = worldPoint;
separations[offset] = separation;
}
}
__syncthreads();
}
//Store to PCM...
if (warpIndex == 0)
{
PxgPersistentContactMultiManifold& manifold = multiManifold[workIndex];
if (retainedCount)
{
if (threadIdx.x == 0)
{
manifold.mNbManifolds = 1;
manifold.mNbContacts[0] = retainedCount;
manifold.mRelativeTransform = trimeshToPlane;
}
if (threadIdx.x < retainedCount)
{
PxgContact& contact = manifold.mContacts[0][threadIdx.x];
contact.pointA = trimeshTransform.transformInv(contacts[threadIdx.x]);
PxVec3 planePoint = planeTransform.transformInv(contacts[threadIdx.x]);
planePoint.x = 0.f;// separations[threadIdx.x];
contact.pointB = planePoint;
contact.normal = PxVec3(1.f, 0.f, 0.f);
contact.penetration = separations[threadIdx.x];
contact.triIndex = 0xFFFFFFFF;
}
}
else
manifold.mNbManifolds = 0;
}
}
else
{
//Load contacts out of PCM...
if (warpIndex == 0)
{
PxgPersistentContactMultiManifold& manifold = multiManifold[workIndex];
if (manifold.mNbManifolds != 0)
{
if (threadIdx.x < manifold.mNbContacts[0])
{
PxgContact& contact = manifold.mContacts[0][threadIdx.x];
contacts[threadIdx.x] = trimeshTransform.transform(contact.pointA);
separations[threadIdx.x] = contact.penetration;
}
if (threadIdx.x == 0)
retainedCount = manifold.mNbContacts[0];
}
}
__syncthreads();
}
//Retained counts stores the set of contacts we kept, which are stored in the first N
//elements of contacts and separations buffer
if (warpIndex == 0)
{
PxU32 nbContacts = retainedCount;
PxU32 contactByteOffset = 0xFFFFFFFF;
PxU32 forceAndIndiceByteOffset = 0xFFFFFFFF;
if (threadIndexInWarp == 0)
{
if (nbContacts)
{
contactByteOffset = atomicAdd(&(patchAndContactCounters->contactsBytes), sizeof(PxContact) * nbContacts);
forceAndIndiceByteOffset = atomicAdd(&(patchAndContactCounters->forceAndIndiceBytes), sizeof(PxU32) * nbContacts);
if ((contactByteOffset + sizeof(PxContact)* nbContacts) > contactBytesLimit)
{
patchAndContactCounters->setOverflowError(PxgPatchAndContactCounters::CONTACT_BUFFER_OVERFLOW);
contactByteOffset = 0xFFFFFFFF; //overflow
}
if ((forceAndIndiceByteOffset + sizeof(PxU32)* nbContacts) > forceBytesLimit)
{
patchAndContactCounters->setOverflowError(PxgPatchAndContactCounters::FORCE_BUFFER_OVERFLOW);
forceAndIndiceByteOffset = 0xFFFFFFFF; //overflow
}
}
}
contactByteOffset = __shfl_sync(FULL_MASK, contactByteOffset, 0);
PxsContactManagerOutput* output = cmOutputs + workIndex;
//write point and penetration to the contact stream
if (contactByteOffset != 0xFFFFFFFF)
{
//*((float4 *)(contactStream + contactByteOffset)) = pointPen;
//output->contactForces = reinterpret_cast<PxReal*>(startContactForces + forceAndIndiceByteOffset);
//output->contactPoints = startContactPoints + contactByteOffset;
float4* baseContactStream = (float4*)(contactStream + contactByteOffset);
if (threadIndexInWarp < nbContacts)
{
float4& outPointPen = baseContactStream[threadIndexInWarp];
outPointPen = make_float4(contacts[threadIndexInWarp].x, contacts[threadIndexInWarp].y,
contacts[threadIndexInWarp].z, separations[threadIndexInWarp]);
}
if (threadIndexInWarp == 0)
{
output->contactForces = reinterpret_cast<PxReal*>(startContactForces + forceAndIndiceByteOffset);
output->contactPoints = startContactPoints + contactByteOffset;
}
}
else
{
if (threadIndexInWarp == 0)
{
output->contactForces = NULL;
output->contactPoints = NULL;
}
nbContacts = 0;
}
if (threadIndexInWarp == 0)
{
PxU32 patchIndex = registerContactPatch(cmOutputs, patchAndContactCounters, touchChangeFlags, patchChangeFlags, startContactPatches, patchBytesLimit, workIndex, nbContacts);
if (nbContacts == 0)
(cmOutputs + workIndex)->contactPatches = 0;
insertIntoPatchStream(materials, patchStream, planeShape, trimeshShape, patchIndex, worldNormal, nbContacts);
}
}
}
//each block deal with one test
extern "C" __global__
__launch_bounds__(1024, 1)
void trimeshHeightfieldNarrowphase(
    const PxReal toleranceLength,
    const PxgContactManagerInput* PX_RESTRICT cmInputs,
    PxsContactManagerOutput* PX_RESTRICT cmOutputs,
    PxgShape* PX_RESTRICT shapes,
    const PxsCachedTransform* PX_RESTRICT transformCache,
    const PxReal* contactDistance,
    const PxsMaterialData* PX_RESTRICT materials,
    PxU8* PX_RESTRICT contactStream,
    PxU8* PX_RESTRICT patchStream,
    PxgPatchAndContactCounters* PX_RESTRICT patchAndContactCounters,
    PxU32* PX_RESTRICT touchChangeFlags,
    PxU32* PX_RESTRICT patchChangeFlags,
    PxU8* PX_RESTRICT startContactPatches,
    PxU8* PX_RESTRICT startContactPoints,
    PxU8* PX_RESTRICT startContactForces,
    PxU32 patchBytesLimit,
    PxU32 contactBytesLimit,
    PxU32 forceBytesLimit,
    const PxReal clusterBias)
{
    // One block per contact-manager pair (workIndex == blockIdx.x). Each thread
    // tests one triangle-mesh vertex against the heightfield's local x = 0
    // surface; contacts are reduced per warp, then warp 0 performs a final
    // reduction so at most PLANE_TRI_MAX_CONTACTS points survive per pair.
    //
    // The fixes below bring this kernel in line with the mesh-vs-plane kernel
    // above, which uses the identical reduction/output pattern.
    __shared__ char sContacts[sizeof(PxVec3) * (WARP_SIZE + 1) * PLANE_TRI_MAX_CONTACTS];
    PxVec3* contacts = reinterpret_cast<PxVec3*>(sContacts);
    __shared__ PxReal separations[(WARP_SIZE + 1)*PLANE_TRI_MAX_CONTACTS];
    __shared__ PxU32 counters[WARP_SIZE];
    __shared__ PxU32 retainedCount;
    if (threadIdx.x == 0)
    {
        retainedCount = 0;
    }
    __syncthreads();
    const PxU32 workIndex = blockIdx.x;
    PxgShape trimeshShape, heightfieldShape;
    PxU32 trimeshCacheRef, heightfieldCacheRef;
    LoadShapePairWarp<PxGeometryType::eTRIANGLEMESH, PxGeometryType::eHEIGHTFIELD>(cmInputs, workIndex, shapes,
        trimeshShape, trimeshCacheRef, heightfieldShape, heightfieldCacheRef);
    PxsCachedTransform trimeshTransformCache = transformCache[trimeshCacheRef];
    PxsCachedTransform heightfieldTransformCache = transformCache[heightfieldCacheRef];
    const PxReal cDistance = contactDistance[trimeshCacheRef] + contactDistance[heightfieldCacheRef];
    PxTransform trimeshTransform = trimeshTransformCache.transform;
    PxTransform heightfieldTransform = heightfieldTransformCache.transform;
    // Geometries : Triangle Mesh
    const PxU8 * trimeshGeomPtr = reinterpret_cast<const PxU8 *>(trimeshShape.hullOrMeshPtr);
    const Gu::BV32DataPacked* nodes;
    const float4 * trimeshVerts;
    uint4 counts = readTriangleMesh(trimeshGeomPtr, nodes, trimeshVerts);
    const PxU32 numVerts = counts.x;
    const PxU32 numIterationsRequired = (numVerts + blockDim.x - 1) / blockDim.x;
    const PxU32 threadIndexInWarp = threadIdx.x & 31;
    const PxU32 warpIndex = threadIdx.x / 32;
    PxTransform trimeshToHeightfield = heightfieldTransform.transformInv(trimeshTransform);
    PxVec3 worldNormal = heightfieldTransform.q.getBasisVector0();
    for (PxU32 iter = 0; iter < numIterationsRequired; iter++)
    {
        PxU32 i = iter * blockDim.x + threadIdx.x;
        bool hasContact = false;
        PxVec3 worldPoint;
        // BUGFIX: initialize so lanes without a contact feed a well-defined
        // value into contactReduce (the mesh-vs-plane kernel does the same).
        PxReal separation = PX_MAX_F32;
        if (i < numVerts)
        {
            float4 point = trimeshVerts[i];
            PxVec3 tp = PxLoad3(point);
            PxVec3 p = vertex2Shape(tp, trimeshShape.scale.scale, trimeshShape.scale.rotation);
            //v in heightfield space
            const PxVec3 pInHeightfieldSpace = trimeshToHeightfield.transform(p);
            //separation / penetration
            separation = pInHeightfieldSpace.x;
            if (cDistance >= pInHeightfieldSpace.x)
            {
                worldPoint = trimeshTransform.transform(p);
                hasContact = true;
            }
        }
        int mask = __ballot_sync(FULL_MASK, hasContact);
        mask = contactReduce<true, true, PLANE_TRI_MAX_CONTACTS>(worldPoint, separation, worldNormal, mask, clusterBias);
        counters[warpIndex] = __popc(mask);
        // BUGFIX: only lanes that survived contactReduce may write their
        // contact below; previously the pre-reduction flag was used, letting
        // culled lanes write to colliding slots.
        hasContact = mask & (1 << threadIndexInWarp);
        __syncthreads();
        PxU32 counter = counters[threadIndexInWarp];
        PxU32 contactCount = warpScan<AddOpPxU32, PxU32>(FULL_MASK, counter);
        //exclusive runsum
        PxU32 contactWarpOffset = contactCount - counter;
        contactWarpOffset = __shfl_sync(FULL_MASK, contactWarpOffset, warpIndex);
        const PxU32 offset = warpScanExclusive(mask, threadIndexInWarp);
        if (hasContact)
        {
            // Compacted position: offset within warp + warp base + contacts
            // retained from previous iterations.
            const PxU32 index = offset + contactWarpOffset + retainedCount;
            contacts[index] = worldPoint;
            separations[index] = separation;
        }
        __syncthreads();
        const PxU32 totalContacts = __shfl_sync(FULL_MASK, contactCount, 31) + retainedCount;
        //Now do another loop with warp 0 doing reduction
        if (warpIndex == 0)
        {
            hasContact = false;
            for (PxU32 readBase = 0; readBase < totalContacts;)
            {
                // Lanes that still hold a surviving contact keep it; the rest
                // refill from the next unread batch in shared memory.
                PxU32 readMask = ~(__ballot_sync(FULL_MASK, hasContact));
                if (!hasContact)
                {
                    // BUGFIX: the read index must include the loop base; the
                    // bounds check did, but the actual read re-fetched slots
                    // 0..31 on every batch.
                    PxU32 readIndex = warpScanExclusive(readMask, threadIndexInWarp) + readBase;
                    if (readIndex < totalContacts)
                    {
                        worldPoint = contacts[readIndex];
                        separation = separations[readIndex];
                        hasContact = true;
                    }
                }
                mask = __ballot_sync(FULL_MASK, hasContact);
                readBase += __popc(readMask);
                mask = contactReduce<true, true, PLANE_TRI_MAX_CONTACTS>(worldPoint, separation, worldNormal, mask, clusterBias);
                hasContact = mask & (1 << threadIndexInWarp);
            }
            // BUGFIX: ensure all shared-memory reads above complete before the
            // surviving contacts are compacted to the front of the buffer
            // (mirrors the mesh-vs-plane kernel).
            __syncwarp();
            retainedCount = __popc(__ballot_sync(FULL_MASK, hasContact));
            if (hasContact)
            {
                PxU32 offset = warpScanExclusive(mask, threadIndexInWarp);
                contacts[offset] = worldPoint;
                separations[offset] = separation;
            }
        }
        __syncthreads();
    }
    //Retained counts stores the set of contacts we kept, which are stored in the first N
    //elements of contacts and separations buffer
    if (warpIndex == 0)
    {
        PxU32 nbContacts = retainedCount;
        PxU32 contactByteOffset = 0xFFFFFFFF;
        PxU32 forceAndIndiceByteOffset = 0xFFFFFFFF;
        if (threadIndexInWarp == 0)
        {
            if (nbContacts)
            {
                contactByteOffset = atomicAdd(&(patchAndContactCounters->contactsBytes), sizeof(PxContact) * nbContacts);
                forceAndIndiceByteOffset = atomicAdd(&(patchAndContactCounters->forceAndIndiceBytes), sizeof(PxU32) * nbContacts);
                // BUGFIX: validate the whole reservation (nbContacts entries),
                // and check both limits independently -- previously only one
                // entry was tested and the force check was skipped (else-if)
                // whenever the contact check fired.
                if ((contactByteOffset + sizeof(PxContact) * nbContacts) > contactBytesLimit)
                {
                    patchAndContactCounters->setOverflowError(PxgPatchAndContactCounters::CONTACT_BUFFER_OVERFLOW);
                    contactByteOffset = 0xFFFFFFFF; //overflow
                }
                if ((forceAndIndiceByteOffset + sizeof(PxU32) * nbContacts) > forceBytesLimit)
                {
                    patchAndContactCounters->setOverflowError(PxgPatchAndContactCounters::FORCE_BUFFER_OVERFLOW);
                    forceAndIndiceByteOffset = 0xFFFFFFFF; //overflow
                }
            }
        }
        contactByteOffset = __shfl_sync(FULL_MASK, contactByteOffset, 0);
        PxsContactManagerOutput* output = cmOutputs + workIndex;
        //write point and penetration to the contact stream
        if (contactByteOffset != 0xFFFFFFFF)
        {
            float4* baseContactStream = (float4*)(contactStream + contactByteOffset);
            if (threadIndexInWarp < nbContacts)
            {
                float4& outPointPen = baseContactStream[threadIndexInWarp];
                outPointPen = make_float4(contacts[threadIndexInWarp].x, contacts[threadIndexInWarp].y,
                    contacts[threadIndexInWarp].z, separations[threadIndexInWarp]);
            }
            if (threadIndexInWarp == 0)
            {
                output->contactForces = reinterpret_cast<PxReal*>(startContactForces + forceAndIndiceByteOffset);
                output->contactPoints = startContactPoints + contactByteOffset;
            }
        }
        else
        {
            if (threadIndexInWarp == 0)
            {
                output->contactForces = NULL;
                output->contactPoints = NULL;
            }
            // BUGFIX: drop the patch as well when the contact stream
            // overflowed, as the mesh-vs-plane kernel does; otherwise a patch
            // referencing never-written contact data would be emitted.
            nbContacts = 0;
        }
        if (threadIndexInWarp == 0)
        {
            PxU32 patchIndex = registerContactPatch(cmOutputs, patchAndContactCounters, touchChangeFlags, patchChangeFlags, startContactPatches, patchBytesLimit, workIndex, nbContacts);
            if (nbContacts == 0)
                (cmOutputs + workIndex)->contactPatches = 0;
            insertIntoPatchStream(materials, patchStream, heightfieldShape, trimeshShape, patchIndex, worldNormal, nbContacts);
        }
    }
}

View File

@@ -0,0 +1,248 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __SCANWARP_CUH__
#define __SCANWARP_CUH__
/* Scan&Reduce operators */
// Addition functor for the scan/reduce helpers below.
// identity() is the value-initialized T (zero for the arithmetic types used here).
template <typename T>
struct AddOP
{
	static __device__ T identity() { return T(); }
	static __device__ T apply(T lhs, T rhs) { return lhs + rhs; }
};
// Bitwise-AND functor for the scan/reduce helpers.
// NOTE(review): identity() yields 0 rather than all-ones, which is not the
// AND identity -- presumably identity() is unused for this operator; confirm.
template <typename T>
struct AndOP
{
	static __device__ T identity() { return T(); }
	static __device__ T apply(T lhs, T rhs) { return lhs & rhs; }
};
// Convenience aliases for the common scan/reduce element types.
using AddOPu = AddOP<unsigned int>;
using AddOPf = AddOP<float>;
// Maximum functor. Despite the 'u' suffix it is templated on T;
// identity() is the value-initialized T (zero), so it only suits
// non-negative domains.
template <typename T>
struct MaxOPu
{
	static __device__ T identity() { return T(); }
	static __device__ T apply(T lhs, T rhs) { return lhs < rhs ? rhs : lhs; }
};
// Float minimum functor; applyIdx() additionally reports which of the two
// candidate slots supplied the winning value (ties keep the left operand).
struct MinOPf
{
	static __device__ float apply(float a, float b) { return fminf(a, b); }
	static __device__ float applyIdx(float a, float b, unsigned int &oindex, unsigned int aindex, unsigned int bindex)
	{
		const bool rightWins = b < a;
		oindex = rightWins ? bindex : aindex;
		return rightWins ? b : a;
	}
	static __device__ float identity() { return +FLT_MAX; }
};
// Float maximum functor; applyIdx() additionally reports which of the two
// candidate slots supplied the winning value (ties keep the left operand).
struct MaxOPf
{
	static __device__ float apply(float a, float b) { return fmaxf(a, b); }
	static __device__ float applyIdx(float a, float b, unsigned int &oindex, unsigned int aindex, unsigned int bindex)
	{
		const bool rightWins = b > a;
		oindex = rightWins ? bindex : aindex;
		return rightWins ? b : a;
	}
	static __device__ float identity() { return -FLT_MAX; }
};
/* Reduce */
template <typename T, class OP>
__device__ static inline void reduceWarp(volatile T* sdata)
{
unsigned int idx = threadIdx.x;
if ((idx & (WARP_SIZE-1)) < 16)
{
sdata[idx] = OP::apply(sdata[idx], sdata[idx + 16]);
sdata[idx] = OP::apply(sdata[idx], sdata[idx + 8]);
sdata[idx] = OP::apply(sdata[idx], sdata[idx + 4]);
sdata[idx] = OP::apply(sdata[idx], sdata[idx + 2]);
sdata[idx] = OP::apply(sdata[idx], sdata[idx + 1]);
}
}
// Variant of reduceWarp() with a caller-supplied index and no lane guard:
// every calling thread executes all five strided steps, so it reads up to
// sdata[idx + 16]. The caller is responsible for masking lanes and/or padding
// the buffer -- TODO confirm call sites guarantee this.
template <typename T, class OP>
__device__ static inline void reduceWarp(volatile T* sdata, unsigned int idx)
{
	sdata[idx] = OP::apply(sdata[idx], sdata[idx + 16]);
	sdata[idx] = OP::apply(sdata[idx], sdata[idx + 8]);
	sdata[idx] = OP::apply(sdata[idx], sdata[idx + 4]);
	sdata[idx] = OP::apply(sdata[idx], sdata[idx + 2]);
	sdata[idx] = OP::apply(sdata[idx], sdata[idx + 1]);
}
// Reduces a 16-element segment (strides 8/4/2/1) at each thread's own index.
// No lane guard: all threads execute and read up to sdata[threadIdx.x + 8],
// so the caller must size/pad the buffer accordingly -- TODO confirm.
template <typename T, class OP>
__device__ static inline void reduceHalfWarp(volatile T* sdata)
{
	unsigned int idx = threadIdx.x;
	sdata[idx] = OP::apply(sdata[idx], sdata[idx + 8]);
	sdata[idx] = OP::apply(sdata[idx], sdata[idx + 4]);
	sdata[idx] = OP::apply(sdata[idx], sdata[idx + 2]);
	sdata[idx] = OP::apply(sdata[idx], sdata[idx + 1]);
}
// Same strided warp reduction as reduceWarp(), but additionally tracks which
// original slot supplied the winning value: at each step OP::applyIdx()
// selects between the two candidates and reports the chosen index, which is
// mirrored into sindices alongside the reduced value. After the last step the
// warp's result and its source index sit in the warp's first slot.
// Relies on 'volatile' accesses for the legacy warp-synchronous model, like
// the other reducers in this header.
template <typename T, typename TIndex, class OP>
__device__ static inline void reduceWarpKeepIndex(volatile T* sdata, volatile TIndex* sindices, unsigned int idx)
{
	TIndex ia, ib, oindex;
	// stride 16
	if ((idx & (WARP_SIZE-1)) < 16)
	{
		ia = sindices[idx];
		ib = sindices[idx + 16];
		sdata[idx] = OP::applyIdx(sdata[idx], sdata[idx + 16], oindex, ia, ib);
		sindices[idx] = oindex;
	}
	// stride 8
	if ((idx & (WARP_SIZE-1)) < 8)
	{
		ia = sindices[idx];
		ib = sindices[idx + 8];
		sdata[idx] = OP::applyIdx(sdata[idx], sdata[idx + 8], oindex, ia, ib);
		sindices[idx] = oindex;
	}
	// stride 4
	if ((idx & (WARP_SIZE-1)) < 4)
	{
		ia = sindices[idx];
		ib = sindices[idx + 4];
		sdata[idx] = OP::applyIdx(sdata[idx], sdata[idx + 4], oindex, ia, ib);
		sindices[idx] = oindex;
	}
	// stride 2
	if ((idx & (WARP_SIZE-1)) < 2)
	{
		ia = sindices[idx];
		ib = sindices[idx + 2];
		sdata[idx] = OP::applyIdx(sdata[idx], sdata[idx + 2], oindex, ia, ib);
		sindices[idx] = oindex;
	}
	// stride 1 (final)
	if ((idx & (WARP_SIZE-1)) < 1)
	{
		ia = sindices[idx];
		ib = sindices[idx + 1];
		sdata[idx] = OP::applyIdx(sdata[idx], sdata[idx + 1], oindex, ia, ib);
		sindices[idx] = oindex;
	}
}
/* Scan */
// Generic warp-wide inclusive scan (Hillis-Steele style, strides 1/2/4/8/16)
// over shared memory, combining each slot with its lower neighbours.
// NOTE(review): reads sdata[scanIdx - 1 .. scanIdx - 16] unconditionally, so
// callers appear to rely on padding below each segment filled with
// OP::identity() -- confirm the buffer layout at call sites.
template <typename T, typename OP>
__device__ static inline void scanWarp(unsigned int scanIdx, volatile T* sdata)
{
	sdata[scanIdx] = OP::apply(sdata[scanIdx], sdata[scanIdx - 1]);
	sdata[scanIdx] = OP::apply(sdata[scanIdx], sdata[scanIdx - 2]);
	sdata[scanIdx] = OP::apply(sdata[scanIdx], sdata[scanIdx - 4]);
	sdata[scanIdx] = OP::apply(sdata[scanIdx], sdata[scanIdx - 8]);
	sdata[scanIdx] = OP::apply(sdata[scanIdx], sdata[scanIdx - 16]);
}
/// The number of warp scan steps
#define STEPS LOG2_WARP_SIZE
// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up
// NOTE(review): (-1 << STEPS) left-shifts a negative value, which is formally
// undefined in C++ (well-defined on NVCC's targets in practice); the portable
// spelling would be ((~0u << STEPS) & 31) << 8.
#define SHFL_MASK ((-1 << STEPS) & 31) << 8
// SHFL-based specialization for PxU32 addition: performs the inclusive warp
// scan entirely in registers (shfl.up) instead of shared-memory round trips;
// shared memory is only touched for the initial load and final store.
template <>
__device__ inline void scanWarp<PxU32, AddOP<PxU32> >(unsigned int scanIdx, volatile PxU32 * sdata)
{
	PxU32 input = sdata[scanIdx];
	// Iterate scan steps
#pragma unroll
	for (int STEP = 0; STEP < STEPS; STEP++)
	{
		// shfl.up.b32 fetches the value 2^STEP lanes below; predicate 'p' is
		// set only when that source lane lies inside the segment described by
		// SHFL_MASK, in which case the fetched value is accumulated.
		asm(
			"{"
			" .reg .u32 r0;"
			" .reg .pred p;"
			" shfl.up.b32 r0|p, %1, %2, %3;"
			" @p add.u32 r0, r0, %4;"
			" mov.u32 %0, r0;"
			"}"
			: "=r"(input) : "r"(input), "r"(1 << STEP), "r"(SHFL_MASK), "r"(input));
	}
	sdata[scanIdx] = input;
}
// SHFL-based specialization for int addition; identical to the PxU32 version.
// The accumulator is deliberately a PxU32: shfl.up.b32/add.u32 operate on raw
// 32-bit values, and two's-complement addition makes the result bit-identical
// for signed ints.
template <>
__device__ inline void scanWarp<int, AddOP<int> >(unsigned int scanIdx, volatile int* sdata)
{
	PxU32 input = sdata[scanIdx];
	// Iterate scan steps
#pragma unroll
	for (int STEP = 0; STEP < STEPS; STEP++)
	{
		// Fetch from 2^STEP lanes below and accumulate when in-segment (see
		// the PxU32 specialization above).
		asm(
			"{"
			" .reg .u32 r0;"
			" .reg .pred p;"
			" shfl.up.b32 r0|p, %1, %2, %3;"
			" @p add.u32 r0, r0, %4;"
			" mov.u32 %0, r0;"
			"}"
			: "=r"(input) : "r"(input), "r"(1 << STEP), "r"(SHFL_MASK), "r"(input));
	}
	sdata[scanIdx] = input;
}
#endif

View File

@@ -0,0 +1,772 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "PxgGeometryManager.h"
#include "PxsHeapMemoryAllocator.h"
#include "foundation/PxAssert.h"
#include "foundation/PxBasicTemplates.h"
#include "foundation/PxMemory.h"
#include "foundation/PxSimpleTypes.h"
#include "foundation/PxVec3.h"
#include "PxgCopyManager.h"
#include "PxgCudaUtils.h"
#include "PxgHeapMemAllocator.h"
#include "GuBV32.h"
#include "convex/GuConvexMesh.h"
#include "mesh/GuTriangleMesh.h"
#include "hf/GuHeightField.h"
#include "GuSDF.h"
#include "cudaNpCommon.h"
#include "cudamanager/PxCudaContextManager.h"
#include "cudamanager/PxCudaContext.h"
#include <vector_types.h>
#include <vector_functions.h>
// AD: PxgGeometryManager manages the CPU/GPU data transfers and the lifetime of collision geometries: Convex Hulls, Trimeshes, SDFs, Heightfields.
using namespace physx;
using namespace Gu;
// forward declarations of static functions
static PxU64 computeBoxHullByteSize();
static PxU64 computeHullByteSize(const ConvexHullData& hull, PxU32& numPolyVertices);
static PxU64 computeTriMeshByteSize(const TriangleMesh& triMesh);
static PxU64 computeHeightfieldByteSize(const HeightFieldData& hf);
static void layOutBoxHull(void* mem);
static void layOutHull(void* mem, const ConvexHullData& hull, PxU32 numPolyVertices);
static PxgMeshTextureData layOutTriMesh(void* mem, const TriangleMesh& triMesh, CUstream stream);
static void layOutHeightfield(void* mem, const HeightFieldData& hf);
template<typename T> static void createTextureObject(CUarray_format format, CUtexObject*& texture, CUarray& cuArray, PxU32 width, PxU32 height, PxU32 depth, const T* data, CUstream stream);
// PxgGeometryManager definitions
// Constructor: wires up the device- and pinned-memory allocators taken from
// the heap memory manager. No GPU memory is allocated until geometries are
// added and flushed via scheduleCopyHtoD().
PxgGeometryManager::PxgGeometryManager(PxgHeapMemoryAllocatorManager* heapMemoryManager):
mDeviceMemoryAllocator(heapMemoryManager->mDeviceMemoryAllocators),
mPinnedMemoryAllocator(static_cast<PxgHeapMemoryAllocator*>(heapMemoryManager->mMappedMemoryAllocators)),
mPinnedMemoryBasePtr(NULL), // staging buffer; allocated per flush in scheduleCopyHtoD()
mPinnedHostMemoryRequirements(0), // accumulated by addGeometryInternal()
mFreeGeometryIndices(mGeometryData), // free-index list over the geometry slot array
mBoxHullIdx(0xFFffFFff) // 0xFFFFFFFF == shared box hull not created yet
{
}
PxgGeometryManager::~PxgGeometryManager()
{
	// Release the shared box hull's device allocation if one was ever created.
	const bool hasBoxHull = (mBoxHullIdx != 0xFFFFFFFF);
	if (hasBoxHull)
	{
		void* boxHullMem = reinterpret_cast<void*>(getGeometryDevPtrByIndex(mBoxHullIdx));
		mDeviceMemoryAllocator->deallocate(boxHullMem);
	}
	// Return any pinned staging memory that is still outstanding.
	if (mPinnedMemoryBasePtr != NULL)
	{
		mPinnedMemoryAllocator->deallocate(mPinnedMemoryBasePtr);
		mPinnedMemoryBasePtr = NULL;
	}
}
// Common path for all geometry types: reserves a slot, allocates device
// memory, and queues a deferred host-to-device copy whose source pointer is
// filled in later by scheduleCopyHtoD(). Returns the geometry slot index.
PxU32 PxgGeometryManager::addGeometryInternal(PxU64 byteSize, const void* geomPtr, UploadGeometryType::Enum type, PxU32 numPolyVertices /*= 0*/)
{
	// Round up to a 256-byte multiple so every staged geometry starts on a
	// 256-byte boundary in both pinned and device memory.
	byteSize = (byteSize + 255) & ~255;
	mPinnedHostMemoryRequirements += byteSize;

	const PxU32 geomIdx = mFreeGeometryIndices.getFreeIndex();
	void* deviceMem = mDeviceMemoryAllocator->allocate(byteSize, PxsHeapStats::eNARROWPHASE, PX_FL);

	// Queue the deferred copy; 'source' is assigned during the flush.
	const PxU32 copyIdx = mScheduledCopies.size();
	ScheduledCopyData copy;
	copy.mHullOrTrimeshIdx = geomIdx;
	copy.mGeometryPtr = geomPtr;
	copy.mType = type;
	copy.mNumPolyVertices = numPolyVertices;
	PxgCopyManager::CopyDesc desc;
	desc.dest = (size_t)deviceMem;
	desc.bytes = (size_t)byteSize;
	copy.mCopyDesc = desc;
	mScheduledCopies.pushBack(copy);

	// Record the slot: device pointer plus a back-reference to the pending
	// copy so removeGeometry() can cancel it before the flush.
	HullOrMeshData record;
	record.mDeviceMemPointer = deviceMem;
	record.mCopyDescIndex = copyIdx;
	PX_ASSERT(geomIdx < mGeometryData.size());
	mGeometryData[geomIdx] = record;
	return geomIdx;
}
void PxgGeometryManager::addBoxHull()
{
if (mBoxHullIdx != 0xFFffFFff)
return;
PxU64 byteSize = computeBoxHullByteSize();
mBoxHullIdx = addGeometryInternal(byteSize, NULL, UploadGeometryType::eBOXHULL);
}
// Schedules a convex hull for upload and returns its geometry slot index.
PxU32 PxgGeometryManager::addHull(const ConvexHullData& hull)
{
	// computeHullByteSize() also counts the polygon vertex-reference bytes,
	// which the layout pass needs again later.
	PxU32 numPolyVertices = 0;
	const PxU64 byteSize = computeHullByteSize(hull, numPolyVertices);
	return addGeometryInternal(byteSize, &hull, UploadGeometryType::eCONVEXHULL, numPolyVertices);
}
// Releases all GPU-side resources of geometry slot 'idx' and returns the slot
// to the free list. Works whether or not the geometry's upload copy has been
// flushed yet.
void PxgGeometryManager::removeGeometry(PxU32 idx)
{
	PX_ASSERT(idx < mGeometryData.size());
	HullOrMeshData geometryToRemove = mGeometryData[idx];
	PxU32 scheduledCopyIndex = geometryToRemove.mCopyDescIndex;
	if (scheduledCopyIndex != HullOrMeshData::INVALID_COPY_DESC_INDEX)
	{
		// The upload has not been flushed yet: cancel it with a swap-remove.
		PxU32 lastScheduledCopyHullOrTrimeshIndex = mScheduledCopies.back().mHullOrTrimeshIdx;
		if (lastScheduledCopyHullOrTrimeshIndex != idx)
		{
			// The last scheduled copy moves into the vacated position, so the
			// geometry record that owns it must be repointed at the new index.
			mScheduledCopies.replaceWithLast(scheduledCopyIndex);
			PX_ASSERT(lastScheduledCopyHullOrTrimeshIndex < mGeometryData.size());
			mGeometryData[lastScheduledCopyHullOrTrimeshIndex].mCopyDescIndex = scheduledCopyIndex;
		}
		else
		{
			// The geometry being removed owns the last scheduled copy itself.
			mScheduledCopies.popBack();
		}
	}
	// Destroy any CUDA texture objects/arrays that were created for this mesh
	// during layout (see layOutTriMesh), keyed by its device memory pointer.
	const PxPair<void*const, PxgMeshTextureData>* pair = mMeshToTextureMap.find(geometryToRemove.mDeviceMemPointer);
	if (pair)
	{
		cuArrayDestroy(pair->second.cuArray);
		cuTexObjectDestroy(pair->second.cuTexRef);
		if (pair->second.cuArraySubgrids)
		{
			cuArrayDestroy(pair->second.cuArraySubgrids);
			cuTexObjectDestroy(pair->second.cuTexRefSubgrids);
		}
		mMeshToTextureMap.erase(geometryToRemove.mDeviceMemPointer);
	}
	mDeviceMemoryAllocator->deallocate(geometryToRemove.mDeviceMemPointer);
	mFreeGeometryIndices.setFreeIndex(idx);
}
// Lays out every geometry scheduled since the last flush into a single pinned
// staging buffer and pushes the corresponding deferred host-to-device copies
// onto the copy manager.
void PxgGeometryManager::scheduleCopyHtoD(PxgCopyManager& copyMan, PxCudaContext& cudaContext, CUstream stream)
{
	// allocate the proper amount of pinned memory
	mPinnedMemoryBasePtr = mPinnedMemoryAllocator->allocate(mPinnedHostMemoryRequirements, PxsHeapStats::eNARROWPHASE, PX_FL);
	PxU8* basePtr = reinterpret_cast<PxU8*>(mPinnedMemoryBasePtr);
	for (PxArray<ScheduledCopyData>::Iterator it = mScheduledCopies.begin(), end = mScheduledCopies.end(); it != end; ++it)
	{
		// Carve this geometry's slice out of the staging buffer. Each slice is
		// a 256-byte multiple (rounded up in addGeometryInternal), so size and
		// address both stay 256-byte aligned.
		PxU64 byteSize = it->mCopyDesc.bytes;
		PxU8* mem = basePtr;
		basePtr += byteSize;
		PX_ASSERT((byteSize & 255) == 0);
		PX_ASSERT(((size_t)mem & 255) == 0);
		// layout the geometry in pinned memory depending on type
		switch (it->mType)
		{
		case UploadGeometryType::eTRIANGLEMESH:
		{
			// layOutTriMesh may create CUDA texture objects (for SDF data);
			// remember them so removeGeometry() can destroy them later.
			PxgMeshTextureData texData = layOutTriMesh(mem, *reinterpret_cast<const TriangleMesh*>(it->mGeometryPtr), stream);
			if (texData.cuArray)
			{
				PX_ASSERT(it->mHullOrTrimeshIdx < mGeometryData.size());
				mMeshToTextureMap.insert(mGeometryData[it->mHullOrTrimeshIdx].mDeviceMemPointer, texData);
			}
		}
		break;
		case UploadGeometryType::eCONVEXHULL:
			layOutHull(mem, *reinterpret_cast<const ConvexHullData*>(it->mGeometryPtr), it->mNumPolyVertices);
			break;
		case UploadGeometryType::eHEIGHTFIELD:
			layOutHeightfield(mem, *reinterpret_cast<const HeightFieldData*>(it->mGeometryPtr));
			break;
		case UploadGeometryType::eBOXHULL:
			layOutBoxHull(mem);
			break;
		default:
			PX_ALWAYS_ASSERT();
		}
		// set the source pointer in the copy descriptor.
		it->mCopyDesc.source = reinterpret_cast<size_t>(getMappedDevicePtr(&cudaContext, mem));
		// schedule the copies.
		PX_ASSERT(it->mHullOrTrimeshIdx < mGeometryData.size());
		// The copy is now owned by the copy manager; clear the back-reference
		// so removeGeometry() no longer tries to cancel it.
		mGeometryData[it->mHullOrTrimeshIdx].mCopyDescIndex = HullOrMeshData::INVALID_COPY_DESC_INDEX;
		copyMan.pushDeferredHtoD(it->mCopyDesc);
	}
	mScheduledCopies.forceSize_Unsafe(0);
	mPinnedHostMemoryRequirements = 0;
}
// Called once the flushed host-to-device copies have completed.
void PxgGeometryManager::resetAfterMemcpyCompleted()
{
	// Geometry uploads are rare and differently sized each time, so the pinned
	// staging buffer is released outright rather than cached, keeping pinned
	// memory usage under control.
	if (mPinnedMemoryBasePtr != NULL)
	{
		mPinnedMemoryAllocator->deallocate(mPinnedMemoryBasePtr);
		mPinnedMemoryBasePtr = NULL;
	}
	// Slots freed by removeGeometry() become reusable from this point on.
	mFreeGeometryIndices.releaseFreeIndices();
}
// Translates a geometry slot index into its device-side base address.
CUdeviceptr PxgGeometryManager::getGeometryDevPtrByIndex(PxU32 idx) const
{
	PX_ASSERT(idx < mGeometryData.size());
	void* deviceMem = mGeometryData[idx].mDeviceMemPointer;
	return reinterpret_cast<CUdeviceptr>(deviceMem);
}
// Device address of the shared box hull; addBoxHull() must have run first.
CUdeviceptr PxgGeometryManager::getBoxHullDevPtr() const
{
	PX_ASSERT(mBoxHullIdx != 0xFFFFFFFF);
	PX_ASSERT(mBoxHullIdx < mGeometryData.size());
	return reinterpret_cast<CUdeviceptr>(mGeometryData[mBoxHullIdx].mDeviceMemPointer);
}
// Schedules a triangle mesh for upload and returns its geometry slot index.
PxU32 PxgGeometryManager::addTriMesh(const TriangleMesh& triMesh)
{
	// Sizing accounts for the mesh arrays, the BV32 tree and any SDF data.
	return addGeometryInternal(computeTriMeshByteSize(triMesh), &triMesh, UploadGeometryType::eTRIANGLEMESH);
}
// Schedules a heightfield for upload and returns its geometry slot index.
PxU32 PxgGeometryManager::addHeightfield(const HeightFieldData& hf)
{
	return addGeometryInternal(computeHeightfieldByteSize(hf), &hf, UploadGeometryType::eHEIGHTFIELD);
}
// static functions
// Device-side footprint of the canonical box hull; mirrors the per-hull
// layout of computeHullByteSize() with the box's fixed counts: 8 vertices,
// 6 polygons, 12 edges, 24 polygon vertex references.
static PxU64 computeBoxHullByteSize()
{
	PxU64 total = 0;
	total += sizeof(float4);         // center of mass
	total += sizeof(PxU32);          // NbEdgesNbHullVerticesNbPolygons
	total += 3 * sizeof(PxU32);      // pad
	total += sizeof(float4);         // extents
	total += sizeof(float4) * 8;     // vertices
	total += sizeof(float4) * 6;     // planes
	total += sizeof(PxU32) * 6;      // vRef8NbVertsMinIndex0
	total += sizeof(PxU16) * 2 * 12; // mVerticesByEdges16
	total += sizeof(PxU8) * 2 * 12;  // mFacesByEdges8
	total += sizeof(PxU8) * 3 * 8;   // mFacesByVertices8
	total += sizeof(PxU8) * 24;      // vertexData8
	return total;
}
// Device-side footprint of a convex hull. Also returns (via out-parameter)
// the total number of polygon vertex references, which the layout pass reuses.
static PxU64 computeHullByteSize(const ConvexHullData& hull, PxU32& numPolyVertices)
{
	numPolyVertices = 0;
	for (PxU32 polyIdx = 0; polyIdx < hull.mNbPolygons; ++polyIdx)
		numPolyVertices += hull.mPolygons[polyIdx].mNbVerts;

	PxU64 total = 0;
	total += sizeof(float4);                          // center of mass
	total += sizeof(PxU32);                           // NbEdgesNbHullVerticesNbPolygons
	total += 3 * sizeof(PxU32);                       // pad
	total += sizeof(float4);                          // extents
	total += sizeof(float4) * hull.mNbHullVertices;   // vertices
	total += sizeof(float4) * hull.mNbPolygons;       // planes
	total += sizeof(PxU32) * hull.mNbPolygons;        // vRef8NbVertsMinIndex0
	total += sizeof(PxU16) * 2 * hull.mNbEdges;       // mVerticesByEdges16
	total += sizeof(PxU8) * 2 * hull.mNbEdges;        // mFacesByEdges8
	total += sizeof(PxU8) * 3 * hull.mNbHullVertices; // mFacesByVertices8
	total += sizeof(PxU8) * numPolyVertices;          // vertexData8
	return total;
}
// Computes the number of device bytes needed for a triangle mesh, matching the
// layout written by layOutTriMesh(): header, vertices, indices, adjacency, remap
// tables, the optional SDF block, and the BV32 midphase tree.
static PxU64 computeTriMeshByteSize(const TriangleMesh& triMesh)
{
	const PxU32 numTris = triMesh.getNbTrianglesFast();
	const PxU32 numVerts = triMesh.getNbVerticesFast();
	PxU64 meshDataSize =
		sizeof(uint4) // (nbVerts, nbTris, meshAdjVerticiesTotal, nbBv32TreeNodes)
		+ sizeof(float4) * numVerts // meshVerts
		+ sizeof(uint4) * numTris // meshTriIndices
		+ sizeof(uint4) * numTris // meshTriAdjacencies
		+ sizeof(PxU32) * numTris // gpu to cpu face remap table
		+ sizeof(PxU32) * numTris * 3 // vertex to triangle remap table
		+ sizeof(PxU32) * (numVerts + 1) // vertex to triangle offset table
		;
	// Make sure memory is always 16-byte aligned
	meshDataSize = (meshDataSize + 15) & ~15;
	meshDataSize += sizeof(uint4); // (sdfDimX, sdfDimY, sdfDimZ, 0)
	const SDF& sdfData = triMesh.getSdfDataFast();
	const PxU32 numSdfs = sdfData.mNumSdfs;
	if (numSdfs > 0)
	{
		// Optional SDF block: bounds/spacing, subgrid header, two texture-object
		// handles (dense SDF + sparse subgrids) and the subgrid start-slot table.
		meshDataSize +=
			+ sizeof(float4) // (meshLower.x, meshLower.y, meshLower.z, spacing)
			+ sizeof(uint4)
			+ sizeof(CUtexObject) // SDF texture object reference
			+ sizeof(CUtexObject)
			+ sizeof(PxU32) * sdfData.mNumStartSlots;
	}
	// Make sure memory is always 16-byte aligned
	meshDataSize = (meshDataSize + 15) & ~15;
	//ML: don't know whether we need to have local bound
	const PxU64 bv32Size = triMesh.mGRB_BV32Tree->mNbPackedNodes * sizeof(BV32DataPacked);
	return meshDataSize + bv32Size;
}
static PxU64 computeHeightfieldByteSize(const HeightFieldData& hf)
{
	/* Height field height data is 16 bit signed integers, followed by triangle materials.
	Each sample is 32 bits wide arranged as follows:
	1) First there is a 16 bit height value.
	2) Next, two one byte material indices, with the high bit of each byte reserved for special use.
	(so the material index is only 7 bits).
	The high bit of material0 is the tess-flag.
	The high bit of material1 is reserved for future use.
	*/
	// Layout written by layOutHeightfield(): rows, columns, samples, flags.
	const PxU64 headerBytes = sizeof(PxU32)							// rows
							+ sizeof(PxU32);						// columns
	const PxU64 sampleBytes = sizeof(PxU32) * hf.columns * hf.rows;	// one 32-bit sample per cell
	return headerBytes + sampleBytes + sizeof(PxU16);				// + PxHeightFieldFlags
}
// Writes a hard-coded unit-cube convex hull into 'mem', using the same memory
// layout as layOutHull(). The destination must be at least
// computeBoxHullByteSize() bytes. The box has 8 vertices, 6 faces, 12 edges.
static void layOutBoxHull(void* mem)
{
	// Corners of the [-1,1]^3 cube (w component unused).
	const float4 vertices[8] = { make_float4(-1.f, 1.f, -1.f, 0.f),
		make_float4(1.f, 1.f, -1.f, 0.f),
		make_float4(1.f, -1.f, -1.f, 0.f),
		make_float4(-1.f, -1.f, -1.f, 0.f),
		make_float4(-1.f, -1.f, 1.f, 0.f),
		make_float4(-1.f, 1.f, 1.f, 0.f),
		make_float4(1.f, -1.f, 1.f, 0.f),
		make_float4(1.f, 1.f, 1.f, 0.f)
	};
	// Face planes (nx, ny, nz, d), each at unit distance from the origin.
	const float4 planes[6] = {make_float4(0.f, 0.0f, -1.0f, -1.0f),
		make_float4(-1.0f, 0.0f, 0.0f, -1.0f),
		make_float4(0.0f, -1.0f, 0.0f, -1.0f),
		make_float4(0.0f, 1.0f, 0.0f, -1.0f),
		make_float4(0.0f, 0.0f, 1.0f, -1.0f),
		make_float4(1.0f, 0.0f, 0.0f, -1.0f)};
	// Per-face packed word (vRef8NbVertsMinIndex0 in layOutHull terms):
	// start offset into polyIndices, vertex count, and a third packed value.
	const PxU32 polyData[6] = {merge(0, 4, 4),
		merge(4, 4, 1),
		merge(8, 4, 0),
		merge(12, 4, 2),
		merge(16, 4, 0),
		merge(20, 4, 0)};
	// Vertex index pairs for each of the 12 edges (mVerticesByEdges16).
	const PxU16 vertsByEdges[24] = { 0, 1, 3, 0, 5, 0, 1, 2, 7, 1, 2, 3, 2, 6, 3, 4, 4, 5, 6, 4, 5, 7, 6, 7 };
	// Face index pairs adjacent to each of the 12 edges (mFacesByEdges8).
	const PxU8 facesByEdges[24] = {
		0, 3,
		0, 1,
		1, 3,
		0, 5,
		3, 5,
		0, 2,
		2, 5,
		1, 2,
		1, 4,
		2, 4,
		3, 4,
		4, 5
	};
	// The three faces incident to each of the 8 vertices (mFacesByVertices8).
	const PxU8 facesByVerts[24] = {
		0, 1, 3,
		0, 3, 5,
		0, 2, 5,
		0, 1, 2,
		1, 2, 4,
		1, 3, 4,
		2, 4, 5,
		3, 4, 5
	};
	// Vertex indices per polygon, 4 per face (vertexData8).
	const PxU8 polyIndices[24] = {0, 1, 2, 3,
		4, 5, 0, 3,
		4, 3, 2, 6,
		7, 1, 0, 5,
		7, 5, 4, 6,
		7, 6, 2, 1
	};
	// Serialize in the exact order computeBoxHullByteSize() accounts for.
	PxU8* m = reinterpret_cast<PxU8*>(mem);
	*((float4*)m) = make_float4(0.0f, 0.0f, 0.0f, 0.0f);//center of mass
	m += sizeof(float4);
	// Packed counts: 12 edges, 8 vertices, 6 polygons.
	*((PxU32*)m) = merge(12, merge((PxU8) 8, (PxU8) 6));
	m += 4 * sizeof(PxU32); // counts word + 3 words of padding
	*((float4*)m) = make_float4(1.0f, //half-extents and the insphere radius
		1.0f,
		1.0f,
		1.0f);
	m += sizeof(float4);
	PxMemCopy(m, vertices, sizeof(float4) * 8);
	m += sizeof(float4) * 8;
	PxMemCopy(m, planes, sizeof(float4) * 6);
	m += sizeof(float4) * 6;
	PxMemCopy(m, polyData, sizeof(PxU32) * 6);
	m += sizeof(PxU32) * 6;
	PxMemCopy(m, vertsByEdges, sizeof(PxU16) * 2 * 12);
	m += sizeof(PxU16) * 2 * 12;
	PxMemCopy(m, facesByEdges, sizeof(PxU8) * 2 * 12);
	m += sizeof(PxU8) * 2 * 12;
	PxMemCopy(m, facesByVerts, sizeof(PxU8) * 3 * 8);
	m += sizeof(PxU8) * 3 * 8;
	PxMemCopy(m, polyIndices, sizeof(PxU8) * 24);
}
// Serializes a convex hull into 'mem' using the GPU-friendly layout whose size
// is computed by computeHullByteSize(). numPolyVertices must be the value that
// computeHullByteSize() returned for the same hull.
static void layOutHull(void* mem, const ConvexHullData& hull, PxU32 numPolyVertices)
{
	PxU8* m = (PxU8*) mem;
	*((float4*)m) = make_float4(hull.mCenterOfMass.x, hull.mCenterOfMass.y, hull.mCenterOfMass.z, 0.f);
	m +=sizeof(float4);
	// Edge/vertex/polygon counts packed into a single word.
	*((PxU32*)m) = merge(hull.mNbEdges, merge(hull.mNbHullVertices, hull.mNbPolygons));
	m += 4 * sizeof(PxU32); // counts word + 3 words of padding
	// Internal extents plus internal radius in w.
	*((float4*)m) = make_float4(hull.mInternal.mInternalExtents.x,
		hull.mInternal.mInternalExtents.y,
		hull.mInternal.mInternalExtents.z,
		hull.mInternal.mInternalRadius);
	m += sizeof(float4);
	// Widen PxVec3 vertices to float4 (w = 0) for aligned GPU access.
	const PxVec3* verts = hull.getHullVertices();
	for (const PxVec3* end = verts + hull.mNbHullVertices; verts < end; ++verts)
	{
		*((float4*)m) = make_float4(verts->x, verts->y, verts->z, 0);
		m += sizeof(float4);
	}
	// Planes and the packed per-polygon words are interleaved from the same
	// source array but land in two separate destination regions.
	const HullPolygonData* polys = hull.mPolygons;
	float4* planes = (float4*)m;
	PxU32* vRef8NbVertsMinIndex = (PxU32*) (m + sizeof(float4) * hull.mNbPolygons);
	for (const HullPolygonData* end = polys + hull.mNbPolygons; polys < end; ++polys)
	{
		*(planes++) = make_float4(polys->mPlane.n.x, polys->mPlane.n.y, polys->mPlane.n.z, polys->mPlane.d);
		*(vRef8NbVertsMinIndex++) = merge(polys->mVRef8, polys->mNbVerts, polys->mMinIndex);
	}
	m += (sizeof(float4) + sizeof(PxU32)) * hull.mNbPolygons;
	PxMemCopy(m, hull.getVerticesByEdges16(), sizeof(PxU16) * 2*hull.mNbEdges);
	m += sizeof(PxU16) * 2*hull.mNbEdges;
	PxMemCopy(m, hull.getFacesByEdges8(), sizeof(PxU8) * 2*hull.mNbEdges);
	m += sizeof(PxU8) * 2*hull.mNbEdges;
	PxMemCopy(m, hull.getFacesByVertices8(), sizeof(PxU8) * 3*hull.mNbHullVertices);
	m += sizeof(PxU8) * 3*hull.mNbHullVertices;
	PxMemCopy(m, hull.getVertexData8(), sizeof(PxU8) * numPolyVertices);
	//m += sizeof(PxU8) * numPolyVertices;
}
// Serializes a triangle mesh into 'mem' (sized by computeTriMeshByteSize()):
// header, BV32 midphase nodes, vertices, indices, adjacency, remap tables and,
// if present, the SDF block. SDF values are uploaded into CUDA 3D arrays with
// texture objects created over them (asynchronously on 'stream'); the returned
// PxgMeshTextureData carries those arrays/handles so the caller can free them.
static PxgMeshTextureData layOutTriMesh(void* mem, const TriangleMesh& triMesh, CUstream stream)
{
	const PxU32 numTris = triMesh.getNbTrianglesFast();
	const PxU32 numVerts = triMesh.getNbVerticesFast();
	BV32Tree* bv32Tree = triMesh.mGRB_BV32Tree;
	PxU8* m = (PxU8*) mem;
	// Header: (nbVerts, nbTris, 0, nbBv32TreeNodes).
	*((uint4*)m) = make_uint4(triMesh.getNbVerticesFast(), triMesh.getNbTrianglesFast(), 0, bv32Tree->mNbPackedNodes);
	m += sizeof(uint4);
	//Midphase
	PxMemCopy(m, bv32Tree->mPackedNodes, sizeof(BV32DataPacked) * bv32Tree->mNbPackedNodes);
	m += sizeof(BV32DataPacked) * bv32Tree->mNbPackedNodes;
	PX_ASSERT(bv32Tree->mNbPackedNodes > 0);
	// Core: Mesh data
	//construct gpu friendly verts: widen PxVec3 to float4 (w = 0)
	float4* grbVerts = reinterpret_cast<float4*>(m);
	const PxVec3* verts = triMesh.getVerticesFast();
	for (PxU32 i = 0; i < numVerts; ++i)
	{
		grbVerts[i].x = verts[i].x;
		grbVerts[i].y = verts[i].y;
		grbVerts[i].z = verts[i].z;
		grbVerts[i].w = 0.f;
	}
	m += numVerts * sizeof(float4);
	uint4* grbTriInd = reinterpret_cast<uint4*>(m);
	//copy triangle indices, widening to uint4 (w = 0) for both index widths
	if (triMesh.has16BitIndices())
	{
		const PxU16* triInds = reinterpret_cast<PxU16*>(triMesh.mGRB_triIndices);
		for (PxU32 i = 0; i < numTris; ++i)
		{
			grbTriInd[i].x = triInds[3 * i + 0];
			grbTriInd[i].y = triInds[3 * i + 1];
			grbTriInd[i].z = triInds[3 * i + 2];
			grbTriInd[i].w = 0;
		}
	}
	else
	{
		const PxU32* triInds = reinterpret_cast<PxU32*>(triMesh.mGRB_triIndices);
		for (PxU32 i = 0; i < numTris; ++i)
		{
			grbTriInd[i].x = triInds[3 * i + 0];
			grbTriInd[i].y = triInds[3 * i + 1];
			grbTriInd[i].z = triInds[3 * i + 2];
			grbTriInd[i].w = 0;
		}
	}
	//PxMemCopy(m, triMesh->mGRB_triIndices, numTris * sizeof(uint4));
	m += numTris * sizeof(uint4);
	//Adjacencies data for contact gen
	PxMemCopy(m, triMesh.mGRB_triAdjacencies, numTris * sizeof(uint4));
	m += numTris * sizeof(uint4);
	//GPU to CPU face remap table
	PxMemCopy(m, triMesh.mGRB_faceRemap, numTris * sizeof(PxU32));
	m += numTris * sizeof(PxU32);
	//vertex to triangle offset table (numVerts entries + total count sentinel)
	PxMemCopy(m, triMesh.mAccumulatedTrianglesRef, numVerts * sizeof(PxU32));
	reinterpret_cast<PxU32*>(m)[numVerts] = triMesh.mNbTrianglesReferences;
	m += (numVerts + 1) * sizeof(PxU32);
	//vertex to triangle remap table
	PxMemCopy(m, triMesh.mTrianglesReferences, (numTris*3) * sizeof(PxU32));
	m += (numTris * 3) * sizeof(PxU32);
	// Make sure m is 16-byte aligned
	m = (PxU8*)(((size_t)m + 15) & ~15);
	// Copy sdf values
	const SDF& sdfData = triMesh.getSdfDataFast();
	*((uint4*)m) = make_uint4(sdfData.mDims.x, sdfData.mDims.y, sdfData.mDims.z, triMesh.getPreferSDFProjection() ? 1 : 0);
	m += sizeof(uint4);
	const PxU32 numSdfs = sdfData.mNumSdfs;
	PxgMeshTextureData result;
	PxMemZero(&result, sizeof(result));
	if (numSdfs)
	{
		const PxVec3 meshLower = sdfData.mMeshLower;
		const PxReal spacing = sdfData.mSpacing;
		*((float4*)m) = make_float4(meshLower.x, meshLower.y, meshLower.z, spacing);
		m += sizeof(float4);
		// Subgrid header: size, min/max SDF values bit-cast to PxU32, slot count.
		*((uint4*)m) = make_uint4(sdfData.mSubgridSize, reinterpret_cast<const PxU32&>(sdfData.mSubgridsMinSdfValue), reinterpret_cast<const PxU32&>(sdfData.mSubgridsMaxSdfValue), sdfData.mNumStartSlots);
		m += sizeof(uint4);
		// Texture-object handles live inside the serialized mesh itself; the
		// createTextureObject calls below write into these slots.
		CUtexObject* pTexture = (CUtexObject*)m;
		m += sizeof(CUtexObject);
		CUtexObject* pTextureSubgrids = (CUtexObject*)m;
		m += sizeof(CUtexObject);
		if (sdfData.mSubgridSize > 0)
			PxMemCopy(m, sdfData.mSubgridStartSlots, sdfData.mNumStartSlots * sizeof(PxU32));
		m += sizeof(PxU32) * sdfData.mNumStartSlots;
		if (sdfData.mSubgridSize == 0)
		{
			// Dense SDF: a single 3D float texture over the full grid.
			CUarray cuArray;
			createTextureObject<PxReal>(CU_AD_FORMAT_FLOAT, pTexture, cuArray, sdfData.mDims.x, sdfData.mDims.y, sdfData.mDims.z, sdfData.mSdf, stream);
			result.cuArray = cuArray;
			result.cuTexRef = *pTexture;
			result.cuArraySubgrids = 0;
			result.cuTexRefSubgrids = 0;
		}
		else
		{
			// Sparse SDF: a coarse background texture plus a subgrid texture whose
			// pixel format depends on mBytesPerSparsePixel.
			PxU32 x = sdfData.mDims.x / sdfData.mSubgridSize;
			PxU32 y = sdfData.mDims.y / sdfData.mSubgridSize;
			PxU32 z = sdfData.mDims.z / sdfData.mSubgridSize;
			CUarray cuArray;
			createTextureObject<PxReal>(CU_AD_FORMAT_FLOAT, pTexture, cuArray, x + 1, y + 1, z + 1, sdfData.mSdf, stream);
			CUarray cuArraySubgrids = 0;
			switch(sdfData.mBytesPerSparsePixel)
			{
			case 1:
				createTextureObject<PxU8>(CU_AD_FORMAT_UNSIGNED_INT8, pTextureSubgrids, cuArraySubgrids,
					sdfData.mSdfSubgrids3DTexBlockDim.x * (sdfData.mSubgridSize + 1),
					sdfData.mSdfSubgrids3DTexBlockDim.y * (sdfData.mSubgridSize + 1),
					sdfData.mSdfSubgrids3DTexBlockDim.z * (sdfData.mSubgridSize + 1), sdfData.mSubgridSdf, stream);
				break;
			case 2:
				createTextureObject<PxU16>(CU_AD_FORMAT_UNSIGNED_INT16, pTextureSubgrids, cuArraySubgrids,
					sdfData.mSdfSubgrids3DTexBlockDim.x * (sdfData.mSubgridSize + 1),
					sdfData.mSdfSubgrids3DTexBlockDim.y * (sdfData.mSubgridSize + 1),
					sdfData.mSdfSubgrids3DTexBlockDim.z * (sdfData.mSubgridSize + 1), reinterpret_cast<PxU16*>(sdfData.mSubgridSdf), stream);
				break;
			case 4:
				createTextureObject<PxReal>(CU_AD_FORMAT_FLOAT, pTextureSubgrids, cuArraySubgrids,
					sdfData.mSdfSubgrids3DTexBlockDim.x * (sdfData.mSubgridSize + 1),
					sdfData.mSdfSubgrids3DTexBlockDim.y * (sdfData.mSubgridSize + 1),
					sdfData.mSdfSubgrids3DTexBlockDim.z * (sdfData.mSubgridSize + 1), reinterpret_cast<PxReal*>(sdfData.mSubgridSdf), stream);
				break;
			}
			result.cuArray = cuArray;
			result.cuTexRef = *pTexture;
			if (cuArraySubgrids)
			{
				result.cuArraySubgrids = cuArraySubgrids;
				result.cuTexRefSubgrids = *pTextureSubgrids;
			}
			else
			{
				result.cuArraySubgrids = 0;
				result.cuTexRefSubgrids = 0;
			}
		}
	}
	return result;
}
// Serializes a heightfield into 'mem' (sized by computeHeightfieldByteSize()):
// row count, column count, the raw 32-bit samples, then the flags.
static void layOutHeightfield(void* mem, const HeightFieldData& hf)
{
	PxU8* m = (PxU8*) mem;
	PxU32 nbRows = hf.rows;
	PxU32 nbCols = hf.columns;
	*((PxU32* ) m) = nbRows;
	m += sizeof(PxU32);
	*((PxU32* ) m) = nbCols;
	m += sizeof(PxU32);
	// Samples are copied verbatim as 32-bit words (height + two material bytes).
	PX_ASSERT(sizeof(PxU32) == sizeof(PxHeightFieldSample));
	PxMemCopy(m, hf.samples, sizeof(PxU32) * nbRows * nbCols);
	m += sizeof(PxU32) * nbRows * nbCols;
	*((PxU16*)m) = hf.flags;
}
// Creates a 1-channel 3D CUDA array of the given format and dimensions, uploads
// 'data' into it asynchronously on 'stream', and creates a clamped, linearly
// filtered texture object over it.
// 'texture' points at the destination slot for the texture handle (it lives
// inside the mesh's serialized device layout); 'cuArray' receives the backing
// array so the caller can release it later. For degenerate dimensions both
// outputs are set to 0 and nothing is created.
template<typename T>
static void createTextureObject(CUarray_format format, CUtexObject*& texture, CUarray& cuArray, PxU32 width, PxU32 height, PxU32 depth, const T* data, CUstream stream)
{
	if (width == 0 || height == 0 || depth == 0)
	{
		texture = 0;
		cuArray = 0;
		return;
	}
	CUDA_ARRAY3D_DESCRIPTOR_st arrDesc;
	arrDesc.Format = format;
	arrDesc.NumChannels = 1;	// single scalar channel
	arrDesc.Width = width;
	arrDesc.Height = height;
	arrDesc.Depth = depth;
	arrDesc.Flags = 0;
	CUresult r = cuArray3DCreate(&cuArray, &arrDesc);
	PX_UNUSED(r);
	PX_ASSERT(r == CUDA_SUCCESS);
	// Upload host data into the 3D array; source is tightly packed, so the
	// pitch equals the row byte count.
	CUDA_MEMCPY3D copyParam;
	PxMemZero(&copyParam, sizeof(copyParam));
	copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
	copyParam.dstArray = cuArray;
	copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
	copyParam.srcHost = data;
	copyParam.srcPitch = width * sizeof(T);
	copyParam.srcHeight = height;
	copyParam.WidthInBytes = copyParam.srcPitch;
	copyParam.Height = height;
	copyParam.Depth = depth;
	// Check the copy result like the other CUDA calls in this function
	// (previously the only unchecked call here).
	r = cuMemcpy3DAsync(&copyParam, stream);
	PX_ASSERT(r == CUDA_SUCCESS);
	CUDA_RESOURCE_DESC resDesc;
	PxMemZero(&resDesc, sizeof(resDesc));
	resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
	resDesc.res.array.hArray = cuArray;
	CUDA_TEXTURE_DESC texDesc;
	PxMemZero(&texDesc, sizeof(texDesc));
	texDesc.addressMode[0] = CU_TR_ADDRESS_MODE_CLAMP;
	texDesc.addressMode[1] = CU_TR_ADDRESS_MODE_CLAMP;
	texDesc.addressMode[2] = CU_TR_ADDRESS_MODE_CLAMP;
	texDesc.filterMode = CU_TR_FILTER_MODE_LINEAR;	// interpolate between samples
	r = cuTexObjectCreate(texture, &resDesc, &texDesc, NULL);
	PX_ASSERT(r == CUDA_SUCCESS);
}

View File

@@ -0,0 +1,92 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "PxgBroadPhase.h"
namespace physx
{
extern "C" void initNarrowphaseKernels0();
extern "C" void initNarrowphaseKernels1();
extern "C" void initNarrowphaseKernels2();
extern "C" void initNarrowphaseKernels3();
extern "C" void initNarrowphaseKernels4();
extern "C" void initNarrowphaseKernels5();
extern "C" void initNarrowphaseKernels6();
extern "C" void initNarrowphaseKernels7();
extern "C" void initNarrowphaseKernels8();
extern "C" void initNarrowphaseKernels9();
extern "C" void initNarrowphaseKernels10();
extern "C" void initNarrowphaseKernels11();
extern "C" void initNarrowphaseKernels12();
extern "C" void initNarrowphaseKernels13();
extern "C" void initNarrowphaseKernels14();
extern "C" void initNarrowphaseKernels15();
extern "C" void initNarrowphaseKernels16();
extern "C" void initNarrowphaseKernels17();
extern "C" void initNarrowphaseKernels18();
extern "C" void initNarrowphaseKernels19();
extern "C" void initNarrowphaseKernels20();
extern "C" void initNarrowphaseKernels21();
extern "C" void initNarrowphaseKernels22();
extern "C" void initNarrowphaseKernels23();
extern "C" void initNarrowphaseKernels24();
// Touches one symbol from every narrowphase kernel translation unit so the
// linker cannot drop those objects when PhysXNarrowphaseGpu is linked as a
// static library. The calls themselves are expected to be no-op registrations;
// in GPU-exports builds (PX_PHYSX_GPU_EXPORTS) they are unnecessary and skipped.
void createPxgNarrowphase()
{
#if !PX_PHYSX_GPU_EXPORTS
	//this call is needed to force PhysXNarrowphaseGpu linkage as Static Library!
	initNarrowphaseKernels0();
	initNarrowphaseKernels1();
	initNarrowphaseKernels2();
	initNarrowphaseKernels3();
	initNarrowphaseKernels4();
	initNarrowphaseKernels5();
	initNarrowphaseKernels6();
	initNarrowphaseKernels7();
	initNarrowphaseKernels8();
	initNarrowphaseKernels9();
	initNarrowphaseKernels10();
	initNarrowphaseKernels11();
	initNarrowphaseKernels12();
	initNarrowphaseKernels13();
	initNarrowphaseKernels14();
	initNarrowphaseKernels15();
	initNarrowphaseKernels16();
	initNarrowphaseKernels17();
	initNarrowphaseKernels18();
	initNarrowphaseKernels19();
	initNarrowphaseKernels20();
	initNarrowphaseKernels21();
	initNarrowphaseKernels22();
	initNarrowphaseKernels23();
	initNarrowphaseKernels24();
#endif
}
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,570 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "PxgShapeManager.h"
#include "PxgCopyManager.h"
#include "PxgHeapMemAllocator.h"
#include "PxgCudaUtils.h"
#include "PxNodeIndex.h"
using namespace physx;
// Sets up the host-side mirrors (allocated from mapped, GPU-visible memory) and
// their matching GPU buffers with an initial capacity of 128 entries, and
// clears all dirty/resize bookkeeping.
PxgShapeManager::PxgShapeManager(PxgHeapMemoryAllocatorManager* heapManager) :
	mHeapManager(heapManager),
	mHostShapes(PxVirtualAllocator(heapManager->mMappedMemoryAllocators, PxsHeapStats::eNARROWPHASE)),
	mHostShapesRemapTable(PxVirtualAllocator(heapManager->mMappedMemoryAllocators, PxsHeapStats::eNARROWPHASE)),
	mHostShapeIdTable(PxVirtualAllocator(heapManager->mMappedMemoryAllocators, PxsHeapStats::eNARROWPHASE)),
	mHostTransformCacheIdToActorTable(PxVirtualAllocator(heapManager->mMappedMemoryAllocators, PxsHeapStats::eNARROWPHASE)),
	mGpuShapesBuffer(heapManager, PxsHeapStats::eNARROWPHASE),
	mGpuShapesRemapTableBuffer(heapManager, PxsHeapStats::eNARROWPHASE),
	mGpuTransformCacheIdToActorTableBuffer(heapManager, PxsHeapStats::eNARROWPHASE),
	mGpuRigidIndiceBuffer(heapManager, PxsHeapStats::eNARROWPHASE),
	mGpuShapeIndiceBuffer(heapManager, PxsHeapStats::eNARROWPHASE),
	mGpuUnsortedShapeIndicesBuffer(heapManager, PxsHeapStats::eNARROWPHASE),
	mGpuTempRigidBitIndiceBuffer(heapManager, PxsHeapStats::eNARROWPHASE),
	mGpuTempRigidIndiceBuffer(heapManager, PxsHeapStats::eNARROWPHASE)
{
	//allocate x4 (capacities are kept multiples of 4 for the radix sort later on)
	const PxU32 initialSize = 128;
	// forceSize_Unsafe(0) + reserve + forceSize_Unsafe(initialSize) sizes the
	// arrays without element construction.
	mHostShapes.forceSize_Unsafe(0);
	mHostShapes.reserve(initialSize);
	mHostShapes.forceSize_Unsafe(initialSize);
	mHostShapesRemapTable.forceSize_Unsafe(0);
	mHostShapesRemapTable.reserve(initialSize);
	mHostShapesRemapTable.forceSize_Unsafe(initialSize);
	mHostShapeIdTable.forceSize_Unsafe(0);
	mHostShapeIdTable.reserve(initialSize);
	mHostShapeIdTable.forceSize_Unsafe(initialSize);
	mHostTransformCacheIdToActorTable.forceSize_Unsafe(0);
	mHostTransformCacheIdToActorTable.reserve(initialSize);
	mHostTransformCacheIdToActorTable.forceSize_Unsafe(initialSize);
	mGpuShapesBuffer.allocate(sizeof(PxgShape)*initialSize, PX_FL);
	mGpuShapesRemapTableBuffer.allocate(sizeof(PxNodeIndex) * initialSize, PX_FL);
	mGpuTransformCacheIdToActorTableBuffer.allocate(sizeof(PxActor*) * initialSize, PX_FL);
	mGpuRigidIndiceBuffer.allocate(sizeof(PxNodeIndex) * initialSize, PX_FL);
	mGpuShapeIndiceBuffer.allocate(sizeof(PxU32) * initialSize, PX_FL);
	mGpuUnsortedShapeIndicesBuffer.allocate(sizeof(PxU32) * initialSize, PX_FL);
	mGpuTempRigidBitIndiceBuffer.allocate(sizeof(PxU32) * initialSize, PX_FL);
	mGpuTempRigidIndiceBuffer.allocate(sizeof(PxNodeIndex) * initialSize, PX_FL);
	mResizeRequired = false;
	mTransformCacheResizeRequired = false;
	// -1 means "nothing registered yet".
	mMaxShapeId = -1;
	mMaxTransformCacheID = -1;
	mHasShapeChanged = false;
	mHasShapeInstanceChanged = false;
}
// Fills the remap/index GPU buffers with 0xFFFFFFFF (the invalid-slot marker)
// so unused entries are recognizable before the first scheduleCopyHtoD upload.
void PxgShapeManager::initialize(PxCudaContext* cudaContext, CUstream stream)
{
	cudaContext->memsetD32Async(mGpuShapesRemapTableBuffer.getDevicePtr(), 0xFFFFFFFF, mGpuShapesRemapTableBuffer.getSize()/sizeof(PxU32), stream);
	cudaContext->memsetD32Async(mGpuRigidIndiceBuffer.getDevicePtr(), 0xFFFFFFFF, mGpuRigidIndiceBuffer.getSize() / sizeof(PxU32), stream);
	cudaContext->memsetD32Async(mGpuShapeIndiceBuffer.getDevicePtr(), 0xFFFFFFFF, mGpuShapeIndiceBuffer.getSize() / sizeof(PxU32), stream);
	cudaContext->memsetD32Async(mGpuUnsortedShapeIndicesBuffer.getDevicePtr(), 0xFFFFFFFF, mGpuUnsortedShapeIndicesBuffer.getSize() / sizeof(PxU32), stream);
}
// Stores a copy of 'shape' in the host mirror, marks it dirty for upload and
// returns its slot id. Grows the mirror (and flags the GPU buffer for growth)
// when the id pool hands out an id beyond the current capacity.
PxU32 PxgShapeManager::registerShape(PxgShape& shape)
{
	const PxU32 shapeId = mIdPool.getNewID();
	if (shapeId >= mHostShapes.capacity())
	{
		mResizeRequired = true;
		const PxU32 capacity = shapeId * 2;
		//make sure capacity is x4 because we need to use radix sort to sort shape id based on rigid body index later
		const PxU32 tempCapacity = (capacity + 3)&(~3);
		mHostShapes.resize(tempCapacity);
		mDirtyShapeMap.resize(tempCapacity);
	}
	mHostShapes[shapeId] = shape;
	mDirtyShapeMap.growAndSet(shapeId);
	mMaxShapeId = PxMax(PxI32(shapeId), mMaxShapeId);
	mHasShapeChanged = true;
	return shapeId;
}
// Records a shape instance at the given transform-cache slot: node index, shape
// id and owning actor. Aggregate entries store invalid/null placeholders for the
// shape id and actor. Marks the slot dirty for upload.
void PxgShapeManager::registerShapeInstance(const PxNodeIndex& nodeIndex, const PxU32 transformCacheID, PxActor* actor, bool aggregate)
{
	if (transformCacheID >= mHostShapesRemapTable.capacity())
	{
		const PxU32 capacity = transformCacheID*2;
		//make sure capacity is x4 because we need to use radix sort to sort shape id based on rigid body index later
		const PxU32 tempCapacity = (capacity + 3)&(~3);
		mTransformCacheResizeRequired = true;
		mHostShapesRemapTable.resize(tempCapacity);
		mHostShapeIdTable.resize(tempCapacity);
		mHostTransformCacheIdToActorTable.resize(tempCapacity);
		mDirtyTransformCacheMap.resize(tempCapacity);
	}
	mHostShapesRemapTable[transformCacheID] = nodeIndex;
	mHostShapeIdTable[transformCacheID] = aggregate? 0xffffffff : transformCacheID;
	mHostTransformCacheIdToActorTable[transformCacheID] = aggregate ? NULL : actor;
	mHasShapeInstanceChanged = true;
	mDirtyTransformCacheMap.growAndSet(transformCacheID);
	mMaxTransformCacheID = PxMax(PxI32(transformCacheID), mMaxTransformCacheID);
}
void PxgShapeManager::unregisterShape(const PxU32 id)
{
	// Cancel any pending upload for this shape, recycle its slot, and flag the
	// shape tables so the next scheduleCopyHtoD() processes the change.
	mHasShapeChanged = true;
	mIdPool.deferredFreeID(id);
	mDirtyShapeMap.reset(id);
}
void PxgShapeManager::unregisterShapeInstance(const PxU32 transformCacheID)
{
	// Write invalid/null placeholders into every host mirror for this slot and
	// mark it dirty so the invalidation is also uploaded to the GPU tables.
	mHostTransformCacheIdToActorTable[transformCacheID] = NULL;
	mHostShapeIdTable[transformCacheID] = 0xffffffff;
	mHostShapesRemapTable[transformCacheID] = PxNodeIndex(PX_INVALID_NODE);
	mDirtyTransformCacheMap.set(transformCacheID);
	mHasShapeInstanceChanged = true;
}
// Queues uploads of all dirty shape records and shape-instance tables from the
// mapped host mirrors to their GPU buffers. Dirty bits are coalesced: up to
// maxGrouping adjacent dirty entries are merged into a single copy descriptor.
// If a host-side resize happened, the matching GPU buffers are grown first
// (asynchronously, preserving old contents).
void PxgShapeManager::scheduleCopyHtoD(PxgCopyManager& copyManager, PxCudaContext* cudaContext, CUstream stream)
{
	PX_UNUSED(copyManager);
	const PxU32 maxGrouping = 16; // max adjacent dirty entries merged per copy
	if (mHasShapeChanged)
	{
		mHasShapeChanged = false;
		if (mResizeRequired)
		{
			//Allocate and copy data across
			mGpuShapesBuffer.allocateCopyOldDataAsync(sizeof(PxgShape)*mHostShapes.capacity(), cudaContext, stream, PX_FL);
			mResizeRequired = false;
		}
		const PxU32* bits = mDirtyShapeMap.getWords();
		if (bits)
		{
			const PxU32 totalNumOfShapes = mMaxShapeId + 1;
			const PxU32 numShapes = (totalNumOfShapes + 3) &(~3);
			//make sure the dirty shape map cover x4 case and set those to invalid value
			for (PxU32 i = totalNumOfShapes; i < numShapes; ++i)
			{
				mDirtyShapeMap.growAndSet(i);
			}
			// PT: ### bitmap iterator pattern
			const PxU32 lastSetBit = mDirtyShapeMap.findLast();
			for (PxU32 w = 0; w <= lastSetBit >> 5; ++w)
			{
				//b&=b-1 will clear the lowest set bit in b
				for (PxU32 b = bits[w]; b; )
				{
					//dirtyId is the next bit that's set to 1!
					const PxU32 dirtyId = PxU32(w << 5 | PxLowestSetBit(b));
					// The host mirror lives in mapped memory: the copy source is the
					// device alias of the host pointer.
					void* hostPtr = mHostShapes.begin() + dirtyId;
					PxgCopyManager::CopyDesc desc;
					desc.source = reinterpret_cast<size_t>(getMappedDevicePtr(cudaContext, hostPtr));
					desc.dest = reinterpret_cast<size_t>(reinterpret_cast<PxU8*>(mGpuShapesBuffer.getDevicePtr()) + dirtyId * sizeof(PxgShape));
					desc.bytes = sizeof(PxgShape);
					mDirtyShapeMap.reset(dirtyId);
					//Now we loop to try and find adjacent bits that are set...
					PxU32 currIdx = dirtyId + 1;
					PxU32 groupSize = 1;
					while (currIdx <= lastSetBit && mDirtyShapeMap.test(currIdx) && groupSize < maxGrouping)
					{
						groupSize++;
						mDirtyShapeMap.reset(currIdx);
						currIdx++;
						desc.bytes += sizeof(PxgShape);
					}
					if (currIdx != (dirtyId + 1))
					{
						//get the word from the current bit
						w = PxMin(currIdx, lastSetBit) >> 5;
						//reload the word
						b = bits[w]; //Set a 1 here to make sure the b &= (b-1) in the for loop doesn't remove the current bit we're interested in
					}
					else
					{
						b &= (b - 1);
					}
					copyManager.pushDeferredHtoD(desc);
				}
			}
		}
		mDirtyShapeMap.clear();
	}
	if (mHasShapeInstanceChanged)
	{
		//AD: mHasShapeInstanceChanged needs to persist because computeRigidsToShapes() needs to run if we use direct-API
		// we lower the flag in PxgNarrowphaseCore::prepareGpuNarrowphase.
		// AD: the resize of the GPU transform cache is inside PxgNarrowphaseCore::prepareGpuNarrowphase.
		if (mTransformCacheResizeRequired)
		{
			// Grow each per-instance GPU buffer, keeping old contents and marking the
			// newly added region invalid (0xFFFFFFFF) or null (actor table).
			PxU64 oldCapacity = mGpuShapesRemapTableBuffer.getSize();
			mGpuShapesRemapTableBuffer.allocateCopyOldDataAsync(sizeof(PxNodeIndex)*mHostShapesRemapTable.capacity(), cudaContext, stream, PX_FL);
			cudaContext->memsetD32Async(mGpuShapesRemapTableBuffer.getDevicePtr() + oldCapacity, 0xFFFFFFFF, (mGpuShapesRemapTableBuffer.getSize() - oldCapacity) / sizeof(PxU32), stream);
			oldCapacity = mGpuRigidIndiceBuffer.getSize();
			mGpuRigidIndiceBuffer.allocateCopyOldDataAsync(sizeof(PxNodeIndex) * mHostShapesRemapTable.capacity(), cudaContext, stream, PX_FL);
			cudaContext->memsetD32Async(mGpuRigidIndiceBuffer.getDevicePtr() + oldCapacity, 0xFFFFFFFF, (mGpuRigidIndiceBuffer.getSize() - oldCapacity) / sizeof(PxU32), stream);
			mGpuTempRigidIndiceBuffer.allocate(sizeof(PxNodeIndex) * mHostShapesRemapTable.capacity(), PX_FL);
			oldCapacity = mGpuShapeIndiceBuffer.getSize();
			mGpuShapeIndiceBuffer.allocateCopyOldDataAsync(sizeof(PxU32) * mHostShapeIdTable.capacity(), cudaContext, stream, PX_FL);
			cudaContext->memsetD32Async(mGpuShapeIndiceBuffer.getDevicePtr() + oldCapacity, 0xFFFFFFFF, (mGpuShapeIndiceBuffer.getSize() - oldCapacity) / sizeof(PxU32), stream);
			oldCapacity = mGpuUnsortedShapeIndicesBuffer.getSize();
			mGpuUnsortedShapeIndicesBuffer.allocateCopyOldDataAsync(sizeof(PxU32) * mHostShapeIdTable.capacity(), cudaContext, stream, PX_FL);
			cudaContext->memsetD32Async(mGpuUnsortedShapeIndicesBuffer.getDevicePtr() + oldCapacity, 0xFFFFFFFF, (mGpuUnsortedShapeIndicesBuffer.getSize() - oldCapacity) / sizeof(PxU32), stream);
			mGpuTempRigidBitIndiceBuffer.allocate(sizeof(PxU32) * mHostShapeIdTable.capacity(), PX_FL);
			oldCapacity = mGpuTransformCacheIdToActorTableBuffer.getSize();
			mGpuTransformCacheIdToActorTableBuffer.allocateCopyOldDataAsync(sizeof(PxActor*) * mHostTransformCacheIdToActorTable.capacity(), cudaContext, stream, PX_FL);
			cudaContext->memsetD32Async(mGpuTransformCacheIdToActorTableBuffer.getDevicePtr() + oldCapacity, 0, (mGpuTransformCacheIdToActorTableBuffer.getSize() - oldCapacity) / sizeof(PxActor*), stream);
			mTransformCacheResizeRequired = false;
		}
		const PxU32 totalNumOfShapeInstances = mMaxTransformCacheID + 1;
		const PxU32 numShapeInstances = (totalNumOfShapeInstances + 3) &(~3);
		//make sure the dirty shape map cover x4 case and set those to invalid value
		for (PxU32 i = totalNumOfShapeInstances; i < numShapeInstances; ++i)
		{
			if (!mHostShapesRemapTable[i].isStaticBody())
			{
				mDirtyTransformCacheMap.growAndSet(i);
				mHostShapesRemapTable[i] = PxNodeIndex(PX_INVALID_NODE);
				mHostShapeIdTable[i] = 0xffffffff;
				mHostTransformCacheIdToActorTable[i] = NULL;
			}
		}
		const PxU32* bits = mDirtyTransformCacheMap.getWords();
		if (bits)
		{
			// PT: ### bitmap iterator pattern
			const PxU32 lastSetBit = mDirtyTransformCacheMap.findLast();
			for (PxU32 w = 0; w <= lastSetBit >> 5; ++w)
			{
				//b&=b-1 will clear the lowest set bit in b
				for (PxU32 b = bits[w]; b; )
				{
					//dirtyId is the next bit that's set to 1!
					const PxU32 dirtyId = PxU32(w << 5 | PxLowestSetBit(b));
					void* hostRemapPtr = mHostShapesRemapTable.begin() + dirtyId;
					void* hostShapeIdPtr = mHostShapeIdTable.begin() + dirtyId;
					void* hostTransformCacheIdToActorPtr = mHostTransformCacheIdToActorTable.begin() + dirtyId;
					// Four parallel uploads per dirty run: node-index remap table, rigid
					// index buffer (same source), unsorted shape ids, and the
					// transform-cache-id-to-actor table.
					PxgCopyManager::CopyDesc desc1;
					desc1.source = reinterpret_cast<size_t>(getMappedDevicePtr(cudaContext, hostRemapPtr));
					desc1.dest = reinterpret_cast<size_t>(reinterpret_cast<PxU8*>(mGpuShapesRemapTableBuffer.getDevicePtr()) + dirtyId * sizeof(PxNodeIndex));
					desc1.bytes = sizeof(PxNodeIndex);
					PxgCopyManager::CopyDesc desc2;
					desc2.source = reinterpret_cast<size_t>(getMappedDevicePtr(cudaContext, hostRemapPtr));
					desc2.dest = reinterpret_cast<size_t>(reinterpret_cast<PxU8*>(mGpuRigidIndiceBuffer.getDevicePtr()) + dirtyId * sizeof(PxNodeIndex));
					desc2.bytes = sizeof(PxNodeIndex);
					PxgCopyManager::CopyDesc desc3;
					desc3.source = reinterpret_cast<size_t>(getMappedDevicePtr(cudaContext, hostShapeIdPtr));
					desc3.dest = reinterpret_cast<size_t>(reinterpret_cast<PxU8*>(mGpuUnsortedShapeIndicesBuffer.getDevicePtr()) + dirtyId * sizeof(PxU32));
					desc3.bytes = sizeof(PxU32);
					PxgCopyManager::CopyDesc desc4;
					desc4.source = reinterpret_cast<size_t>(getMappedDevicePtr(cudaContext, hostTransformCacheIdToActorPtr));
					desc4.dest = reinterpret_cast<size_t>(reinterpret_cast<PxU8*>(mGpuTransformCacheIdToActorTableBuffer.getDevicePtr()) + dirtyId * sizeof(PxActor*));
					desc4.bytes = sizeof(PxActor*);
					mDirtyTransformCacheMap.reset(dirtyId);
					//Now we loop to try and find adjacent bits that are set...
					PxU32 currIdx = dirtyId + 1;
					PxU32 groupSize = 1;
					while (currIdx <= lastSetBit && mDirtyTransformCacheMap.test(currIdx) && groupSize < maxGrouping)
					{
						groupSize++;
						mDirtyTransformCacheMap.reset(currIdx);
						currIdx++;
						desc1.bytes += sizeof(PxNodeIndex);
						desc2.bytes += sizeof(PxNodeIndex);
						desc3.bytes += sizeof(PxU32);
						desc4.bytes += sizeof(PxActor*);
					}
					if (currIdx != (dirtyId + 1))
					{
						//get the word from the current bit
						w = PxMin(currIdx, lastSetBit) >> 5;
						//reload the word
						b = bits[w]; //Set a 1 here to make sure the b &= (b-1) in the for loop doesn't remove the current bit we're interested in
					}
					else
					{
						b &= (b - 1);
					}
					copyManager.pushDeferredHtoD(desc1);
					copyManager.pushDeferredHtoD(desc2);
					copyManager.pushDeferredHtoD(desc3);
					copyManager.pushDeferredHtoD(desc4);
				}
			}
		}
		mDirtyTransformCacheMap.clear();
	}
}
void PxgShapeManager::updateShapeMaterial(const PxU32 materialIndex, const PxU32 id)
{
PX_ASSERT(id < mHostShapes.size());
mHostShapes[id].materialIndex = materialIndex;
mDirtyShapeMap.growAndSet(id);
mHasShapeChanged = true;
}
////////////////////////////////////////////////////////////////////////////////////////////
// Sets up the host mirror (in mapped, GPU-visible memory) and the GPU buffer
// with room for 128 material records of 'elemSize' bytes each.
PxgMaterialManager::PxgMaterialManager(PxgHeapMemoryAllocatorManager* heapManager, const PxU32 elemSize) :
	mGpuMaterialBuffer(heapManager, PxsHeapStats::eNARROWPHASE),
	mHeapManager(heapManager),
	mHostMaterial(PxVirtualAllocator(heapManager->mMappedMemoryAllocators, PxsHeapStats::eNARROWPHASE))
{
	const PxU32 originalSize = elemSize * 128;
	// forceSize_Unsafe(0) + reserve + forceSize_Unsafe sizes the byte array
	// without element construction.
	mHostMaterial.forceSize_Unsafe(0);
	mHostMaterial.reserve(originalSize);
	mHostMaterial.forceSize_Unsafe(originalSize);
	mGpuMaterialBuffer.allocate(originalSize, PX_FL);
	mResizeRequired = false;
}
// Allocates a material slot, grows the host-side byte array if the new id
// falls outside the current capacity, copies the material data into place,
// and marks the slot dirty for upload. Returns the allocated material id.
PxU32 PxgMaterialManager::registerMaterial(const PxU8* materialData, const PxU32 elemSize)
{
	const PxU32 materialId = mIdPool.getNewID();

	const PxU32 slotCapacity = mHostMaterial.capacity() / elemSize;
	if (materialId >= slotCapacity)
	{
		// Geometric growth, but always at least enough to hold the new slot.
		const PxU32 newSlotCapacity = PxMax(slotCapacity * 2 + 1, materialId + 1);
		mHostMaterial.resize(newSlotCapacity * elemSize);
		mResizeRequired = true;	// GPU buffer must be re-allocated before the next upload
	}

	PxMemCopy(mHostMaterial.begin() + materialId * elemSize, materialData, elemSize);
	mDirtyMaterialMap.growAndSet(materialId);
	return materialId;
}
// Releases material slot `id`: the id is returned to the pool for deferred
// reuse and any pending upload for the slot is cancelled.
void PxgMaterialManager::unregisterMaterial(const PxU32 id)
{
	mIdPool.deferredFreeID(id);
	mDirtyMaterialMap.reset(id);
}
// Uploads every dirty host-side material record to the GPU material buffer.
// Runs of adjacent dirty slots (up to maxGrouping) are coalesced into a
// single copy descriptor to cut down the number of deferred H-to-D copies.
// NOTE(review): the inner loops deliberately mutate the outer loop variables
// `w` and `b` when a coalesced run crosses a 32-bit word boundary — do not
// "simplify" this iteration without re-deriving the invariants.
void PxgMaterialManager::scheduleCopyHtoD(PxgCopyManager& copyManager, PxCudaContext* cudaContext,
	CUstream stream, const PxU32 elemSize)
{
	if (mResizeRequired)
	{
		// Host array grew since the last sync: grow the GPU buffer to match,
		// asynchronously preserving the previously uploaded contents.
		mGpuMaterialBuffer.allocateCopyOldDataAsync(mHostMaterial.capacity(), cudaContext, stream, PX_FL);
		mResizeRequired = false;
	}
	const PxU32* bits = mDirtyMaterialMap.getWords();
	const PxU32 maxGrouping = 16;	// max number of adjacent slots merged into one copy
	if (bits)
	{
		// PT: ### bitmap iterator pattern
		const PxU32 lastSetBit = mDirtyMaterialMap.findLast();
		for (PxU32 w = 0; w <= lastSetBit >> 5; ++w)
		{
			//b&=b-1 will clear the lowest set bit in b
			for (PxU32 b = bits[w]; b; )
			{
				//dirtyId is the next bit that's set to 1!
				const PxU32 dirtyId = PxU32(w << 5 | PxLowestSetBit(b));
				void* hostPtr = mHostMaterial.begin() + dirtyId * elemSize;
				// Start a one-element copy descriptor for this slot; it may be
				// extended below to cover adjacent dirty slots.
				PxgCopyManager::CopyDesc desc;
				desc.source = reinterpret_cast<size_t>(getMappedDevicePtr(cudaContext, hostPtr));
				desc.dest = reinterpret_cast<size_t>(reinterpret_cast<PxU8*>(mGpuMaterialBuffer.getDevicePtr()) + dirtyId * elemSize);
				desc.bytes = elemSize;
				mDirtyMaterialMap.reset(dirtyId);
				//Now we loop to try and find adjacent bits that are set...
				PxU32 currIdx = dirtyId + 1;
				PxU32 groupSize = 1;
				while (currIdx <= lastSetBit && mDirtyMaterialMap.test(currIdx) && (groupSize < maxGrouping))
				{
					// Extend the copy over the next contiguous dirty slot.
					groupSize++;
					mDirtyMaterialMap.reset(currIdx);
					currIdx++;
					desc.bytes += elemSize;
				}
				if (currIdx != (dirtyId + 1))
				{
					// The coalesced run may have crossed into a later word:
					//get the word from the current bit
					w = PxMin(currIdx, lastSetBit) >> 5;
					//reload the word
					b = bits[w]; //Set a 1 here to make sure the b &= (b-1) in the for loop doesn't remove the current bit we're interested in
				}
				else
				{
					b &= (b - 1);
				}
				copyManager.pushDeferredHtoD(desc);
			}
		}
	}
	mDirtyMaterialMap.clear();
}
void PxgMaterialManager::updateMaterial(const PxU8* materialCore, const PxU32 elemSize, const PxU32 id)
{
PX_ASSERT(id < mHostMaterial.size());
PxU8* destptr = reinterpret_cast<PxU8*>(mHostMaterial.begin() + id * elemSize);
PxMemCopy(destptr, materialCore, elemSize);
//mHostMaterial[id] = materialCore;
mDirtyMaterialMap.growAndSet(id);
}
//////////////////////////////////////////////////////////////////////////////////////////////
// FEM materials reuse the generic material-manager storage; `elemSize` is
// the size in bytes of the concrete FEM material record, supplied by the
// derived class.
PxgFEMMaterialManager::PxgFEMMaterialManager(PxgHeapMemoryAllocatorManager* heapManager, const PxU32 elemSize) :
	PxgMaterialManager(heapManager, elemSize)
{
}
// Uploads every dirty host-side FEM material record to the GPU buffer.
// The previous body was a verbatim, line-for-line copy of
// PxgMaterialManager::scheduleCopyHtoD, so forward to the base
// implementation instead of duplicating the bitmap-coalescing logic.
void PxgFEMMaterialManager::scheduleCopyHtoD(PxgCopyManager& copyManager, PxCudaContext* cudaContext,
	CUstream stream, const PxU32 elemSize)
{
	PxgMaterialManager::scheduleCopyHtoD(copyManager, cudaContext, stream, elemSize);
}
////////////////////////////////////////////////////////////////////////////////////////////
// Soft-body material manager: a FEM material manager whose element size is
// fixed to the deformable-volume material record.
PxgFEMSoftBodyMaterialManager::PxgFEMSoftBodyMaterialManager(PxgHeapMemoryAllocatorManager* heapManager) :
	PxgFEMMaterialManager(heapManager, sizeof(PxsDeformableVolumeMaterialData))
{
}