feat(physics): wire physx sdk into build
This commit is contained in:
1356
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/FEMCloth.cu
vendored
Normal file
1356
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/FEMCloth.cu
vendored
Normal file
File diff suppressed because it is too large
Load Diff
595
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/FEMClothConstraintPrep.cu
vendored
Normal file
595
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/FEMClothConstraintPrep.cu
vendored
Normal file
@@ -0,0 +1,595 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#include "PxgFEMCloth.h"
|
||||
#include "PxgFEMCore.h"
|
||||
#include "PxgFEMClothCore.h"
|
||||
#include "vector_types.h"
|
||||
#include "foundation/PxVec3.h"
|
||||
#include "foundation/PxMathUtils.h"
|
||||
#include "copy.cuh"
|
||||
#include "assert.h"
|
||||
#include "stdio.h"
|
||||
#include "PxgSolverCoreDesc.h"
|
||||
#include "PxNodeIndex.h"
|
||||
#include "PxgBodySim.h"
|
||||
#include "PxgArticulation.h"
|
||||
#include "PxgParticleSystem.h"
|
||||
#include "PxgNpKernelIndices.h"
|
||||
#include "PxgSimulationCoreDesc.h"
|
||||
#include "PxsDeformableSurfaceMaterialCore.h"
|
||||
#include "utils.cuh"
|
||||
#include "deformableUtils.cuh"
|
||||
#include "deformableCollision.cuh"
|
||||
|
||||
using namespace physx;
|
||||
|
||||
extern "C" __host__ void initFEMClothKernels0() {}
|
||||
|
||||
// Blend the three triangle-vertex entries of 'position_invmass' with the given
// barycentric weights. The w component blends the per-vertex inverse masses
// along with the positions.
static __device__ inline float4 computeBarycentricPos(const uint4 triangleIdx, const float4* PX_RESTRICT position_invmass,
													  const float4 barycentric)
{
	const float4 v0 = position_invmass[triangleIdx.x];
	const float4 v1 = position_invmass[triangleIdx.y];
	const float4 v2 = position_invmass[triangleIdx.z];

	return v0 * barycentric.x + v1 * barycentric.y + v2 * barycentric.z;
}
|
||||
|
||||
//!
//! \brief : prep cloth vs. rigid body collision
//!

extern "C" __global__ void cloth_rigidContactPrepareLaunch(
	PxgFEMCloth* femClothes,
	float4* contacts_restW,
	float4* normalPens,
	float4* barycentrics,
	const PxgFemOtherContactInfo* contactInfos,
	PxU32* numContacts,
	PxgFemRigidConstraintBlock* primitiveConstraints,
	PxgPrePrepDesc* preDesc,
	PxgConstraintPrepareDesc* prepareDesc,
	PxReal* rigidLambdaNs,
	const PxReal invDt,
	PxgSolverSharedDescBase* sharedDesc,
	bool isTGS
)
{
	const PxU32 totalContacts = *numContacts;

	PxU32* solverBodyIndices = preDesc->solverBodyIndices;

	// Each block processes a contiguous run of iterations; threads exit as soon
	// as their work index passes the contact count.
	const PxU32 blocksNeeded = (totalContacts + blockDim.x - 1) / blockDim.x;
	const PxU32 iterationsPerBlock = (blocksNeeded + gridDim.x - 1) / gridDim.x;

	for(PxU32 iter = 0; iter < iterationsPerBlock; ++iter)
	{
		const PxU32 workIndex = iter * blockDim.x + threadIdx.x + iterationsPerBlock * blockIdx.x * blockDim.x;

		if(workIndex >= totalContacts)
			return;

		// Accumulated normal impulse starts at zero for every contact.
		rigidLambdaNs[workIndex] = 0.0f;

		const PxgFemOtherContactInfo contactInfo = contactInfos[workIndex];
		// One constraint block per warp (32 contacts).
		PxgFemRigidConstraintBlock& constraint = primitiveConstraints[workIndex / 32];

		// pairInd0 encodes the rigid body's node index.
		const PxU64 tRigidId = contactInfo.pairInd0;
		const PxNodeIndex& rigidId = reinterpret_cast<const PxNodeIndex&>(tRigidId);

		// pairInd1 encodes cloth id + element index.
		const PxU32 pairInd1 = PxU32(contactInfo.pairInd1);
		PxgFEMCloth& cloth = femClothes[PxGetClothId(pairInd1)];
		const PxU32 elementId = PxGetClothElementIndex(pairInd1);

		if(elementId == 0xfffff)
			continue; // Sentinel element index: nothing to prepare for this pair.

		const float4* PX_RESTRICT accumDelta_invMass = cloth.mAccumulatedDeltaPos;

		const float4 contact_restW = contacts_restW[workIndex];
		const float4 normal_pen = normalPens[workIndex];
		const float4 barycentric = barycentrics[workIndex];

		const PxVec3 p(contact_restW.x, contact_restW.y, contact_restW.z);

		// barycentric.w == 0 marks a triangle contact (blend the three vertex
		// deltas); otherwise elementId addresses a single vertex directly.
		float4 deltaP;
		if(barycentric.w == 0.f)
			deltaP = computeBarycentricPos(cloth.mTriangleVertexIndices[elementId], accumDelta_invMass, barycentric);
		else
			deltaP = accumDelta_invMass[elementId];

		const PxVec3 normal(-normal_pen.x, -normal_pen.y, -normal_pen.z);
		const PxReal pen = normal_pen.w - contact_restW.w;
		const PxVec3 delta(deltaP.x, deltaP.y, deltaP.z);

		prepareFEMContacts(constraint, normal, sharedDesc, p, pen, delta, rigidId, barycentric, prepareDesc, solverBodyIndices, cloth.mPenBiasClamp, invDt, isTGS);
	}
}
|
||||
|
||||
|
||||
//!
//! \brief : prep cloth vs. cloth collision.
//!

extern "C" __global__
void cloth_clothContactPrepareLaunch(
	PxgFEMCloth* clothes,
	PxgFemFemContactInfo* contactInfos,
	PxU32* numContacts,
	PxU32 maxContacts,
	PxsDeformableSurfaceMaterialData* clothMaterials,
	const PxU8* updateContactPairs
)
{
	// Nothing to do unless the contact pairs were refreshed this step.
	if(*updateContactPairs == 0)
		return;

	const PxU32 clampedNumContacts = PxMin(*numContacts, maxContacts);
	const PxU32 blocksNeeded = (clampedNumContacts + blockDim.x - 1) / blockDim.x;
	const PxU32 iterationsPerBlock = (blocksNeeded + gridDim.x - 1) / gridDim.x;

	for(PxU32 iter = 0; iter < iterationsPerBlock; ++iter)
	{
		const PxU32 workIndex = iter * blockDim.x + threadIdx.x + iterationsPerBlock * blockIdx.x * blockDim.x;

		// A single thread writes back the clamped contact count.
		if(workIndex == 0)
			*numContacts = clampedNumContacts;

		if(workIndex >= clampedNumContacts)
			return;

		PxgFemFemContactInfo contactInfo = contactInfos[workIndex];

		// Contacts between two different cloths arrive pre-validated; only
		// self-collision candidates still need the rest-distance filter below.
		if(contactInfo.isValidPair())
			continue;

		const PxU32 pairInd0 = PxU32(contactInfo.pairInd0);
		PxgFEMCloth& cloth0 = clothes[PxGetClothId(pairInd0)];
		const PxU32 elementId0 = PxGetClothElementIndex(pairInd0);

		const PxU32 pairInd1 = PxU32(contactInfo.pairInd1);
		PxgFEMCloth& cloth1 = clothes[PxGetClothId(pairInd1)];
		const PxU32 elementId1 = PxGetClothElementIndex(pairInd1);

		// A pair survives filtering (and is marked valid) only when its exact
		// rest-pose distance exceeds the self-collision filter distance.
		const PxReal filterDistSq = cloth0.mSelfCollisionFilterDistance * cloth0.mSelfCollisionFilterDistance;

		if(contactInfo.isEdgeEdgePair()) // Edge-edge collision
		{
			// Edge on triangle 0: a local vertex index and its successor (mod 3).
			const PxU32 e0_localIndex0 = contactInfo.getAuxInd0();
			const PxU32 e0_localIndex1 = (e0_localIndex0 + 1) % 3;

			const uint4 triVertInd0 = cloth0.mTriangleVertexIndices[elementId0];
			const PxU32* vertexIndices0 = reinterpret_cast<const PxU32*>(&triVertInd0);
			const PxU32 e0_v0 = vertexIndices0[e0_localIndex0];
			const PxU32 e0_v1 = vertexIndices0[e0_localIndex1];

			// Edge on triangle 1.
			const PxU32 e1_localIndex0 = contactInfo.getAuxInd1();
			const PxU32 e1_localIndex1 = (e1_localIndex0 + 1) % 3;

			const uint4 triVertInd1 = cloth1.mTriangleVertexIndices[elementId1];
			const PxU32* vertexIndices1 = reinterpret_cast<const PxU32*>(&triVertInd1);
			const PxU32 e1_v0 = vertexIndices1[e1_localIndex0];
			const PxU32 e1_v1 = vertexIndices1[e1_localIndex1];

			// Rest-pose endpoints of the two edges.
			const PxVec3 r0 = PxLoad3(cloth0.mRestPosition[e0_v0]);
			const PxVec3 r1 = PxLoad3(cloth0.mRestPosition[e0_v1]);
			const PxVec3 r2 = PxLoad3(cloth1.mRestPosition[e1_v0]);
			const PxVec3 r3 = PxLoad3(cloth1.mRestPosition[e1_v1]);

			PxReal s, t;        // blend coefficients along edge0 / edge1
			PxReal restDistSq;

			// Exact segment-segment distance (costlier than closestPtLineLine,
			// but gives non-approximated rest-distance filtering).
			closestPtEdgeEdge(r0, r1, r2, r3, s, t, restDistSq);

			if(restDistSq > filterDistSq)
			{
				contactInfo.markValid();
				contactInfos[workIndex] = contactInfo;
			}
		}
		else // Vertex-triangle collision
		{
			const uint4 triVertId1 = cloth1.mTriangleVertexIndices[elementId1];
			const PxVec4T<PxU32> vertIndices(elementId0, triVertId1.x, triVertId1.y, triVertId1.z);

			// Rest-pose vertex (cloth0) and triangle (cloth1).
			const PxVec3 r0 = PxLoad3(cloth0.mRestPosition[vertIndices[0]]);
			const PxVec3 r1 = PxLoad3(cloth1.mRestPosition[vertIndices[1]]);
			const PxVec3 r2 = PxLoad3(cloth1.mRestPosition[vertIndices[2]]);
			const PxVec3 r3 = PxLoad3(cloth1.mRestPosition[vertIndices[3]]);

			const PxVec3 r12 = r2 - r1;
			const PxVec3 r13 = r3 - r1;

			// Exact point-triangle distance in the rest pose.
			const PxVec3 closest = Gu::closestPtPointTriangle2(r0, r1, r2, r3, r12, r13);
			const PxReal restDistSq = (r0 - closest).magnitudeSquared();

			if(restDistSq > filterDistSq)
			{
				contactInfo.markValid();
				contactInfos[workIndex] = contactInfo;
			}
		}
	}
}
|
||||
|
||||
// Barycentric blend of the three per-vertex entries of 'vels' for one triangle.
// The w component blends along with xyz (callers read it as an inverse mass).
static __device__ float4 computeTriangleContact(const float4* vels, const uint4& triVertId,
												const float4& barycentric)
{
	return vels[triVertId.x] * barycentric.x + vels[triVertId.y] * barycentric.y + vels[triVertId.z] * barycentric.z;
}
|
||||
|
||||
//!
//! \brief : prep cloth vs. particle collision
//!

extern "C" __global__ void cloth_particleContactPrepareLaunch(
	PxgFEMCloth* clothes,
	PxgParticleSystem* particlesystems,
	float4* contacts,
	float4* normalPens,
	float4* barycentrics,
	PxgFemOtherContactInfo* contactInfos,
	PxU32* numContacts,
	PxgFEMParticleConstraintBlock* spConstraints, //soft body particle constraint
	float2* softBodyAppliedForces,
	float2* particleAppliedForces
)
{
	const PxU32 totalContacts = *numContacts;

	const PxU32 blocksNeeded = (totalContacts + blockDim.x - 1) / blockDim.x;
	const PxU32 iterationsPerBlock = (blocksNeeded + gridDim.x - 1) / gridDim.x;

	const PxU32 threadIndexInWarp = threadIdx.x & 31;

	for (PxU32 iter = 0; iter < iterationsPerBlock; ++iter)
	{
		const PxU32 workIndex = iter * blockDim.x + threadIdx.x + iterationsPerBlock * blockIdx.x * blockDim.x;

		if (workIndex >= totalContacts)
			return;

		// Applied-force accumulators start at zero on both sides of the pair.
		softBodyAppliedForces[workIndex] = make_float2(0.f, 0.f);
		particleAppliedForces[workIndex] = make_float2(0.f, 0.f);

		const PxgFemOtherContactInfo contactInfo = contactInfos[workIndex];
		// One constraint block per warp (32 contacts).
		PxgFEMParticleConstraintBlock& constraint = spConstraints[workIndex / 32];

		// pairInd0 encodes particle system id + particle index.
		const PxU64 pairInd0 = contactInfo.pairInd0;
		const PxU32 tParticleSystemId = PxGetParticleSystemId(pairInd0);
		const PxU32 tParticleIndex = PxGetParticleIndex(pairInd0);

		// pairInd1 encodes cloth id + triangle index.
		const PxU32 pairInd1 = PxU32(contactInfo.pairInd1);
		PxgFEMCloth& cloth = clothes[PxGetClothId(pairInd1)];
		const PxU32 triangleInd = PxGetClothElementIndex(pairInd1);

		const uint4 triVertInd = cloth.mTriangleVertexIndices[triangleInd];

		const float4 contact = contacts[workIndex];
		const float4 normal_pen = normalPens[workIndex];

		// Contact point (not consumed below in this kernel).
		const PxVec3 p(contact.x, contact.y, contact.z);

		const float4 barycentric = barycentrics[workIndex];

		// Cloth side: barycentric blend of the accumulated position deltas;
		// w carries the blended inverse mass.
		const float4 delta1 = computeTriangleContact(cloth.mAccumulatedDeltaPos, triVertInd, barycentric);
		const float invMass1 = delta1.w;

		const PxVec3 normal(-normal_pen.x, -normal_pen.y, -normal_pen.z);

		// Particle side: accumulated delta and inverse mass.
		PxgParticleSystem& particleSystem = particlesystems[tParticleSystemId];
		const float4 deltaP_invMass = particleSystem.mSortedDeltaP[tParticleIndex];
		const PxReal invMass0 = deltaP_invMass.w;

		// Relative accumulated displacement: cloth minus particle.
		const PxVec3 delta(delta1.x - deltaP_invMass.x, delta1.y - deltaP_invMass.y, delta1.z - deltaP_invMass.z);

		// Penetration corrected by the relative displacement and rest distance.
		const PxReal pen = normal_pen.w + normal.dot(delta) - cloth.mRestDistance;

		const float unitResponse = invMass0 + invMass1;
		//KS - perhaps we don't need the > 0.f check here?
		const float velMultiplier = (unitResponse > 0.f) ? (1.f / unitResponse) : 0.f;

		constraint.normal_pen[threadIndexInWarp] = make_float4(normal.x, normal.y, normal.z, pen);
		constraint.barycentric[threadIndexInWarp] = barycentric;
		constraint.velMultiplier[threadIndexInWarp] = velMultiplier;
	}
}
|
||||
|
||||
|
||||
//!
//! \brief : prep cloth vs. rigid body attachment constraints
//!

extern "C" __global__ void cloth_rigidAttachmentPrepareLaunch(
	PxgFEMCloth* clothes,
	PxgFEMRigidAttachment* rigidAttachments,
	PxU32* activeRigidAttachments,
	PxNodeIndex* rigidAttachmentIds,
	PxU32 numRigidAttachments,
	PxgFEMRigidAttachmentConstraint* attachmentConstraints,
	const PxgPrePrepDesc* preDesc,
	const PxgConstraintPrepareDesc* prepareDesc,
	const PxgSolverSharedDescBase* sharedDesc,
	float4* rigidDeltaVel
)
{
	const PxAlignedTransform* bodyFrames = prepareDesc->body2WorldPool;

	const PxU32* solverBodyIndices = preDesc->solverBodyIndices;
	const PxgSolverBodyData* solverBodyData = prepareDesc->solverBodyDataPool;
	const PxgSolverTxIData* solverDataTxIPool = prepareDesc->solverBodyTxIDataPool;

	const PxgBodySim* bodySims = sharedDesc->mBodySimBufferDeviceData;

	const PxU32 nbBlocksRequired = (numRigidAttachments + blockDim.x - 1) / blockDim.x;
	const PxU32 nbIterationsPerBlock = (nbBlocksRequired + gridDim.x - 1) / gridDim.x;

	for (PxU32 i = 0; i < nbIterationsPerBlock; ++i)
	{
		const PxU32 workIndex = i * blockDim.x + threadIdx.x + nbIterationsPerBlock * blockIdx.x * blockDim.x;

		if (workIndex >= numRigidAttachments)
			return;

		// One constraint block per warp; 'offset' is this thread's lane slot.
		const PxU32 index = workIndex / 32;
		const PxU32 offset = workIndex & 31;

		const PxU32 attachmentId = activeRigidAttachments[workIndex];

		const PxgFEMRigidAttachment& attachment = rigidAttachments[attachmentId];
		PxgFEMRigidAttachmentConstraint& constraint = attachmentConstraints[index];

		// index1 encodes cloth id + element index; baryOrType1 distinguishes a
		// single-vertex attachment from a barycentric triangle attachment.
		const PxU32 elemId = attachment.index1;
		const PxU32 clothId = PxGetClothId(elemId);
		const PxU32 elemIdx = PxGetClothElementIndex(elemId);
		const bool elemIsVertex = PxGetIsVertexType(attachment.baryOrType1);

		PxgFEMCloth& cloth = clothes[clothId];

		const float4* pos_invMass = cloth.mPosition_InvMass;
		const float4 low_high_limits = attachment.coneLimitParams.low_high_limits;
		const float4 axis_angle = attachment.coneLimitParams.axis_angle;

		// Attachment point on the cloth: either a single vertex or a barycentric
		// blend over the triangle's vertices (w carries the inverse mass).
		float4 attachmentPose;
		if (elemIsVertex)
		{
			attachmentPose = pos_invMass[elemIdx];
		}
		else
		{
			const float4 barycentric = attachment.baryOrType1;
			const uint4 triVertInd = cloth.mTriangleVertexIndices[elemIdx];
			const float4 pos_iMass0 = pos_invMass[triVertInd.x];
			const float4 pos_iMass1 = pos_invMass[triVertInd.y];
			const float4 pos_iMass2 = pos_invMass[triVertInd.z];
			attachmentPose = pos_iMass0 * barycentric.x + pos_iMass1 * barycentric.y + pos_iMass2 * barycentric.z;
		}

		const float invMass1 = attachmentPose.w;
		const PxVec3 point(attachmentPose.x, attachmentPose.y, attachmentPose.z);
		const PxVec3 axis(axis_angle.x, axis_angle.y, axis_angle.z);

		const float4 ra4 = attachment.localPose0;

		// index0 encodes the rigid node; static bodies keep solver index 0.
		// (Renamed from 'idx' to avoid shadowing a same-named outer local.)
		const PxNodeIndex rigidId = reinterpret_cast<const PxNodeIndex&>(attachment.index0);
		PxU32 solverBodyIdx = 0;
		if (!rigidId.isStaticBody())
		{
			solverBodyIdx = solverBodyIndices[rigidId.index()];
		}

		rigidAttachmentIds[workIndex] = rigidId;

		// Constraint axes: world X/Y/Z.
		const PxVec3 normal0(1.f, 0.f, 0.f);
		const PxVec3 normal1(0.f, 1.f, 0.f);
		const PxVec3 normal2(0.f, 0.f, 1.f);

		if (rigidId.isArticulation())
		{
			const PxU32 nodeIndexA = rigidId.index();
			const PxU32 artiId = bodySims[nodeIndexA].articulationRemapId;

			PxgArticulation& articulation = sharedDesc->articulations[artiId];

			const PxU32 linkID = rigidId.articulationLinkId();
			const PxTransform body2World = articulation.linkBody2Worlds[linkID];

			const PxVec3 bodyFrame0p(body2World.p.x, body2World.p.y, body2World.p.z);

			const PxVec3 worldAxis = (body2World.rotate(axis)).getNormalized();

			PxVec3 ra(ra4.x, ra4.y, ra4.z);
			ra = body2World.rotate(ra);
			const PxVec3 error = ra + bodyFrame0p - point;

			const PxVec3 raXn0 = ra.cross(normal0);
			const PxVec3 raXn1 = ra.cross(normal1);
			const PxVec3 raXn2 = ra.cross(normal2);

			// Per-axis unit responses from the link's spatial response matrix.
			PxSpatialMatrix& spatialResponse = articulation.spatialResponseMatrixW[linkID];
			const Cm::UnAlignedSpatialVector deltaV0 = spatialResponse * Cm::UnAlignedSpatialVector(normal0, raXn0);
			const Cm::UnAlignedSpatialVector deltaV1 = spatialResponse * Cm::UnAlignedSpatialVector(normal1, raXn1);
			const Cm::UnAlignedSpatialVector deltaV2 = spatialResponse * Cm::UnAlignedSpatialVector(normal2, raXn2);

			// BUGFIX: resp1/resp2 previously reused deltaV0; each axis must use
			// its own response vector (deltaV1/deltaV2 were computed but never read).
			const PxReal resp0 = deltaV0.top.dot(raXn0) + deltaV0.bottom.dot(normal0) + invMass1;
			const PxReal resp1 = deltaV1.top.dot(raXn1) + deltaV1.bottom.dot(normal1) + invMass1;
			const PxReal resp2 = deltaV2.top.dot(raXn2) + deltaV2.bottom.dot(normal2) + invMass1;

			const float velMultiplier0 = (resp0 > 0.f) ? (1.f / resp0) : 0.f;
			const float velMultiplier1 = (resp1 > 0.f) ? (1.f / resp1) : 0.f;
			const float velMultiplier2 = (resp2 > 0.f) ? (1.f / resp2) : 0.f;

			const PxReal biasedErr0 = error.dot(normal0);
			const PxReal biasedErr1 = error.dot(normal1);
			const PxReal biasedErr2 = error.dot(normal2);

			constraint.raXn0_biasW[offset] = make_float4(raXn0.x, raXn0.y, raXn0.z, biasedErr0);
			constraint.raXn1_biasW[offset] = make_float4(raXn1.x, raXn1.y, raXn1.z, biasedErr1);
			constraint.raXn2_biasW[offset] = make_float4(raXn2.x, raXn2.y, raXn2.z, biasedErr2);
			//articulation don't use invMass0. We set it to 1.0 here so that the impulse scaling for the linear impulse
			//to convert it to a velocity change remains an impulse if it is dealing with an articulation.
			constraint.velMultiplierXYZ_invMassW[offset] = make_float4(velMultiplier0, velMultiplier1, velMultiplier2, 1.f);
			constraint.elemId[offset] = elemId;
			constraint.rigidId[offset] = rigidId.getInd();
			constraint.baryOrType[offset] = attachment.baryOrType1;
			constraint.low_high_limits[offset] = low_high_limits;
			constraint.axis_angle[offset] = make_float4(worldAxis.x, worldAxis.y, worldAxis.z, axis_angle.w);
		}
		else
		{
			const float4 linVel_invMass0 = solverBodyData[solverBodyIdx].initialLinVelXYZ_invMassW;
			const PxReal invMass0 = linVel_invMass0.w;

			// Kinematic bodies (invMass == 0 but not static) get identity inertia
			// with zero scale so they contribute no angular response.
			PxMat33 invSqrtInertia0;
			PxReal inertiaScale = 1.f;
			if (invMass0 == 0.f && !rigidId.isStaticBody())
			{
				invSqrtInertia0 = PxMat33(PxIdentity);
				inertiaScale = 0.f;
			}
			else
			{
				invSqrtInertia0 = solverDataTxIPool[solverBodyIdx].sqrtInvInertia;
			}

			const PxAlignedTransform bodyFrame0 = bodyFrames[solverBodyIdx];
			const PxVec3 bodyFrame0p(bodyFrame0.p.x, bodyFrame0.p.y, bodyFrame0.p.z);

			PxVec3 ra(ra4.x, ra4.y, ra4.z);
			ra = bodyFrame0.rotate(ra);
			const PxVec3 error = ra + bodyFrame0p - point;

			const PxVec3 worldAxis = (bodyFrame0.rotate(axis)).getNormalized();

			const PxVec3 raXn0 = ra.cross(normal0);
			const PxVec3 raXn1 = ra.cross(normal1);
			const PxVec3 raXn2 = ra.cross(normal2);

			const PxVec3 raXnSqrtInertia0 = invSqrtInertia0 * raXn0;
			const PxVec3 raXnSqrtInertia1 = invSqrtInertia0 * raXn1;
			const PxVec3 raXnSqrtInertia2 = invSqrtInertia0 * raXn2;

			const float resp0 = (raXnSqrtInertia0.dot(raXnSqrtInertia0))*inertiaScale + invMass0 + invMass1;
			const float resp1 = (raXnSqrtInertia1.dot(raXnSqrtInertia1))*inertiaScale + invMass0 + invMass1;
			const float resp2 = (raXnSqrtInertia2.dot(raXnSqrtInertia2))*inertiaScale + invMass0 + invMass1;

			const float velMultiplier0 = (resp0 > 0.f) ? (1.f / resp0) : 0.f;
			const float velMultiplier1 = (resp1 > 0.f) ? (1.f / resp1) : 0.f;
			const float velMultiplier2 = (resp2 > 0.f) ? (1.f / resp2) : 0.f;

			const PxReal biasedErr0 = error.dot(normal0);
			const PxReal biasedErr1 = error.dot(normal1);
			const PxReal biasedErr2 = error.dot(normal2);

			constraint.raXn0_biasW[offset] = make_float4(raXnSqrtInertia0.x, raXnSqrtInertia0.y, raXnSqrtInertia0.z, biasedErr0);
			constraint.raXn1_biasW[offset] = make_float4(raXnSqrtInertia1.x, raXnSqrtInertia1.y, raXnSqrtInertia1.z, biasedErr1);
			constraint.raXn2_biasW[offset] = make_float4(raXnSqrtInertia2.x, raXnSqrtInertia2.y, raXnSqrtInertia2.z, biasedErr2);
			constraint.velMultiplierXYZ_invMassW[offset] = make_float4(velMultiplier0, velMultiplier1, velMultiplier2, invMass0);
			constraint.elemId[offset] = elemId;
			constraint.rigidId[offset] = rigidId.getInd();
			constraint.baryOrType[offset] = attachment.baryOrType1;
			constraint.low_high_limits[offset] = low_high_limits;
			constraint.axis_angle[offset] = make_float4(worldAxis.x, worldAxis.y, worldAxis.z, axis_angle.w);

			// NOTE(review): rigidDeltaVel is only zeroed on this rigid-body path,
			// not for articulations — confirm articulations accumulate elsewhere.
			if (rigidDeltaVel)
			{
				rigidDeltaVel[workIndex] = make_float4(0.f);
				rigidDeltaVel[workIndex + numRigidAttachments] = make_float4(0.f);
			}
		}
	}
}
|
||||
|
||||
1683
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/FEMClothExternalSolve.cu
vendored
Normal file
1683
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/FEMClothExternalSolve.cu
vendored
Normal file
File diff suppressed because it is too large
Load Diff
705
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/FEMClothUtil.cuh
vendored
Normal file
705
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/FEMClothUtil.cuh
vendored
Normal file
@@ -0,0 +1,705 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef __CU_FEMCLOTHUTIL_CUH__
|
||||
#define __CU_FEMCLOTHUTIL_CUH__
|
||||
|
||||
#include "PxgFEMCloth.h"
|
||||
#include "vector_types.h"
|
||||
#include "foundation/PxVec3.h"
|
||||
#include "foundation/PxVec4.h"
|
||||
#include "foundation/PxBounds3.h"
|
||||
#include "copy.cuh"
|
||||
#include "shuffle.cuh"
|
||||
#include "assert.h"
|
||||
#include "stdio.h"
|
||||
#include "PxgFEMClothCoreKernelIndices.h"
|
||||
#include "atomic.cuh"
|
||||
#include "PxsDeformableSurfaceMaterialCore.h"
|
||||
#include "femMidphaseScratch.cuh"
|
||||
#include "GuBV32.h"
|
||||
#include "deformableUtils.cuh"
|
||||
#include "particleSystem.cuh"
|
||||
#include "utils.cuh"
|
||||
|
||||
|
||||
using namespace physx;
|
||||
|
||||
|
||||
/*******************************************************************************
|
||||
*
|
||||
*
|
||||
* Definitions
|
||||
*
|
||||
*
|
||||
******************************************************************************/
|
||||
|
||||
#define FEMCLOTH_SQRT2 1.4142135623730950488016887242097f
|
||||
#define FEMCLOTH_SQRT3 1.7320508075688772935274463415059f
|
||||
|
||||
#define FEMCLOTH_THRESHOLD 1.0e-14f
|
||||
#define FEMCLOTH_PI 3.14159265358979323846f
|
||||
#define FEMCLOTH_HALF_PI 1.57079632679489661923f
|
||||
#define FEMCLOTH_2PI 6.28318530717958647692f
|
||||
#define FEMCLOTH_2PI_INV 0.15915494309189533576888376337251f
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*******************************************************************************
|
||||
*
|
||||
*
|
||||
* Math functions
|
||||
*
|
||||
*
|
||||
******************************************************************************/
|
||||
|
||||
//!
//! \brief : Extract rotation R [r0, r1] from F [f0, f1] in 2D
//! \reference: https://en.wikipedia.org/wiki/Square_root_of_a_2_by_2_matrix
//!

static PX_FORCE_INLINE __device__ void extractRotation2D(PxVec2& r0, PxVec2& r1, const PxVec2& f0, const PxVec2& f1)
{
	// Fallback for degenerate / near-singular F: return the identity rotation.
	auto setIdentity = [&]()
	{
		r0.x = 1.0f;
		r0.y = 0.0f;
		r1.x = 0.0f;
		r1.y = 1.0f;
	};

	// Polar decomposition: R is the rotation part of F.
	// S2 packs the symmetric matrix F^T * F as [xx, yy, xy]:
	// F^T * F = [S2[0], S2[2]]
	//           [S2[2], S2[1]]
	const PxVec3 S2(f0.dot(f0), f1.dot(f1), f0.dot(f1));
	const float det = S2[0] * S2[1] - S2[2] * S2[2];

	if (det < FEMCLOTH_THRESHOLD)
	{
		setIdentity();
		return;
	}

	// sqrt of a 2x2 SPD matrix: S = (F^T F + sqrt(det) * I) / t,
	// with t = sqrt(trace + 2 * sqrt(det)).
	const float s0 = sqrtf(det);
	const float t = sqrtf(S2[0] + S2[1] + 2.0f * s0);

	assert(t > 0.0f);
	if (t < FEMCLOTH_THRESHOLD)
	{
		setIdentity();
		return;
	}
	const float tInv = 1.0f / t;

	PxVec3 S(S2);
	S[0] += s0;
	S[1] += s0;
	S *= tInv;

	const float sDet = S[0] * S[1] - S[2] * S[2];
	assert(sDet > 0.0f);

	if (sDet < FEMCLOTH_THRESHOLD)
	{
		setIdentity();
		return;
	}

	// Invert the symmetric 2x2 S and form R = F * S^-1 = [r0 r1].
	const float sDetInv = 1.0f / sDet;

	PxVec3 SInv(S[1], S[0], -S[2]);
	SInv *= sDetInv;

	r0 = SInv[0] * f0 + SInv[2] * f1;
	r1 = SInv[2] * f0 + SInv[1] * f1;
}
|
||||
|
||||
|
||||
|
||||
//!
|
||||
//! \brief : Approximated atan2: max error of ~1/10000
|
||||
//! \reference: https://mazzo.li/posts/vectorized-atan2.html
|
||||
//!
|
||||
|
||||
// Polynomial approximation of atan(x). Callers (atan2Approx) only pass ratios with
// |x| <= 1, the range the coefficients were fitted for (max error ~1e-4).
static PX_FORCE_INLINE __device__ PxReal atanApprox(PxReal x)
{
	PxReal x2 = x * x;
	// Odd polynomial in x, evaluated in Horner form; do not reorder (float results would change).
	return x * (0.99997726f + x2 * (-0.33262347f + x2 * (0.19354346f + x2 * (-0.11643287f + x2 * (0.05265332f + x2 * (-0.01172120f))))));
}
|
||||
|
||||
static PX_FORCE_INLINE __device__ PxReal atan2Approx(PxReal y, PxReal x)
{
	// Divide the smaller magnitude by the larger so the ratio fed to atanApprox
	// stays within [-1, 1], where the polynomial is accurate.
	bool swap = PxAbs(x) < PxAbs(y);
	PxReal input = swap ? (x / y) : (y / x);
	PxReal output = atanApprox(input);

	// If we computed atan(x/y) instead of atan(y/x), map back: atan2 = +-pi/2 - atan(x/y).
	output = swap ? (input >= 0.f ? FEMCLOTH_HALF_PI : -FEMCLOTH_HALF_PI) - output : output;

	// Quadrant correction: atan only covers (-pi/2, pi/2); shift results into the left half-plane.
	if(x < 0.f)
	{
		output += (y >= 0.f ? FEMCLOTH_PI : -FEMCLOTH_PI);
	}

	// NOTE(review): x == 0 && y == 0 produces NaN (0/0); callers presumably never pass the origin — confirm.
	return output;
}
|
||||
|
||||
//!
//! \brief : Clamps the velocity magnitude to maxVel. When clamping occurs, the position is
//!          re-integrated from prevPos with the clamped velocity, and the resulting position
//!          correction is accumulated into accumDelta. Returns true iff clamping happened.
//!          The .w lanes of pos/vel are preserved (used as inverse mass elsewhere in this file).
//!
static PX_FORCE_INLINE __device__ bool velocityClamping(float4& pos, float4& vel, float4& accumDelta, PxReal maxVel, PxReal dt,
                                                        const float4& prevPos)
{
	const PxReal maxVelSq = maxVel * maxVel;
	const PxReal velMagSq = PxLoad3(vel).magnitudeSquared();
	if (velMagSq > maxVelSq)
	{
		// Rescale velocity to have magnitude exactly maxVel.
		vel *= maxVel / PxSqrt(velMagSq);
		// Re-integrate position over dt from the previous position.
		float4 newPos = prevPos + vel * dt;
		newPos.w = pos.w; // keep the original .w payload
		vel.w = pos.w;

		const float4 delta = newPos - pos;
		pos = newPos;
		accumDelta += delta;
		return true; // Velocity is clamped.
	}

	return false; // Velocity is not clamped.
}
|
||||
|
||||
|
||||
|
||||
|
||||
/*******************************************************************************
|
||||
*
|
||||
*
|
||||
* Delta lambda updates
|
||||
*
|
||||
*
|
||||
******************************************************************************/
|
||||
|
||||
//!
|
||||
//! \brief : Returns the delta lambda in XPBD when a constraint has three vertex degrees of freedom, applicable in both 2D and 3D but without damping.
|
||||
//!
|
||||
|
||||
template <typename PxVec2Or3>
static PX_FORCE_INLINE __device__ float queryDeltaLambda(float C, const PxVec2Or3& dCdx0, const PxVec2Or3& dCdx1, const PxVec2Or3& dCdx2,
                                                         float alphaTilde, float lambda, float massInv0, float massInv1, float massInv2)
{
	// XPBD: deltaLambda = (-C - alphaTilde * lambda) / (sum_i wi * |dC/dxi|^2 + alphaTilde)
	const float denom =
	    (massInv0 * dCdx0.magnitudeSquared() + massInv1 * dCdx1.magnitudeSquared() + massInv2 * dCdx2.magnitudeSquared()) + alphaTilde;
	assert(denom != 0.0f);

	// Near-zero denominator (e.g. all vertices kinematic with zero compliance): no correction.
	if (denom < FEMCLOTH_THRESHOLD)
		return 0.0f;

	return (-C - alphaTilde * lambda) / denom;
}
|
||||
|
||||
//!
|
||||
//! \brief : Returns the delta lambda in XPBD when a constraint has three vertex degrees of freedom, applicable in both 2D and 3D with damping.
|
||||
//!
|
||||
|
||||
template <typename PxVec2Or3>
static PX_FORCE_INLINE __device__ float queryDeltaLambda(float C, const PxVec2Or3& dCdx0, const PxVec2Or3& dCdx1, const PxVec2Or3& dCdx2,
                                                         float alphaTilde, float lambda, float massInv0, float massInv1, float massInv2,
                                                         float damping, float dtInv, float dCdT)
{
	// Damped XPBD: the (1 + damping/dt) factor scales the gradient term, and the
	// constraint time-derivative dCdT contributes damping * dCdT to the numerator.
	const float denom = (1.0f + damping * dtInv) * (massInv0 * dCdx0.magnitudeSquared() + massInv1 * dCdx1.magnitudeSquared() +
	                                               massInv2 * dCdx2.magnitudeSquared()) +
	                    alphaTilde;
	assert(denom != 0.0f);

	// Near-zero denominator: no correction.
	if (denom < FEMCLOTH_THRESHOLD)
		return 0.0f;

	return -(C + alphaTilde * lambda + damping * dCdT) / denom;
}
|
||||
|
||||
//!
|
||||
//! \brief : Returns the delta lambda in XPBD when a constraint has four vertex degrees of freedom, applicable in 3D but without damping.
|
||||
//!
|
||||
|
||||
static PX_FORCE_INLINE __device__ float queryDeltaLambda(float C, const PxVec3& dCdx0, const PxVec3& dCdx1, const PxVec3& dCdx2,
                                                         const PxVec3& dCdx3, float alphaTilde, float lambda, float massInv0,
                                                         float massInv1, float massInv2, float massInv3)
{
	// XPBD over four vertices: deltaLambda = (-C - alphaTilde * lambda) / (sum_i wi * |dC/dxi|^2 + alphaTilde)
	const float denom = (massInv0 * dCdx0.magnitudeSquared() + massInv1 * dCdx1.magnitudeSquared() + massInv2 * dCdx2.magnitudeSquared() +
	                     massInv3 * dCdx3.magnitudeSquared()) +
	                    alphaTilde;
	assert(denom != 0.0f);

	// Near-zero denominator: no correction.
	if (denom < FEMCLOTH_THRESHOLD)
		return 0.0f;

	return (-C - alphaTilde * lambda) / denom;
}
|
||||
|
||||
//!
|
||||
//! \brief : Returns the delta lambda in XPBD when a constraint has four vertex degrees of freedom, applicable in 3D with damping.
|
||||
//!
|
||||
|
||||
static PX_FORCE_INLINE __device__ float queryDeltaLambda(float C, const PxVec3& dCdx0, const PxVec3& dCdx1, const PxVec3& dCdx2,
                                                         const PxVec3& dCdx3, float alphaTilde, float lambda, float massInv0, float massInv1,
                                                         float massInv2, float massInv3, float damping, float dtInv, const float dCdT)
{
	// Damped XPBD over four vertices; see the three-vertex damped overload for the formula shape.
	const float denom = (1.0f + damping * dtInv) * (massInv0 * dCdx0.magnitudeSquared() + massInv1 * dCdx1.magnitudeSquared() +
	                                               massInv2 * dCdx2.magnitudeSquared() + massInv3 * dCdx3.magnitudeSquared()) +
	                    alphaTilde;
	assert(denom != 0.0f);

	// Near-zero denominator: no correction.
	if (denom < FEMCLOTH_THRESHOLD)
		return 0.0f;

	return -(C + alphaTilde * lambda + damping * dCdT) / denom;
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*******************************************************************************
|
||||
*
|
||||
*
|
||||
* Deformation gradient and its derivatives
|
||||
*
|
||||
*
|
||||
******************************************************************************/
|
||||
|
||||
//!
|
||||
//! \brief : query deformation gradients (F \in R^{2x2})
|
||||
//!
|
||||
|
||||
static PX_FORCE_INLINE __device__ void queryDeformationGradient_F2x2(PxVec2& f0, PxVec2& f1, const float4& QInv, const PxVec2& xp01,
                                                                     const PxVec2& xp02)
{
	// F = [f0 f1] = [xp01 xp02] * QInv, where QInv (packed as x,y / z,w) is the inverse
	// of the rest-pose edge matrix and xp01/xp02 are the current in-plane edge vectors.
	f0 = QInv.x * xp01 + QInv.y * xp02;
	f1 = QInv.z * xp01 + QInv.w * xp02;
}
|
||||
|
||||
|
||||
|
||||
//!
|
||||
//! \brief : compute gradient of constraint (F \in R^{2x2})
|
||||
//!
|
||||
|
||||
static PX_FORCE_INLINE __device__ void queryConstraintGradient_F2x2(PxVec2& grad1, PxVec2& grad2, const float4& qInv, const PxVec2& pC_pF0,
                                                                    const PxVec2& pC_pF1)
{
	// Chain rule: dC/dx = (dC/dF) * (dF/dx). With F = [x01 x02] * QInv, the gradients
	// w.r.t. vertices 1 and 2 contract dC/dF with the rows of QInv.
	// (The gradient w.r.t. vertex 0 is -grad1 - grad2, computed by the caller.)
	grad1 = qInv.x * pC_pF0 + qInv.z * pC_pF1;
	grad2 = qInv.y * pC_pF0 + qInv.w * pC_pF1;
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*******************************************************************************
|
||||
*
|
||||
*
|
||||
* Constraint functions
|
||||
*
|
||||
*
|
||||
******************************************************************************/
|
||||
|
||||
//!
|
||||
//! \brief : As-Rigid-As-Possible constraint using {sqrt(||F - R||_F^2)}, F \in R^{2x2}
|
||||
//!
|
||||
|
||||
static inline __device__ void ARAPConstraint_F2X2(float& lambda, PxVec2& dx0, PxVec2& dx1, PxVec2& dx2, float alphaTilde,
                                                  const float4& QInv, const PxVec2& x01, const PxVec2& x02, float massInv0, float massInv1,
                                                  float massInv2, const PxgFEMCloth& shFEMCloth)
{
	PxVec2 f0, f1, r0, r1;      // F = [f0 f1], R = [r0 r1]
	PxVec2 grad0, grad1, grad2; // gradient of constraint

	// F from the current in-plane edges; R = closest rotation to F (polar decomposition).
	queryDeformationGradient_F2x2(f0, f1, QInv, x01, x02);
	extractRotation2D(r0, r1, f0, f1);

	PxVec2 FMinusR0 = f0 - r0;
	PxVec2 FMinusR1 = f1 - r1;

	// NOTE(review): sqrt on a float — resolves to the float overload in device code, but sqrtf would be clearer.
	const float C = sqrt(FMinusR0.dot(FMinusR0) + FMinusR1.dot(FMinusR1)); // ARAP constraint: C = ||F - R||_F

	// C == 0 means the element is already rigid; skip to avoid dividing by C below.
	if(C > FEMCLOTH_THRESHOLD)
	{
		const float CInv = 1.0f / C;

		// pC/pF = [pCA_pF0 pCA_pF1] = (F - R) / C
		const PxVec2 pC_pF0 = CInv * FMinusR0;
		const PxVec2 pC_pF1 = CInv * FMinusR1;

		queryConstraintGradient_F2x2(grad1, grad2, QInv, pC_pF0, pC_pF1);
		grad0 = -grad1 - grad2; // translation invariance: gradients sum to zero

		// XPBD update: accumulate lambda and weight the position corrections by inverse mass.
		const float deltaLambda = queryDeltaLambda(C, grad0, grad1, grad2, alphaTilde, lambda, massInv0, massInv1, massInv2);
		lambda += deltaLambda;

		dx0 += massInv0 * deltaLambda * grad0;
		dx1 += massInv1 * deltaLambda * grad1;
		dx2 += massInv2 * deltaLambda * grad2;
	}
}
|
||||
|
||||
|
||||
|
||||
//!
|
||||
//! \brief : Area conservation constraint
|
||||
//!
|
||||
|
||||
static inline __device__ void areaConstraint_F2X2(float& lambda, PxVec2& dx0, PxVec2& dx1, PxVec2& dx2, float alphaTilde,
                                                  const float4& QInv, const PxVec2& x01, const PxVec2& x02, float massInv0, float massInv1,
                                                  float massInv2, float area, const PxgFEMCloth& shFEMCloth)
{
#if 1

	// Area constraint, cross-product form (active branch):
	// C = |x01 X x02| / (2 * area) - 1.0, i.e. current/rest area ratio minus one.
	const PxReal x01CrossX02 = x01.x * x02.y - x01.y * x02.x;
	const float undeformedAreaInv = 1.0f / area;

	const float C = 0.5f * x01CrossX02 * undeformedAreaInv - 1.0f;

	// Analytic gradients of the signed-area expression w.r.t. vertices 1 and 2.
	const PxVec2 grad1(0.5f * undeformedAreaInv * x02.y, -0.5f * undeformedAreaInv * x02.x);
	const PxVec2 grad2(-0.5f * undeformedAreaInv * x01.y, 0.5f * undeformedAreaInv * x01.x);
	const PxVec2 grad0 = -grad1 - grad2; // translation invariance

#else

	// Alternative (disabled) formulation via the deformation gradient:
	// C = det(F) - 1, F \in R^ { 2x2 }
	PxVec2 f0, f1, r0, r1;      // F = [f0 f1], R = [r0 r1]
	PxVec2 grad0, grad1, grad2; // gradient of constraint

	queryDeformationGradient_F2x2(f0, f1, QInv, x01, x02);

	const PxReal C = f0.x * f1.y - f0.y * f1.x - 1.0f;

	// pC/pF = [pCA_pF0 pCA_pF1] (adjugate of F)
	const PxVec2 pC_pF0(f1.y, -f0.y);
	const PxVec2 pC_pF1(-f1.x, f0.x);

	queryConstraintGradient_F2x2(grad1, grad2, QInv, pC_pF0, pC_pF1);
	grad0 = -grad1 - grad2;

#endif

	// XPBD update shared by both formulations.
	const float deltaLambda = queryDeltaLambda(C, grad0, grad1, grad2, alphaTilde, lambda, massInv0, massInv1, massInv2);
	lambda += deltaLambda;

	dx0 += massInv0 * deltaLambda * grad0;
	dx1 += massInv1 * deltaLambda * grad1;
	dx2 += massInv2 * deltaLambda * grad2;
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*******************************************************************************
|
||||
*
|
||||
*
|
||||
* Energy models
|
||||
*
|
||||
*
|
||||
******************************************************************************/
|
||||
|
||||
//!
|
||||
//! \brief : XPBD formulation of fixed corotated model
|
||||
//!
|
||||
|
||||
// Solves the in-plane (membrane) energy of one triangle via two XPBD constraints:
// 1) an ARAP constraint driven by Lame's second parameter (mu), and
// 2) an area-conservation constraint driven by Lame's first parameter (lambda).
// xx0..xx2 are positions with inverse mass in .w; corrections are written back in place.
// For PGS (!isTGS) the accumulated lambdas are persisted in the cloth's per-triangle arrays.
static __device__ inline void membraneEnergySolvePerTriangle(PxgFEMCloth& shFEMCloth, float4& xx0, float4& xx1, float4& xx2, PxReal dt,
                                                             const PxsDeformableSurfaceMaterialData& material, const float4& QInv,
                                                             float vertexScale0, float vertexScale1, float vertexScale2, PxU32 lambdaIndex,
                                                             bool isShared, bool isTGS)
{
	// Zero stiffness: nothing to solve.
	if (material.youngs < FEMCLOTH_THRESHOLD)
	{
		return;
	}

	PxVec3 x0 = PxLoad3(xx0);
	PxVec3 x1 = PxLoad3(xx1);
	PxVec3 x2 = PxLoad3(xx2);

	const PxVec3 x01 = x1 - x0;
	const PxVec3 x02 = x2 - x0;

	// Orthonormal in-plane basis (axis0, axis1) used to reduce the 3D triangle to 2D.
	// NOTE(review): degenerate triangles (x01 ~ 0 or collinear edges) make these normalizations
	// ill-defined — presumably excluded upstream; confirm.
	const PxVec3 axis0 = x01.getNormalized();
	PxVec3 normal = x01.cross(x02);
	const PxVec3 axis1 = (normal.cross(axis0)).getNormalized();

	const PxReal dt2 = dt * dt;

	// Rest area recovered from det(QInv): det(Q) = 2 * area, so area = 1 / (2 * det(QInv)).
	const PxReal det = QInv.x * QInv.w - QInv.y * QInv.z;
	const PxReal area = 1.0f / (2.0f * det);
	const PxReal volume = area * material.thickness;

	PxVec2 dx0(0.0f), dx1(0.0f), dx2(0.0f);
	float lambda0 = 0.0f, lambda1 = 0.0f;

	// PGS keeps lambdas across iterations; TGS restarts from zero each substep.
	if (!isTGS)
	{
		lambda0 = isShared ? shFEMCloth.mOrderedSharedTriangleLambdas[lambdaIndex].x : shFEMCloth.mOrderedNonSharedTriangleLambdas[lambdaIndex].x;
		lambda1 = isShared ? shFEMCloth.mOrderedSharedTriangleLambdas[lambdaIndex].y : shFEMCloth.mOrderedNonSharedTriangleLambdas[lambdaIndex].y;
	}

	// Lame's parameters (first = lambda, second = mu) from Young's modulus and Poisson's ratio.
	const PxPair<PxReal, PxReal> lames = lameParameters(material.youngs, material.poissons);

	// 1) enforcing ARAP constraint on the projected 2D edges.
	PxVec2 xp01(axis0.dot(x01), axis1.dot(x01));
	PxVec2 xp02(axis0.dot(x02), axis1.dot(x02));

	// Compliance alphaTilde = 1 / (stiffness * volume * dt^2); stiffness here is 2 * mu.
	const PxReal mu = lames.second;
	const PxReal alphaTilde0 = 1.0f / (2.0f * mu * volume * dt2);

	ARAPConstraint_F2X2(lambda0, dx0, dx1, dx2, alphaTilde0, QInv, xp01, xp02, vertexScale0 * xx0.w, vertexScale1 * xx1.w,
	                    vertexScale2 * xx2.w, shFEMCloth);

	// 2) enforcing area constraint (only when Poisson's ratio is non-negligible).
	if (material.poissons > FEMCLOTH_THRESHOLD)
	{
		// Incompressible limit (poissons -> 0.5): infinite stiffness, i.e. zero compliance.
		PxReal alphaTilde1 = 0.0f;

		if(material.poissons < 0.5f - FEMCLOTH_THRESHOLD)
		{
			// Lame's first parameter drives area/volume preservation.
			const PxReal lambda = lames.first;
			alphaTilde1 = 1.0f / (lambda * volume * dt2);
		}

		// Apply the ARAP corrections before evaluating the area constraint (Gauss-Seidel style).
		xp01 += dx1 - dx0;
		xp02 += dx2 - dx0;

		areaConstraint_F2X2(lambda1, dx0, dx1, dx2, alphaTilde1, QInv, xp01, xp02, vertexScale0 * xx0.w, vertexScale1 * xx1.w,
		                    vertexScale2 * xx2.w, area, shFEMCloth);
	}

	// Lift the 2D corrections back into 3D through the in-plane basis.
	x0 += dx0.x * axis0 + dx0.y * axis1;
	x1 += dx1.x * axis0 + dx1.y * axis1;
	x2 += dx2.x * axis0 + dx2.y * axis1;

	// Persist accumulated lambdas for the next PGS iteration.
	if (!isTGS)
	{
		if (isShared)
		{
			shFEMCloth.mOrderedSharedTriangleLambdas[lambdaIndex].x = lambda0;
			shFEMCloth.mOrderedSharedTriangleLambdas[lambdaIndex].y = lambda1;
		}
		else
		{
			shFEMCloth.mOrderedNonSharedTriangleLambdas[lambdaIndex].x = lambda0;
			shFEMCloth.mOrderedNonSharedTriangleLambdas[lambdaIndex].y = lambda1;
		}
	}

	// Write back corrected positions, leaving the .w (inverse mass) lanes untouched.
	xx0.x = x0.x;
	xx0.y = x0.y;
	xx0.z = x0.z;

	xx1.x = x1.x;
	xx1.y = x1.y;
	xx1.z = x1.z;

	xx2.x = x2.x;
	xx2.y = x2.y;
	xx2.z = x2.z;
}
|
||||
|
||||
|
||||
|
||||
//!
|
||||
//! \brief : XPBD formulation of "Discrete Shells"
|
||||
//!
|
||||
|
||||
// Solves one XPBD dihedral-angle (bending) constraint for a pair of triangles
// (x0, x2, x3) and (x1, x3, x2) sharing edge x2-x3. Positions carry inverse mass in .w,
// scaled by vertexReferenceCounts to average contributions across partitions.
// Corrections are applied to x0..x3 in place.
static __device__ inline void bendingEnergySolvePerTrianglePair(PxgFEMCloth& shFEMCloth, float4& x0, float4& x1, float4& x2, float4& x3,
                                                                const float4& vertexReferenceCounts, float dt, PxU32 trianglePairIndex,
                                                                bool isSharedTrianglePartition, bool isTGS)
{
	const PxVec3 x02 = PxLoad3(x2 - x0);
	const PxVec3 x03 = PxLoad3(x3 - x0);
	const PxVec3 x13 = PxLoad3(x3 - x1);
	const PxVec3 x12 = PxLoad3(x2 - x1);
	const PxVec3 x23 = PxLoad3(x3 - x2);
	const PxReal x23Len = x23.magnitude();

	// Degenerate shared edge: bending angle is undefined.
	if(x23Len < FEMCLOTH_THRESHOLD)
		return;

	const PxReal x23LenInv = 1.f / x23Len;
	const PxVec3 x23Normalized = x23 * x23LenInv;

	// Per-pair rest data: x = rest dihedral angle, y = inverse flexural stiffness, z = damping.
	const float4 restBendingAngle_flexuralStiffness_damping =
	    isSharedTrianglePartition ? shFEMCloth.mOrderedSharedRestBendingAngle_flexuralStiffness_damping[trianglePairIndex]
	                              : shFEMCloth.mOrderedNonSharedRestBendingAngle_flexuralStiffness_damping[trianglePairIndex];

	const PxReal restBendingAngle = restBendingAngle_flexuralStiffness_damping.x;
	const PxReal kInv = restBendingAngle_flexuralStiffness_damping.y;

	// Non-positive inverse stiffness disables this bending constraint.
	if (kInv <= 0.f)
		return;

	//const PxReal damping = restBendingAngle_flexuralStiffness_damping.z;

	// Unnormalized triangle normals.
	// NOTE(review): no guard against zero-length normals (degenerate triangles) before the
	// divisions below — presumably excluded upstream; confirm.
	const PxVec3 scaledN0 = x02.cross(x03);
	const PxVec3 scaledN1 = x13.cross(x12);

	const PxReal n0LenInv = 1.f / scaledN0.magnitude();
	const PxReal n1LenInv = 1.f / scaledN1.magnitude();

	const PxVec3 n0 = scaledN0 * n0LenInv;
	PxVec3 n1 = scaledN1 * n1LenInv;

	// Signed dihedral angle about the shared edge.
	const PxReal cosAngle = n0.dot(n1);
	const PxReal sinAngle = n0.cross(n1).dot(x23Normalized);
	PxReal angle = atan2f(sinAngle, cosAngle);

	PxReal C = 0.f;
	PxReal alphaTilde = 0.f;
	float dtInv = 1.0f / dt;

	// XPBD compliance: alphaTilde = kInv / dt^2.
	alphaTilde = kInv * dtInv * dtInv;
	C = angle - restBendingAngle;

	// Wrap the angle error into the branch nearest zero (avoid +-2*pi jumps of atan2).
	if(PxAbs(C + FEMCLOTH_2PI) < PxAbs(C))
	{
		C += FEMCLOTH_2PI;
	}
	else if(PxAbs(C - FEMCLOTH_2PI) < PxAbs(C))
	{
		C -= FEMCLOTH_2PI;
	}

	// Bending constraint clamped.
	C = PxClamp(C, -FEMCLOTH_HALF_PI, FEMCLOTH_HALF_PI);

	// Standard dihedral-angle gradients (Discrete Shells formulation).
	const PxVec3 temp0 = n0 * n0LenInv;
	const PxVec3 temp1 = n1 * n1LenInv;
	const PxVec3 dCdx0 = -x23Len * temp0;
	const PxVec3 dCdx1 = -x23Len * temp1;
	const PxVec3 dCdx2 = x03.dot(x23Normalized) * temp0 + x13.dot(x23Normalized) * temp1;
	const PxVec3 dCdx3 = -(x02.dot(x23Normalized) * temp0 + x12.dot(x23Normalized) * temp1);

	PxReal lambda = 0.0f;

	// PGS keeps accumulated lambdas across iterations; TGS restarts from zero.
	if (!isTGS)
	{
		lambda = isSharedTrianglePartition ? shFEMCloth.mSharedBendingLambdas[trianglePairIndex] :
		                                     shFEMCloth.mNonSharedBendingLambdas[trianglePairIndex];
	}

	float deltaLambda =
	    queryDeltaLambda(C, dCdx0, dCdx1, dCdx2, dCdx3, alphaTilde, lambda, vertexReferenceCounts.x * x0.w, vertexReferenceCounts.y * x1.w,
	                     vertexReferenceCounts.z * x2.w, vertexReferenceCounts.w * x3.w);

	// Persist the updated lambda for the next PGS iteration.
	if (!isTGS)
	{
		if (isSharedTrianglePartition)
		{
			shFEMCloth.mSharedBendingLambdas[trianglePairIndex] = lambda + deltaLambda;
		}
		else
		{
			shFEMCloth.mNonSharedBendingLambdas[trianglePairIndex] = lambda + deltaLambda;
		}
	}

	// Apply mass-weighted corrections along the constraint gradients.
	PxReal scale0 = vertexReferenceCounts.x * x0.w * deltaLambda;
	x0.x += scale0 * dCdx0.x;
	x0.y += scale0 * dCdx0.y;
	x0.z += scale0 * dCdx0.z;

	PxReal scale1 = vertexReferenceCounts.y * x1.w * deltaLambda;
	x1.x += scale1 * dCdx1.x;
	x1.y += scale1 * dCdx1.y;
	x1.z += scale1 * dCdx1.z;

	PxReal scale2 = vertexReferenceCounts.z * x2.w * deltaLambda;
	x2.x += scale2 * dCdx2.x;
	x2.y += scale2 * dCdx2.y;
	x2.z += scale2 * dCdx2.z;

	PxReal scale3 = vertexReferenceCounts.w * x3.w * deltaLambda;
	x3.x += scale3 * dCdx3.x;
	x3.y += scale3 * dCdx3.y;
	x3.z += scale3 * dCdx3.z;

	return;
}
|
||||
|
||||
//!
|
||||
//! \brief : Cloth shell energies in a triangle-pair (two adjacent triangles): in-plane + bending
|
||||
//!
|
||||
|
||||
// Solves the full shell energy for one shared triangle pair: an in-plane (membrane)
// constraint per triangle, then the bending constraint across the pair.
// x2-x3 is the shared edge; x0/x1 are the apex vertices of triangle0/triangle1.
static __device__ inline void
clothSharedEnergySolvePerTrianglePair(PxgFEMCloth& shFEMCloth, float4& x0, float4& x1, float4& x2, float4& x3,
                                      const float4& vertexReferenceCount, const PxsDeformableSurfaceMaterialData* PX_RESTRICT clothMaterials,
                                      float dt, PxU32 trianglePairIndex, bool isTGS)
{
	// shared edge: the shared edge between two adjacent triangles (triangle0, triangle1).
	// edge0, edge1: non-shared edge in triangle0 and triangle1, respectively.
	// tri0Count, tri1Count: the number of references to triangle0 and triangle1 in the entire triangle pairs.
	const float4 restData0 = shFEMCloth.mOrderedSharedRestEdge0_edge1[trianglePairIndex];
	const float4 restData1 = shFEMCloth.mOrderedSharedRestEdgeLength_material0_material1[trianglePairIndex];

	// Material indices are packed as floats in restData1.y/.z.
	const PxU32 globalMaterialIndex0 = static_cast<PxU32>(restData1.y);
	const PxU32 globalMaterialIndex1 = static_cast<PxU32>(restData1.z);

	const PxVec2 restEdge0(restData0.x, restData0.y);
	const PxVec2 restEdge1(restData0.z, restData0.w);
	const float restSharedEdgeLength = restData1.x;

	// det of each triangle's rest edge matrix (= 2 * rest area); used to invert it below.
	const float det0 = restSharedEdgeLength * restEdge0.y;
	const float det1 = restSharedEdgeLength * restEdge1.y;

	// In-plane constraint for triangle0 with vertex x2, x3, and x0.
	if(PxAbs(det0) > FEMCLOTH_THRESHOLD)
	{
		// Two lambdas (ARAP + area) per triangle, hence the 2x index stride.
		const PxU32 lambdaIndex = 2*trianglePairIndex;
		const float4 QInv0 = make_float4(restEdge0.y, 0.0f, -restEdge0.x, restSharedEdgeLength) / det0;
		membraneEnergySolvePerTriangle(shFEMCloth, x2, x3, x0, dt, clothMaterials[globalMaterialIndex0], QInv0, vertexReferenceCount.z,
		                               vertexReferenceCount.w, vertexReferenceCount.x, lambdaIndex, true, isTGS);
	}

	// In-plane constraint for triangle1 with vertex x2, x3, and x1.
	if(PxAbs(det1) > FEMCLOTH_THRESHOLD)
	{
		const PxU32 lambdaIndex = 2 * trianglePairIndex + 1;
		const float4 QInv1 = make_float4(restEdge1.y, 0.0f, -restEdge1.x, restSharedEdgeLength) / det1;
		membraneEnergySolvePerTriangle(shFEMCloth, x2, x3, x1, dt, clothMaterials[globalMaterialIndex1], QInv1, vertexReferenceCount.z,
		                               vertexReferenceCount.w, vertexReferenceCount.y, lambdaIndex, true, isTGS);
	}

	// Bending constraint for the triangle pair
	bendingEnergySolvePerTrianglePair(shFEMCloth, x0, x1, x2, x3, vertexReferenceCount, dt, trianglePairIndex, true, isTGS);
}
|
||||
|
||||
#endif // FEMCLOTHUTIL
|
||||
790
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/SDFConstruction.cu
vendored
Normal file
790
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/SDFConstruction.cu
vendored
Normal file
@@ -0,0 +1,790 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
|
||||
#include "foundation/PxSimpleTypes.h"
|
||||
#include "foundation/PxVec3.h"
|
||||
#include "foundation/PxVec4.h"
|
||||
|
||||
#include "atomic.cuh"
|
||||
#include "reduction.cuh"
|
||||
#include "bvh.cuh"
|
||||
#include "GuSDF.h"
|
||||
#include "utils.cuh"
|
||||
|
||||
using namespace physx;
|
||||
|
||||
// Empty host-side hook; presumably referenced to keep this module's kernels linked/registered — TODO confirm.
extern "C" __host__ void initSdfConstructionKernels0() {}
|
||||
|
||||
// Returns the midpoint of the AABB stored at itemLowers[index] / itemUppers[index]
// (the .w lanes of the packed bounds are ignored).
PX_FORCE_INLINE __device__ PxVec3 getCenter(PxU32 index, const float4* PX_RESTRICT itemLowers, const float4* PX_RESTRICT itemUppers)
{
	const PxVec3 boundsMin = PxLoad3(itemLowers[index]);
	const PxVec3 boundsMax = PxLoad3(itemUppers[index]);
	return (boundsMin + boundsMax) * 0.5f;
}
|
||||
|
||||
// One thread per item: computes a Morton code for each item's AABB center, normalized into
// the total grid via gridLower/gridInvEdges, and initializes the identity index array for
// the subsequent key sort. itemPriorities is optional (may be null).
extern "C" __global__ void bvhCalculateMortonCodes(const float4* PX_RESTRICT itemLowers, const float4* PX_RESTRICT itemUppers, const PxI32* PX_RESTRICT itemPriorities, PxI32 n,
                                                   const PxVec3* gridLower, const PxVec3* gridInvEdges, PxI32* PX_RESTRICT indices, PxI32* PX_RESTRICT keys)
{
	const PxI32 index = blockDim.x*blockIdx.x + threadIdx.x;

	if (index < n)
	{
		PxVec3 center = getCenter(index, itemLowers, itemUppers);

		// Normalize the center into [0, 1]^3 grid space.
		PxVec3 local = (center - gridLower[0]).multiply(gridInvEdges[0]);

		PxI32 key;
		if (itemPriorities)
		{
			// 9-bit Morton codes stored in lower 27bits (512^3 effective resolution)
			// 5-bit priority code stored in the upper 5-bits
			key = morton3<512>(local.x, local.y, local.z);

			// we invert priorities (so that higher priority items appear first in sorted order)
			key |= (~itemPriorities[index]) << 27;
		}
		else
		{
			// No priorities: full 10-bit-per-axis Morton code (1024^3 resolution).
			key = morton3<1024>(local.x, local.y, local.z);
		}

		indices[index] = index;
		keys[index] = key;
	}
}
|
||||
|
||||
// calculate the index of the first differing bit between two adjacent Morton keys
|
||||
// calculate the index of the first differing bit between two adjacent Morton keys
// (stored as float(a ^ b): the XOR grows with the position of the highest differing bit,
// which is what the hierarchy builder compares).
extern "C" __global__ void bvhCalculateKeyDeltas(const PxI32* PX_RESTRICT keys, PxReal* PX_RESTRICT deltas, PxI32 n)
{
	const PxI32 index = blockDim.x*blockIdx.x + threadIdx.x;

	// One delta per adjacent key pair, so only n - 1 threads write.
	if (index + 1 < n)
	{
		PxI32 a = keys[index];
		PxI32 b = keys[index + 1];

		//if (a > b)
		//	printf("Elements not sorted\n");

		PxI32 x = a ^ b;

		deltas[index] = PxReal(x); // reinterpret_cast<PxReal&>(x);// This should work since x is positive
	}
}
|
||||
|
||||
// calculate the index of the first differing bit between two adjacent Morton keys
|
||||
// Alternative delta metric for the hierarchy builder: instead of the Morton-key XOR,
// uses the squared distance between adjacent item centers (items must already be in
// sorted order) so that spatially close items get merged first.
extern "C" __global__ void bvhCalculateKeyDeltasSquaredDistance(const PxI32* PX_RESTRICT keys, PxReal* PX_RESTRICT deltas, PxI32 n,
                                                                const float4* PX_RESTRICT itemLowers, const float4* PX_RESTRICT itemUppers)
{
	const PxI32 index = blockDim.x*blockIdx.x + threadIdx.x;

	// One delta per adjacent pair, so only n - 1 threads write.
	if (index + 1 < n)
	{
		//PxI32 a = keys[index];
		//PxI32 b = keys[index + 1];

		//itemLowers and itemUppers must be in sorted order
		PxVec3 centerA = getCenter(index, itemLowers, itemUppers);
		PxVec3 centerB = getCenter(index + 1, itemLowers, itemUppers);
		PxReal distanceSquared = (centerA - centerB).magnitudeSquared();

		//if (a > b)
		//	printf("Elements not sorted\n");

		//PxI32 x = a ^ b;

		deltas[index] = distanceSquared;
	}
}
|
||||
|
||||
// One thread per leaf: gathers each sorted item's bounds (indices maps sorted slot ->
// original item) into packed BVH leaf nodes, and seeds each leaf's key range as [index, index].
// The upper node additionally carries the item's .w payload (used as a distance offset downstream).
extern "C" __global__ void bvhBuildLeaves(const float4* PX_RESTRICT itemLowers, const float4* PX_RESTRICT itemUppers,
                                          PxI32 n, const PxI32* PX_RESTRICT indices, PxI32* PX_RESTRICT rangeLefts, PxI32* PX_RESTRICT rangeRights, PxgPackedNodeHalf* PX_RESTRICT lowers, PxgPackedNodeHalf* PX_RESTRICT uppers)
{
	const PxI32 index = blockDim.x*blockIdx.x + threadIdx.x;

	if (index < n)
	{
		const PxI32 item = indices[index];

		const PxVec3 lower = PxLoad3(itemLowers[item]);
		const float4 upper = itemUppers[item];

		// write leaf nodes
		lowers[index] = makeNode(lower, item, true);
		uppers[index] = makeNode(PxLoad3(upper), upper.w);

		// write leaf key ranges
		rangeLefts[index] = index;
		rangeRights[index] = index;
	}
}
|
||||
|
||||
// One thread per triangle: computes each triangle's AABB (fattened by margin) and
// writes it into the packed float4 bounds arrays consumed by the BVH builder.
extern "C" __global__ void bvhComputeTriangleBounds(const PxVec3* PX_RESTRICT vertices, const PxU32* PX_RESTRICT triangleIndices, PxU32 numTriangles,
                                                    float4* PX_RESTRICT itemLowers, float4* PX_RESTRICT itemUppers, PxReal margin)
{
	const PxI32 index = blockDim.x*blockIdx.x + threadIdx.x;

	if (index < numTriangles)
	{
		PxVec3 a = vertices[triangleIndices[3 * index + 0]];
		PxVec3 b = vertices[triangleIndices[3 * index + 1]];
		PxVec3 c = vertices[triangleIndices[3 * index + 2]];

		PxBounds3 bounds(a, a);
		bounds.include(b);
		bounds.include(c);
		bounds.fattenFast(margin);

		// .w lanes are zeroed (unused at this stage).
		itemLowers[index] = make_float4(bounds.minimum.x, bounds.minimum.y, bounds.minimum.z, 0.0f);
		itemUppers[index] = make_float4(bounds.maximum.x, bounds.maximum.y, bounds.maximum.z, 0.0f);

		/*printf("%i %f %f %f %f %f %f\n", index, bounds.minimum.x, bounds.minimum.y, bounds.minimum.z, bounds.maximum.x, bounds.maximum.y, bounds.maximum.z);*/
		//printf("%i %i %i %i %f %f %f\n", index, triangleIndices[3 * index + 0], triangleIndices[3 * index + 1], triangleIndices[3 * index + 2], a.x, a.y, a.z);
	}
}
|
||||
|
||||
// Thin kernel wrapper around the shared buildHierarchy device routine (bvh.cuh):
// builds the internal BVH nodes bottom-up from the n leaves using the precomputed deltas.
extern "C" __global__ void bvhBuildHierarchy(PxI32 n, PxI32* root, PxU32* maxTreeDepth, const PxReal* PX_RESTRICT deltas, PxI32* PX_RESTRICT numChildren,
                                             volatile PxI32* PX_RESTRICT rangeLefts, volatile PxI32* PX_RESTRICT rangeRights, volatile PxgPackedNodeHalf* PX_RESTRICT lowers, volatile PxgPackedNodeHalf* PX_RESTRICT uppers)
{
	buildHierarchy(n, root, maxTreeDepth, deltas, numChildren, rangeLefts, rangeRights, lowers, uppers);
}
|
||||
|
||||
|
||||
// Variant of bvhBuildHierarchy that additionally accumulates winding-number cluster
// approximations per internal node (via WindingClusterBuilder) as the tree is built.
extern "C" __global__ void bvhBuildHierarchyAndWindingClusters(PxI32 n, PxI32* root, PxU32* maxTreeDepth, const PxReal* PX_RESTRICT deltas, PxI32* PX_RESTRICT numChildren,
                                                               volatile PxI32* PX_RESTRICT rangeLefts, volatile PxI32* PX_RESTRICT rangeRights, volatile PxgPackedNodeHalf* PX_RESTRICT lowers, volatile PxgPackedNodeHalf* PX_RESTRICT uppers,
                                                               PxgWindingClusterApproximation* clusters, const PxVec3* vertices, const PxU32* indices)
{
	WindingClusterBuilder w(clusters, vertices, indices, n);
	buildHierarchy(n, root, maxTreeDepth, deltas, numChildren, rangeLefts, rangeRights, lowers, uppers, w);
}
|
||||
|
||||
// Component-wise atomic max of v into *address (three independent float atomics;
// the vector update as a whole is not atomic).
PX_FORCE_INLINE __device__ void AtomicMaxVec3(PxVec3* address, const PxVec3 v)
{
	PxReal* components = reinterpret_cast<PxReal*>(address);
	AtomicMax(components + 0, v.x);
	AtomicMax(components + 1, v.y);
	AtomicMax(components + 2, v.z);
}
|
||||
|
||||
// Component-wise atomic min of v into *address (three independent float atomics;
// the vector update as a whole is not atomic).
PX_FORCE_INLINE __device__ void AtomicMinVec3(PxVec3* address, const PxVec3 v)
{
	PxReal* components = reinterpret_cast<PxReal*>(address);
	AtomicMin(components + 0, v.x);
	AtomicMin(components + 1, v.y);
	AtomicMin(components + 2, v.z);
}
|
||||
|
||||
// Reduces all item AABBs to one global AABB: block-level min/max reductions, then one
// atomic merge per block into totalLower/totalUpper (which the caller must pre-initialize
// to +/-FLT_MAX). The blockReduction<.., 256> template hard-codes a 256-thread block.
extern "C" __global__ void bvhComputeTotalBounds(const float4* itemLowers, const float4* itemUppers, PxVec3* totalLower, PxVec3* totalUpper, PxI32 numItems)
{
	const PxI32 blockStart = blockDim.x*blockIdx.x;
	// NOTE(review): numValid is computed but never used.
	const PxI32 numValid = min(numItems - blockStart, blockDim.x);

	const PxI32 tid = blockStart + threadIdx.x;

	// Warp mask of in-range threads, passed to the reductions.
	PxU32 mask = __ballot_sync(FULL_MASK, tid < numItems);
	// NOTE(review): the __syncthreads() calls below sit inside this divergent branch; in the
	// tail block, out-of-range threads skip the barriers. Presumably blockReduction tolerates
	// this for the launch configs used — confirm.
	if (tid < numItems)
	{
		PxVec3 lower = PxLoad3(itemLowers[tid]);
		PxVec3 upper = PxLoad3(itemUppers[tid]);

		// Per-component block max of the upper bounds.
		PxVec3 blockUpper;
		blockUpper.x = blockReduction<MaxOpFloat, PxReal, 256>(mask, upper.x, -FLT_MAX);
		__syncthreads();
		blockUpper.y = blockReduction<MaxOpFloat, PxReal, 256>(mask, upper.y, -FLT_MAX);
		__syncthreads();
		blockUpper.z = blockReduction<MaxOpFloat, PxReal, 256>(mask, upper.z, -FLT_MAX);

		// sync threads because second reduce uses same temp storage as first
		__syncthreads();

		// Per-component block min of the lower bounds.
		PxVec3 blockLower;
		blockLower.x = blockReduction<MinOpFloat, PxReal, 256>(mask, lower.x, FLT_MAX);
		__syncthreads();
		blockLower.y = blockReduction<MinOpFloat, PxReal, 256>(mask, lower.y, FLT_MAX);
		__syncthreads();
		blockLower.z = blockReduction<MinOpFloat, PxReal, 256>(mask, lower.z, FLT_MAX);

		if (threadIdx.x == 0)
		{
			// write out block results, expanded by the radius
			AtomicMaxVec3(totalUpper, blockUpper);
			AtomicMinVec3(totalLower, blockLower);
		}
	}
}
|
||||
|
||||
// compute inverse edge length, this is just done on the GPU to avoid a CPU->GPU sync point
|
||||
// Single-thread kernel: derives per-axis inverse edge lengths of the total AABB.
// Done on the GPU purely to avoid a CPU->GPU sync point.
extern "C" __global__ void bvhComputeTotalInvEdges(const PxVec3* totalLower, const PxVec3* totalUpper, PxVec3* totalInvEdges)
{
	// Pad every axis by a small epsilon so zero-extent boxes still produce finite inverses.
	const PxVec3 extents = (totalUpper[0] - totalLower[0]) + PxVec3(0.0001f);
	totalInvEdges[0] = PxVec3(1.0f / extents.x, 1.0f / extents.y, 1.0f / extents.z);
}
|
||||
|
||||
// Reads the distance offset packed into the .w lane of a node half's float4 alias.
PX_FORCE_INLINE __device__ PxReal getDistanceOffset(const PxgPackedNodeHalf& o)
{
	const float4& packed = reinterpret_cast<const float4&>(o);
	return packed.w;
}
|
||||
|
||||
// Writes the distance offset into the .w lane of a node half's float4 alias.
PX_FORCE_INLINE __device__ void setDistanceOffset(PxgPackedNodeHalf& o, PxReal distanceOffset)
{
	float4& packed = reinterpret_cast<float4&>(o);
	packed.w = distanceOffset;
}
|
||||
|
||||
//The point is encoded as the center of a leaf node bounding box
// BVH visitor that tracks the closest point-cloud sample to a query point.
// Each leaf carries an extra distance offset (in its upper node's w slot)
// that is combined in quadrature with the center distance.
struct ClosestDistanceToPointCloudTraversalWithOffset
{
public:
	PxVec3 mQueryPoint;       // point for which the closest sample is searched
	PxReal mClosestDistance;  // best (smallest) distance found so far

	PX_FORCE_INLINE __device__ ClosestDistanceToPointCloudTraversalWithOffset()
	{
	}

	PX_FORCE_INLINE __device__ ClosestDistanceToPointCloudTraversalWithOffset(const PxVec3& queryPoint, PxReal initialClosestDistance = 100000000000.0f)
		: mQueryPoint(queryPoint), mClosestDistance(initialClosestDistance)
	{
	}

	// Squared distance from 'point' to the axis-aligned box [minimum, maximum].
	PX_FORCE_INLINE __device__ PxReal distancePointBoxSquared(const PxVec3& minimum, const PxVec3& maximum, const PxVec3& point)
	{
		const PxVec3 clamped = minimum.maximum(maximum.minimum(point));
		return (clamped - point).magnitudeSquared();
	}

	// Visitor callback: prunes subtrees that cannot beat the current best,
	// refines the best distance at leaves.
	PX_FORCE_INLINE __device__ BvhTraversalControl::Enum operator()(const PxgPackedNodeHalf& lower, const PxgPackedNodeHalf& upper, PxI32 nodeIndex)
	{
		const PxVec3 boxMin(lower.x, lower.y, lower.z);
		const PxVec3 boxMax(upper.x, upper.y, upper.z);

		if (distancePointBoxSquared(boxMin, boxMax, mQueryPoint) >= mClosestDistance * mClosestDistance)
			return BvhTraversalControl::eDontGoDeeper;

		if (lower.b)
		{
			// Leaf: the encoded sample is the box center; add its offset in quadrature.
			const PxVec3 samplePoint = PxVec3(0.5f * (lower.x + upper.x), 0.5f * (lower.y + upper.y), 0.5f * (lower.z + upper.z));

			const PxReal distanceOffset = getDistanceOffset(upper);
			const PxReal candidateSq = (mQueryPoint - samplePoint).magnitudeSquared() + distanceOffset * distanceOffset;
			if (candidateSq < mClosestDistance * mClosestDistance)
				mClosestDistance = PxSqrt(candidateSq);

			return BvhTraversalControl::eDontGoDeeper;
		}

		return BvhTraversalControl::eGoDeeper;
	}
};
|
||||
|
||||
// Casts a ray against the mesh and reports facing information of the closest hit:
// closestDotProduct (ray dir vs. triangle normal) and whether the closest hit lies
// on a triangle edge. Returns true when any triangle was hit.
PX_FORCE_INLINE __device__ bool traceInteriorRay(const PxgBvhTriangleMesh& mesh, const PxVec3& origin, const PxVec3& dir, PxI32* stack, PxU32 stackSize, PxReal& closestDotProduct, bool& closestPointOnTriangleEdge)
{
	ClosestRayIntersectionTraversal visitor(mesh.mVertices, mesh.mTriangles, origin, dir, true);
	queryBVH(mesh.mBvh, visitor, stack, stackSize);

	closestDotProduct = visitor.closestDotProduct;
	closestPointOnTriangleEdge = visitor.closestPointOnTriangleEdge;

	return visitor.hasHit();
}
|
||||
|
||||
// Computes a dense signed distance field, one thread per grid sample.
// The unsigned distance comes from a closest-point BVH query; the sign is
// decided by a cheap 3-axis interior ray test, falling back to the (more
// expensive) winding-number evaluation whenever the ray test is unreliable
// (grazing hits, hits on triangle edges, or axes that disagree).
extern "C" __global__ __launch_bounds__(256, 1) void sdfCalculateDenseGridHybrid(PxgBvhTriangleMesh mesh, const PxgWindingClusterApproximation* PX_RESTRICT windingNumberClusters,
	Gu::GridQueryPointSampler sampler, PxU32 sizeX, PxU32 sizeY, PxU32 sizeZ, PxReal* PX_RESTRICT sdfData)
{
	// Per-thread BVH traversal stack in local memory (shared-memory variant kept for reference).
	const PxU32 stackSize = 47;
	//__shared__ PxI32 stackMem[256 * stackSize];
	PxI32 stackMem[stackSize];

	// block addressing
	const PxI32 x = blockIdx.x*blockDim.x + threadIdx.x;
	const PxI32 y = blockIdx.y*blockDim.y + threadIdx.y;
	const PxI32 z = blockIdx.z*blockDim.z + threadIdx.z;

	//const PxI32 threadId = threadIdx.z * 8 * 8 + threadIdx.y * 8 + threadIdx.x;

	if (x < sizeX && y < sizeY && z < sizeZ)
	{
		// Mesh bounds taken from the BVH root, used to pick the shorter ray direction per axis.
		PxU32 roodNodeId = *mesh.mBvh.mRootNode;
		PxVec3 meshBoundsMin = mesh.mBvh.mNodeLowers[roodNodeId].getXYZ();
		PxVec3 meshBoundsMax = mesh.mBvh.mNodeUppers[roodNodeId].getXYZ();

		const PxVec3 p = sampler.getPoint(x, y, z);

		PxI32* stack = &stackMem[/*stackSize * threadId*/0];

		// Unsigned distance to the mesh surface.
		ClosestDistanceToTriangleMeshTraversal distQuery(mesh.mTriangles, mesh.mVertices, p);
		queryBVH(mesh.mBvh, distQuery, stack, stackSize);
		PxReal d = PxSqrt(distQuery.mClosestDistanceSquared);

		PxReal sign = 1.0f;

		// Set when a ray result is unreliable and the winding-number fallback must decide.
		bool repeatInsideTest = false;

		// Number of axis rays whose closest hit was back-facing (suggesting an interior point).
		PxI32 parity = 0;
		// Hits with |dot| at or below this are treated as grazing and therefore unreliable.
		PxReal threshold = 0.01f;
		PxReal closestDotProduct;
		bool closestPointOnTriangleEdge;

		// For every axis, shoot towards the nearer face of the mesh bounds.
		// x-axis
		if (traceInteriorRay(mesh, p, PxVec3(PxAbs(p.x - meshBoundsMin.x) < PxAbs(meshBoundsMax.x - p.x) ? -1.0f : 1.0f, 0.0f, 0.0f), stack, stackSize, closestDotProduct, closestPointOnTriangleEdge))
		{
			if (closestDotProduct < 0.0f)
				parity++;
			if (closestPointOnTriangleEdge || PxAbs(closestDotProduct) <= threshold)
				repeatInsideTest = true;
		}

		// y-axis (skipped once a previous axis was already inconclusive)
		if (!repeatInsideTest && traceInteriorRay(mesh, p, PxVec3(0.0f, PxAbs(p.y - meshBoundsMin.y) < PxAbs(meshBoundsMax.y - p.y) ? -1.0f : 1.0f, 0.0f), stack, stackSize, closestDotProduct, closestPointOnTriangleEdge))
		{
			if (closestDotProduct < 0.0f)
				parity++;
			if (closestPointOnTriangleEdge || PxAbs(closestDotProduct) <= threshold)
				repeatInsideTest = true;
		}

		// z-axis
		if (!repeatInsideTest && traceInteriorRay(mesh, p, PxVec3(0.0f, 0.0f, PxAbs(p.z - meshBoundsMin.z) < PxAbs(meshBoundsMax.z - p.z) ? -1.0f : 1.0f), stack, stackSize, closestDotProduct, closestPointOnTriangleEdge))
		{
			if (closestDotProduct < 0.0f)
				parity++;
			if (closestPointOnTriangleEdge || PxAbs(closestDotProduct) <= threshold)
				repeatInsideTest = true;
		}

		// Inside only when all three rays agree; partial agreement goes to the fallback.
		if (parity == 3)
			sign = -1.0f;
		else if (parity != 0)
		{
			repeatInsideTest = true;
		}

		if (repeatInsideTest)
		{
			//Fall back to winding numbers for problematic points
			WindingNumberTraversal windingNumber(mesh.mTriangles, mesh.mNumTriangles, mesh.mVertices, windingNumberClusters, p);
			queryBVH(mesh.mBvh, windingNumber, stack, stackSize);
			bool inside = windingNumber.mWindingNumber > 0.5f;
			if (inside)
				sign = -1.0f;
		}

		sdfData[Gu::idx3D(x, y, z, sizeX, sizeY)] = d * sign;
	}
	// NOTE(review): reached by all threads (the guarded block above contains no
	// barriers) but appears to serve no purpose at kernel end - confirm removable.
	__syncthreads();
}
|
||||
|
||||
// Computes a dense signed distance grid, one thread per sample: unsigned
// distance from a closest-point BVH query, sign from a winding-number query
// (winding number > 0.5 means inside => negative distance). Optionally also
// stores the raw winding number per sample.
extern "C" __global__ __launch_bounds__(256, 1) void sdfCalculateDenseGridBlocks(PxgBvhTriangleMesh mesh, const PxgWindingClusterApproximation* PX_RESTRICT windingNumberClusters,
	Gu::GridQueryPointSampler sampler, PxU32 sizeX, PxU32 sizeY, PxU32 sizeZ, PxReal* PX_RESTRICT sdfData, PxReal* PX_RESTRICT windingNumbers)
{
	// Per-thread BVH traversal stack in local memory.
	const PxU32 stackSize = 47;
	PxI32 traversalStack[stackSize];

	const PxU32 xi = blockIdx.x * blockDim.x + threadIdx.x;
	const PxU32 yi = blockIdx.y * blockDim.y + threadIdx.y;
	const PxU32 zi = blockIdx.z * blockDim.z + threadIdx.z;

	if (xi >= sizeX || yi >= sizeY || zi >= sizeZ)
		return;

	const PxVec3 samplePos = sampler.getPoint(xi, yi, zi);

	// Unsigned distance to the mesh surface.
	ClosestDistanceToTriangleMeshTraversal distanceVisitor(mesh.mTriangles, mesh.mVertices, samplePos);
	queryBVH(mesh.mBvh, distanceVisitor, traversalStack, stackSize);
	const PxReal unsignedDistance = PxSqrt(distanceVisitor.mClosestDistanceSquared);

	// Approximate generalized winding number decides inside/outside.
	WindingNumberTraversal windingVisitor(mesh.mTriangles, mesh.mNumTriangles, mesh.mVertices, windingNumberClusters, samplePos);
	queryBVH(mesh.mBvh, windingVisitor, traversalStack, stackSize);
	const bool isInside = windingVisitor.mWindingNumber > 0.5f;

	const PxU32 outputIndex = Gu::idx3D(xi, yi, zi, sizeX, sizeY);
	sdfData[outputIndex] = (isInside ? -1.0f : 1.0f) * unsignedDistance;

	if (windingNumbers)
		windingNumbers[outputIndex] = windingVisitor.mWindingNumber;
}
|
||||
|
||||
// Integer cube: i^3 (no overflow checking).
PX_FORCE_INLINE PX_CUDA_CALLABLE PxU32 pow3(PxU32 i)
{
	const PxU32 square = i * i;
	return square * i;
}
|
||||
|
||||
// True when the closed intervals [minA, maxA] and [minB, maxB] intersect.
PX_FORCE_INLINE PX_CUDA_CALLABLE bool rangesOverlaps(PxReal minA, PxReal maxA, PxReal minB, PxReal maxB)
{
	// Intervals are disjoint exactly when one starts after the other ends.
	const bool disjoint = (minA > maxB) || (minB > maxA);
	return !disjoint;
}
|
||||
|
||||
// Builds the coarse background SDF by copying every cellsPerSubgrid-th sample
// of the dense SDF (whose sample grid is (width+1) x (height+1) x (depth+1)).
// One thread per background sample.
extern "C" __global__ void sdfPopulateBackgroundSDF(PxU32 cellsPerSubgrid, PxReal* PX_RESTRICT backgroundSDF, PxU32 backgroundSizeX, PxU32 backgroundSizeY, PxU32 backgroundSizeZ,
	const PxReal* PX_RESTRICT sdf, PxU32 width, PxU32 height, PxU32 depth)
{
	const PxI32 globalId = (blockIdx.x * blockDim.x) + threadIdx.x;

	if (globalId >= backgroundSizeX * backgroundSizeY * backgroundSizeZ)
		return;

	PxU32 bx, by, bz;
	Gu::idToXYZ(globalId, backgroundSizeX, backgroundSizeY, bx, by, bz);

	// Index of the coinciding sample in the fine grid.
	const PxU32 sourceIndex = Gu::idx3D(bx * cellsPerSubgrid, by * cellsPerSubgrid, bz * cellsPerSubgrid, width + 1, height + 1);
	backgroundSDF[globalId] = sdf[sourceIndex];
}
|
||||
|
||||
|
||||
// One thread block per subgrid (blockIdx = subgrid coordinates; threads stride
// over the subgrid's samples). Decides whether a subgrid needs dense SDF
// storage: required when its value range intersects the narrow band around the
// surface AND the interpolated coarse background SDF is not already accurate
// enough. Also accumulates the global min/max over all required subgrids
// (used later for quantization). Writes 1/0 into subgridInfo and subgridActive.
//
// Fix vs. original: the kernel declaration was missing its `void` return type
// (required for a __global__ function); restored. No other code change.
extern "C" __global__
__launch_bounds__(PxgBVHKernelBlockDim::BUILD_SDF, 1)
void sdfMarkRequiredSdfSubgrids(PxReal* PX_RESTRICT backgroundSDF, const PxReal* PX_RESTRICT sdf, PxU32* PX_RESTRICT subgridInfo, PxU8* PX_RESTRICT subgridActive, PxU32 cellsPerSubgrid, PxU32 width, PxU32 height, PxU32 depth,
	PxU32 backgroundSizeX, PxU32 backgroundSizeY, PxU32 backgroundSizeZ, PxReal narrowBandThickness, PxReal* subgridGlobalMinValue, PxReal* subgridGlobalMaxValue, PxReal errorThreshold)
{
	// One scratch slot per warp for each of the three block reductions below.
	__shared__ PxReal sharedMemoryX[PxgBVHKernelBlockDim::BUILD_SDF / WARP_SIZE];
	__shared__ PxReal sharedMemoryY[PxgBVHKernelBlockDim::BUILD_SDF / WARP_SIZE];
	__shared__ PxReal sharedMemoryZ[PxgBVHKernelBlockDim::BUILD_SDF / WARP_SIZE];

	Gu::DenseSDF coarseEval(backgroundSizeX, backgroundSizeY, backgroundSizeZ, backgroundSDF); //TODO: Replace with 3d texture?
	PxReal s = 1.0f / cellsPerSubgrid;

	//A subgrid has pow3(cellsPerSubgrid) cells but pow3(cellsPerSubgrid + 1) samples
	PxU32 numSamplesPerSubgrid = pow3(cellsPerSubgrid + 1);

	// Per-thread partial results, combined block-wide below.
	PxReal sdfMin = FLT_MAX;
	PxReal sdfMax = -FLT_MAX;
	PxReal maxAbsError = 0.0f;

	for (PxU32 i = threadIdx.x; i < numSamplesPerSubgrid; i += blockDim.x)
	{
		PxU32 xLocal, yLocal, zLocal;
		Gu::idToXYZ(i, cellsPerSubgrid + 1, cellsPerSubgrid + 1, xLocal, yLocal, zLocal);

		// Global sample coordinates of this subgrid sample in the dense grid.
		PxU32 x = blockIdx.x * cellsPerSubgrid + xLocal;
		PxU32 y = blockIdx.y * cellsPerSubgrid + yLocal;
		PxU32 z = blockIdx.z * cellsPerSubgrid + zLocal;

		const PxU32 index = Gu::idx3D(x, y, z, width + 1, height + 1);
		PxReal sdfValue = sdf[index];

		sdfMin = PxMin(sdfMin, sdfValue);
		sdfMax = PxMax(sdfMax, sdfValue);

		// Interpolation error of the coarse SDF against the exact fine value at this sample.
		maxAbsError = PxMax(maxAbsError, PxAbs(sdfValue - coarseEval.sampleSDFDirect(PxVec3(blockIdx.x + xLocal * s, blockIdx.y + yLocal * s, blockIdx.z + zLocal * s))));
	}

	__syncthreads();

	// The three reductions use distinct shared arrays, so no barrier is needed between them.
	sdfMin = blockReduction<MinOpFloat, PxReal>(FULL_MASK, sdfMin, FLT_MAX, blockDim.x, sharedMemoryX);
	sdfMax = blockReduction<MaxOpFloat, PxReal>(FULL_MASK, sdfMax, -FLT_MAX, blockDim.x, sharedMemoryY);
	maxAbsError = blockReduction<MaxOpFloat, PxReal>(FULL_MASK, maxAbsError, -FLT_MAX, blockDim.x, sharedMemoryZ);

	__syncthreads();

	if (threadIdx.x == 0)
	{
		// Required when any sample lies within the narrow band around the surface...
		bool subgridRequired = rangesOverlaps(sdfMin, sdfMax, -narrowBandThickness, narrowBandThickness);
		if (maxAbsError < errorThreshold)
			subgridRequired = false; //No need for a subgrid if the coarse SDF is already almost exact

		// subgridInfo/subgridActive are indexed per coarse CELL (hence the -1 sizes).
		PxU32 index = Gu::idx3D(blockIdx.x, blockIdx.y, blockIdx.z, backgroundSizeX - 1, backgroundSizeY - 1);

		if (subgridRequired)
		{
			AtomicMin(subgridGlobalMinValue, sdfMin);
			AtomicMax(subgridGlobalMaxValue, sdfMax);

			subgridInfo[index] = 1;
			subgridActive[index] = 1;
		}
		else
		{
			subgridInfo[index] = 0;
			subgridActive[index] = 0;
		}
	}
}
|
||||
|
||||
// Quantizes a normalized value (expected in [0, 1], clamped defensively) to an
// unsigned integer of bytesPerSubgridPixel bytes (1 or 2) and stores it at
// 'index' into 'destination'. Other pixel sizes are a programming error.
PX_FORCE_INLINE __device__ void storeQuantized(void* PX_RESTRICT destination, PxU32 index, PxReal vNormalized, PxU32 bytesPerSubgridPixel)
{
	const PxReal clamped = PxClamp(vNormalized, 0.0f, 1.0f);

	if (bytesPerSubgridPixel == 1)
	{
		PxU8* ptr8 = reinterpret_cast<PxU8*>(destination);
		ptr8[index] = PxU8(255.0f * clamped);
	}
	else if (bytesPerSubgridPixel == 2)
	{
		PxU16* ptr16 = reinterpret_cast<PxU16*>(destination);
		ptr16[index] = PxU16(65535.0f * clamped);
	}
	else
	{
		// Only 8 and 16 bit quantization are supported here (32 bit is stored unquantized by the caller).
		assert(0);
	}
}
|
||||
|
||||
// One thread block per subgrid cell of the coarse grid (blockIdx = subgrid
// coordinates). For subgrids marked active, copies their dense SDF samples into
// the packed 3D-texture atlas, quantizing to 8/16 bit when requested
// (bytesPerSubgridPixel 1 or 2; 4 stores raw floats). subgridInfo is rewritten
// in place: 0xFFFFFFFF for unused subgrids, an encoded (x,y,z) atlas address
// for used ones.
extern "C" __global__
__launch_bounds__(PxgBVHKernelBlockDim::BUILD_SDF, 1)
void sdfPopulateSdfSubgrids(const PxReal* PX_RESTRICT denseSDF, PxU32 width, PxU32 height, PxU32 depth, PxU32* PX_RESTRICT subgridInfo, PxU8* PX_RESTRICT subgridActive, PxU32 subgridSize, PxU32 w, PxU32 h, PxU32 d,
	void* PX_RESTRICT quantizedSparseSDFIn3DTextureFormat, PxU32 numSubgridsX, PxU32 numSubgridsY, PxU32 numSubgridsZ, const PxReal* subgridsMinSdfValue,
	const PxReal* subgridsMaxSdfValue, PxU32 bytesPerSubgridPixel, PxU32 outputSize)
{
	const PxU32 idx = Gu::idx3D(blockIdx.x, blockIdx.y, blockIdx.z, w, h);
	//if (idx >= w*h*d)
	//	printf("out of range 1\n");

	// Linear allocation slot assigned to this subgrid by a preceding pass
	// (presumably a scan/compaction over the active flags - verify against callers).
	const PxU32 addressInfo = subgridInfo[idx];

	__syncthreads(); //Make sure that all threads in thread block have read the addressInfo

	if (subgridActive[idx] == 0)
	{
		// Every thread of the block writes the same marker value, so this is race-free.
		subgridInfo[idx] = 0xFFFFFFFF;
		return; //Subgrid does not need to be created
	}

	//if (addressInfo == 0xFFFFFFFF)
	//	printf("address %i %i %i %i\n", addressInfo, PxU32(activeSubgrids[idx]), w, h);

	// Convert the linear slot into 3D atlas coordinates (in units of subgrids).
	PxU32 addressX, addressY, addressZ;
	Gu::idToXYZ(addressInfo, numSubgridsX, numSubgridsY, addressX, addressY, addressZ);

	if (threadIdx.x == 0)
		subgridInfo[idx] = Gu::encodeTriple(addressX, addressY, addressZ);

	//if (addressX >= numSubgridsX || addressY >= numSubgridsY || addressZ >= numSubgridsZ)
	//	printf("kernel, subgrid index out of bounds %i %i %i %i\n", addressX, addressY, addressZ, addressInfo);

	// From subgrid units to sample units inside the texture atlas.
	addressX *= (subgridSize + 1);
	addressY *= (subgridSize + 1);
	addressZ *= (subgridSize + 1);

	//A subgrid has pow3(subgridSize) cells but pow3(subgridSize + 1) samples
	PxU32 numSamplesPerSubgrid = pow3(subgridSize + 1);

	PxU32 tex3DsizeX = numSubgridsX * (subgridSize + 1);
	PxU32 tex3DsizeY = numSubgridsY * (subgridSize + 1);
	//PxU32 tex3DsizeZ = numSubgridsZ * (subgridSize + 1);

	// Threads stride over all samples of this subgrid.
	for (PxU32 i = threadIdx.x; i < numSamplesPerSubgrid; i += blockDim.x)
	{
		PxU32 xLocal, yLocal, zLocal;
		Gu::idToXYZ(i, subgridSize + 1, subgridSize + 1, xLocal, yLocal, zLocal);

		// Source sample location in the dense SDF.
		const PxU32 index = Gu::idx3D(
			blockIdx.x * subgridSize + xLocal,
			blockIdx.y * subgridSize + yLocal,
			blockIdx.z * subgridSize + zLocal,
			width + 1, height + 1);

		/*if(index >= (width+1)*(height + 1)*(depth + 1))
			printf("out of range 2\n");*/

		PxReal sdfValue = denseSDF[index];
		PxU32 outputIndex = Gu::idx3D(addressX + xLocal, addressY + yLocal, addressZ + zLocal, tex3DsizeX, tex3DsizeY);

		// Defensive bounds check against the output buffer size in bytes.
		if (outputIndex * bytesPerSubgridPixel < outputSize)
		{
			if (bytesPerSubgridPixel == 4)
			{
				// Full precision: store the float directly.
				PxReal* ptr32 = reinterpret_cast<PxReal*>(quantizedSparseSDFIn3DTextureFormat);
				ptr32[outputIndex] = sdfValue;
			}
			else
			{
				// Normalize into [0, 1] using the global min/max over all subgrid values, then quantize.
				PxReal s = 1.0f / (subgridsMaxSdfValue[0] - subgridsMinSdfValue[0]);
				PxReal vNormalized = (sdfValue - subgridsMinSdfValue[0]) * s;
				storeQuantized(quantizedSparseSDFIn3DTextureFormat, outputIndex, vNormalized, bytesPerSubgridPixel);
			}
		}
		/*else
		{
			printf("out of range %i %i %i %i %i %i %i %i %i %i\n", addressX, xLocal, addressY, yLocal, addressZ, zLocal, bytesPerSubgridPixel, outputIndex, addressInfo, PxU32(activeSubgrids[idx]));
		}*/
	}
	//__syncthreads();
}
|
||||
|
||||
// Scans the SDF for cells whose value jumps across a sign change faster than the
// geometric distance between neighboring cells allows (violating the eikonal
// property; this happens where the source mesh has holes). In counting mode
// (itemLowers/itemUppers == NULL) it only advances *atomicCounter; in collection
// mode it also compacts one correction record per affected cell:
// itemLowers = world-space position, itemUppers = (grid x, grid y, grid z, corrected value).
// NOTE: all launched threads must reach globalScanExclusive below, including
// out-of-range ones - do not early-return before it.
__device__ void findHoles(const PxReal* PX_RESTRICT sdf, const PxU32 width, const PxU32 height, const PxU32 depth, const PxVec3 cellSize,
	PxU32* atomicCounter, const Gu::GridQueryPointSampler* sampler, float4* PX_RESTRICT itemLowers, float4* PX_RESTRICT itemUppers, PxU32 capacity)
{
	PxI32 id = ((blockIdx.x * blockDim.x) + threadIdx.x);

	bool valueChanged = false;
	PxReal newValue = 0.0f;
	// Only initialized for in-range threads; guarded by valueChanged below.
	PxU32 px, py, pz;

	if (id < width * height * depth)
	{
		PxReal initialValue = sdf[id];
		// Work on the magnitude; the sign is restored at the end.
		newValue = PxAbs(initialValue);

		Gu::idToXYZ(id, width, height, px, py, pz);

		// Visit the 26-neighborhood, clamped at the grid border.
		for (PxU32 z = PxMax(1u, pz) - 1; z <= PxMin(depth - 1, pz + 1); ++z)
			for (PxU32 y = PxMax(1u, py) - 1; y <= PxMin(height - 1, py + 1); ++y)
				for (PxU32 x = PxMax(1u, px) - 1; x <= PxMin(width - 1, px + 1); ++x)
				{
					if (x == px && y == py && z == pz)
						continue;

					PxU32 index = Gu::idx3D(x, y, z, width, height);
					if (index >= width * height * depth)
						continue;

					PxReal value = sdf[index];

					// Only sign changes can indicate a hole-induced jump.
					if (PxSign(initialValue) != PxSign(value))
					{
						// Euclidean distance between the two cell centers.
						PxReal distance = 0;
						if (x != px)
							distance += cellSize.x * cellSize.x;
						if (y != py)
							distance += cellSize.y * cellSize.y;
						if (z != pz)
							distance += cellSize.z * cellSize.z;

						distance = PxSqrt(distance);

						PxReal delta = PxAbs(value - initialValue);

						// A valid SDF cannot change by more than the traveled distance;
						// the 0.99 factor adds a small tolerance. Shrink the magnitude
						// proportionally when the jump is too large.
						if (0.99f * delta > distance)
						{
							PxReal scaling = distance / delta;
							PxReal v = 0.99f * scaling * initialValue;
							newValue = PxMin(newValue, PxAbs(v));
						}
					}
				}

		// Restore the original sign.
		if (initialValue < 0)
			newValue = -newValue;

		valueChanged = newValue != initialValue;
	}

	// Grid-wide compaction: every thread contributes its flag (false for out-of-range threads).
	PxU32 outputIdx = globalScanExclusive<PxgBVHKernelBlockDim::SDF_FIX_HOLES / WARP_SIZE>(valueChanged, atomicCounter);

	if (valueChanged && itemLowers && itemUppers)
	{
		const PxVec3 p = sampler->getPoint(px, py, pz);

		assert(outputIdx < capacity);

		itemLowers[outputIdx] = make_float4(p.x, p.y, p.z, 0.0f);
		// Grid coordinates + corrected value; converted to world space by sdfApplyHoleCorrections.
		itemUppers[outputIdx] = make_float4(px, py, pz, newValue);
	}
}
|
||||
|
||||
// Counting pass: tallies the number of grid cells findHoles would correct,
// without writing any correction records. Used to size the buffers consumed
// by sdfFindHoles.
extern "C" __global__
__launch_bounds__(PxgBVHKernelBlockDim::SDF_FIX_HOLES, 1)
void sdfCountHoles(const PxReal* PX_RESTRICT sdf, const PxU32 width, const PxU32 height, const PxU32 depth, const PxVec3 cellSize,
	PxU32* atomicCounter)
{
	// NULL outputs put findHoles into counting mode (only the atomic counter advances).
	findHoles(sdf, width, height, depth, cellSize, atomicCounter, NULL, NULL, NULL, 0);
}
|
||||
|
||||
|
||||
//If the triangle mesh used to compute the SDF has a hole, then the SDF values near a sign change will not satisfy the eikonal equation.
//This kernel fixes those jumps along sign changes.
//Afterwards a jump flood algorithm can be used to fix the vicinity of the sign change.
|
||||
// Collection pass: runs findHoles with output buffers, appending one correction
// record per affected cell (compacted via the atomic counter). 'capacity' must
// match the count produced by sdfCountHoles.
extern "C" __global__
__launch_bounds__(PxgBVHKernelBlockDim::SDF_FIX_HOLES, 1)
void sdfFindHoles(const PxReal* PX_RESTRICT sdf, const PxU32 width, const PxU32 height, const PxU32 depth, const PxVec3 cellSize,
	PxU32* atomicCounter, const Gu::GridQueryPointSampler sampler,
	float4* PX_RESTRICT itemLowers, float4* PX_RESTRICT itemUppers, PxU32 capacity)
{
	findHoles(sdf, width, height, depth, cellSize, atomicCounter, &sampler, itemLowers, itemUppers, capacity);
}
|
||||
|
||||
// Writes the corrected SDF values produced by sdfFindHoles back into the dense
// grid, and rewrites each record in place: itemUppers.xyz (grid coordinates)
// are replaced by the world-space sample position, w keeps the corrected value.
// One thread per correction record.
extern "C" __global__ void sdfApplyHoleCorrections(PxReal* PX_RESTRICT sdf, PxU32 width, PxU32 height, PxU32 depth,
	Gu::GridQueryPointSampler sampler,
	PxVec4* PX_RESTRICT itemUppers, PxU32 numCorrections)
{
	const PxI32 correctionId = (blockIdx.x * blockDim.x) + threadIdx.x;

	if (correctionId >= numCorrections)
		return;

	const PxVec4 record = itemUppers[correctionId];

	// Grid coordinates were stored as floats by findHoles.
	const PxU32 gx = PxU32(record.x);
	const PxU32 gy = PxU32(record.y);
	const PxU32 gz = PxU32(record.z);

	sdf[Gu::idx3D(gx, gy, gz, width, height)] = record.w;

	const PxVec3 worldPos = sampler.getPoint(gx, gy, gz);
	itemUppers[correctionId] = PxVec4(worldPos.x, worldPos.y, worldPos.z, record.w);
}
|
||||
|
||||
//This can be launched on an existing SDF to fix distances given a point cloud where every leaf node was corrected due to a sign change in the SDF causing a gap in distance values larger than the cell size.
|
||||
//These kind of gaps can occur at places where the input triangle mesh has holes. Watertight meshes don't need this kind of post process repair.
|
||||
//The fast marching method or jump flood could be used as well to fix those defects but they need either many kernel launches or much more memory compared to the point cloud tree.
|
||||
// Post-process pass over an existing SDF: for every sample, queries the
// correction point-cloud BVH and shrinks the stored magnitude when a closer
// (offset-augmented) sample exists. The sign of the existing value is kept.
// One thread per grid sample.
extern "C" __global__ __launch_bounds__(256, 1) void sdfCalculateDenseGridPointCloud(PxgBVH bvh,
	Gu::GridQueryPointSampler sampler, PxU32 sizeX, PxU32 sizeY, PxU32 sizeZ, PxReal* PX_RESTRICT sdfData)
{
	// Per-thread BVH traversal stack in local memory.
	const PxU32 stackSize = 47;
	PxI32 traversalStack[stackSize];

	const PxI32 xi = blockIdx.x * blockDim.x + threadIdx.x;
	const PxI32 yi = blockIdx.y * blockDim.y + threadIdx.y;
	const PxI32 zi = blockIdx.z * blockDim.z + threadIdx.z;

	if (xi >= sizeX || yi >= sizeY || zi >= sizeZ)
		return;

	const PxU32 cellIndex = Gu::idx3D(xi, yi, zi, sizeX, sizeY);
	const PxReal previousValue = sdfData[cellIndex];
	const PxReal previousMagnitude = PxAbs(previousValue);

	// Seed the query with the current magnitude so the traversal can prune
	// everything that cannot improve on it.
	ClosestDistanceToPointCloudTraversalWithOffset query(sampler.getPoint(xi, yi, zi), previousMagnitude);
	queryBVH(bvh, query, traversalStack, stackSize);

	if (query.mClosestDistance < previousMagnitude)
	{
		// Only the magnitude shrinks; the inside/outside sign is preserved.
		const PxReal corrected = (previousValue < 0.0f) ? -query.mClosestDistance : query.mClosestDistance;
		sdfData[cellIndex] = corrected;
	}
}
|
||||
438
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/algorithms.cu
vendored
Normal file
438
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/algorithms.cu
vendored
Normal file
@@ -0,0 +1,438 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#include "foundation/PxSimpleTypes.h"
|
||||
#include "foundation/PxVec3.h"
|
||||
#include "foundation/PxVec4.h"
|
||||
#include "cuda.h"
|
||||
#include "PxgAlgorithmsData.h"
|
||||
#include "stdio.h"
|
||||
|
||||
using namespace physx;
|
||||
|
||||
// No-op host entry point (presumably referenced to force this translation
// unit's kernels to be linked in - verify against the module loader).
extern "C" __host__ void initAlgorithmsKernels0() {}
|
||||
|
||||
// 16 ints stored as four int4 rows; the scan kernels treat it as 16
// independent integer channels processed together.
struct int4x4
{
	int4 data[4];
};
|
||||
|
||||
// Assembles an int4x4 from its four int4 rows.
PX_FORCE_INLINE PX_CUDA_CALLABLE int4x4 make_int16(const int4& a, const int4& b, const int4& c, const int4& d)
{
	const int4x4 result = { { a, b, c, d } };
	return result;
}
|
||||
|
||||
// Componentwise addition for the CUDA int4 vector type.
PX_FORCE_INLINE PX_CUDA_CALLABLE int4 operator+(const int4& lhs, const int4& rhs)
{
	return make_int4(lhs.x + rhs.x, lhs.y + rhs.y, lhs.z + rhs.z, lhs.w + rhs.w);
}
|
||||
|
||||
// Componentwise addition for int4x4: adds all four rows.
PX_FORCE_INLINE PX_CUDA_CALLABLE int4x4 operator+(const int4x4& lhs, const int4x4& rhs)
{
	int4x4 sum;
	for (int row = 0; row < 4; ++row)
		sum.data[row] = lhs.data[row] + rhs.data[row];
	return sum;
}
|
||||
|
||||
// Warp shuffle-up of an int4, performed as four scalar shuffles.
PX_FORCE_INLINE __device__ int4 shfl_up_sync(PxU32 mask, int4 var, PxU32 delta, int width)
{
	const int sx = __shfl_up_sync(mask, var.x, delta, width);
	const int sy = __shfl_up_sync(mask, var.y, delta, width);
	const int sz = __shfl_up_sync(mask, var.z, delta, width);
	const int sw = __shfl_up_sync(mask, var.w, delta, width);
	return make_int4(sx, sy, sz, sw);
}
|
||||
|
||||
// Warp shuffle-up of an int4x4, row by row.
PX_FORCE_INLINE __device__ int4x4 shfl_up_sync(PxU32 mask, int4x4 var, PxU32 delta, int width)
{
	int4x4 shifted;
	for (int row = 0; row < 4; ++row)
		shifted.data[row] = shfl_up_sync(mask, var.data[row], delta, width);
	return shifted;
}
|
||||
|
||||
// Scalar PxU32 overload so the templated scan code can call shfl_up_sync uniformly.
PX_FORCE_INLINE __device__ PxU32 shfl_up_sync(PxU32 mask, PxU32 var, PxU32 delta, int width)
{
	return __shfl_up_sync(mask, var, delta, width);
}
|
||||
|
||||
// Scalar PxI32 overload so the templated scan code can call shfl_up_sync uniformly.
PX_FORCE_INLINE __device__ PxI32 shfl_up_sync(PxU32 mask, PxI32 var, PxU32 delta, int width)
{
	return __shfl_up_sync(mask, var, delta, width);
}
|
||||
|
||||
// 64-bit overload; __shfl_up_sync supports 64-bit integer operands directly.
PX_FORCE_INLINE __device__ PxU64 shfl_up_sync(PxU32 mask, PxU64 var, PxU32 delta, int width)
{
	return __shfl_up_sync(mask, var, delta, width);
}
|
||||
|
||||
// Additive identity for T, used by the scan code for out-of-range lanes.
// The generic version relies on T's value initialization.
template<typename T>
PX_FORCE_INLINE __device__ T zero()
{
	return T();
}
|
||||
|
||||
// Specialization: plain unsigned integer zero.
template<>
PX_FORCE_INLINE __device__ PxU32 zero<PxU32>()
{
	return 0;
}
|
||||
|
||||
// Specialization for the CUDA int4 vector type.
template<>
PX_FORCE_INLINE __device__ int4 zero<int4>()
{
	return make_int4(0, 0, 0, 0);
}
|
||||
|
||||
// Specialization for int4x4: all four rows are zero vectors.
template<>
PX_FORCE_INLINE __device__ int4x4 zero<int4x4>()
{
	const int4 zeroRow = zero<int4>();
	return make_int16(zeroRow, zeroRow, zeroRow, zeroRow);
}
|
||||
|
||||
// Inclusive prefix sum of 'value' over the 32 lanes of the calling warp
// (value is updated in place; lane_id must be the caller's lane index, 0-31).
// All 32 lanes must call this together: a full-warp mask is used for the shuffles.
//
// Fix vs. original: the loop bound was `i <= 32`, which ran a sixth shuffle
// round with delta 32. That round can never contribute (no lane satisfies
// lane_id >= 32, and the shuffle source is out of range), so it only wasted a
// warp-wide shuffle per scan. Offsets 1,2,4,8,16 already cover 32 lanes.
template<typename T>
__device__ void warpScan(T& value, const int lane_id)
{
#pragma unroll
	for (int i = 1; i < 32; i *= 2)
	{
		unsigned int mask = 0xffffffff;
		T n = shfl_up_sync(mask, value, i, 32);

		// Lanes that would read from a negative lane keep their value.
		if (lane_id >= i)
			value = value + n;
	}
}
|
||||
|
||||
// Block-wide inclusive scan of 'value' across all threads of the block.
// - 'id' is only used to derive the lane index (id % warpSize).
// - 'sum' (optional) receives the block total, written by the last thread.
// Requires dynamic shared memory for (blockDim.x / warpSize) elements of T
// (the extern sumsMemory array below); blockDim.x must be a multiple of
// warpSize. Contains __syncthreads(), so all threads of the block must call it.
//
// Fix vs. original: the inter-warp shuffle mask was computed as
// `(1 << (blockDim.x / warpSize)) - 1` with a 32-bit int. For a block of
// 1024 threads (32 warps) that is a shift by 32 - undefined behavior that in
// practice yields mask 0, breaking the __shfl_up_sync participant set. The
// mask is now computed safely; behavior for fewer warps is unchanged.
template<typename T>
__device__ T scanPerBlock(
	T value,
	const PxU32 id,
	T* sum)
{
	extern __shared__ PxU32 sumsMemory[];
	T* sums = reinterpret_cast<T*>(sumsMemory);
	int lane_id = id % warpSize;
	// determine a warp_id within a block
	int warp_id = threadIdx.x / warpSize;

	// Step 1: inclusive scan within each warp via shuffles.
	warpScan(value, lane_id);

	// value now holds the scan value for the individual thread;
	// next combine the per-warp totals.

	__syncthreads(); //Required before accessing shared memory because this function can be called inside loops

	// The last lane of each warp holds the warp total; stash it in shared memory.
	if (threadIdx.x % warpSize == warpSize - 1)
	{
		sums[warp_id] = value;
	}

	__syncthreads();

	// Step 2: the first warp scans the per-warp totals
	// (same shuffle-based scan, performed over numWarps lanes).
	if (warp_id == 0 && lane_id < (blockDim.x / warpSize))
	{
		T warp_sum = sums[lane_id];

		const int numWarps = blockDim.x / warpSize;
		// Safe mask of the participating lanes (all 32 when the block has 32 warps).
		unsigned int mask = (numWarps >= 32) ? 0xffffffffu : ((1u << numWarps) - 1u);
		for (int i = 1; i <= numWarps; i *= 2)
		{
			T n = shfl_up_sync(mask, warp_sum, i, numWarps);

			if (lane_id >= i)
				warp_sum = warp_sum + n;
		}

		sums[lane_id] = warp_sum;
	}

	__syncthreads();

	// Step 3: every warp (except the first) offsets its values by the scanned
	// total of all preceding warps.
	T blockSum = zero<T>();

	if (warp_id > 0)
	{
		blockSum = sums[warp_id - 1];
	}

	value = value + blockSum;

	// The last thread holds the block total; optionally publish it.
	if (sum != NULL && threadIdx.x == blockDim.x - 1)
		*sum = value;

	return value;
}
|
||||
|
||||
// Shared implementation of the per-block scan kernels. Scans 'data' (length
// 'length') block-wise; each block's total goes to partialSums[blockIdx.x] for
// the follow-up addBlockSums pass. exclusiveScan != 0 shifts the result right
// by one within each block and zeroes the block's first element - the block
// offset added later by addBlockSums completes the exclusive scan across
// blocks. totalSum, if non-NULL, receives the inclusive grand total (written
// by the thread handling the last element).
template<typename T>
__device__ void scanPerBlockKernelShared(
	int id,
	const T* data,
	T* result,
	T* partialSums,
	const PxU32 length,
	const PxU32 exclusiveScan,
	T* totalSum)
{
	// Out-of-range threads contribute zero but must still enter scanPerBlock
	// (it contains block-wide barriers).
	T value = id < length ? data[id] : zero<T>();
	value = scanPerBlock(value, id, &partialSums[blockIdx.x]);
	if (totalSum && id == length - 1)
		*totalSum = value;

	// Now write out our result
	if (id < length && result)
	{
		if (exclusiveScan == 0)
			result[id] = value;
		else
		{
			// Shift right within the block; the block's first thread writes the zero element.
			if (threadIdx.x + 1 < blockDim.x && id + 1 < length)
				result[id + 1] = value;
			if (threadIdx.x == 0)
				result[id] = zero<T>();
		}
	}
}
|
||||
|
||||
// PxU32 instantiation of the per-block scan. Launch with dynamic shared memory
// of (blockDim.x / warpSize) * sizeof(PxU32) for scanPerBlock's warp sums.
extern "C" __global__ __launch_bounds__(1024, 1) void scanPerBlockKernel(
	const PxU32* data,
	PxU32* result,
	PxU32* partialSums,
	const PxU32 length,
	const PxU32 exclusiveScan,
	PxU32* totalSum)
{
	scanPerBlockKernelShared<PxU32>((blockIdx.x * blockDim.x) + threadIdx.x, data, result, partialSums, length, exclusiveScan, totalSum);
}
|
||||
|
||||
// In-place exclusive prefix sum over the 16 PxU32 components of *values,
// viewed as a flat array. Must be called by all threads of the block (contains
// __syncthreads()); only threads 0-15 carry data, the rest scan zeros harmlessly.
__device__ void exclusiveSumInt16(int4x4* values)
{
	__syncthreads();
	PxU32* ptr = reinterpret_cast<PxU32*>(values);
	// Threads 0-15 each load one component; all other lanes contribute zero.
	PxU32 value = threadIdx.x < 16 ? ptr[threadIdx.x] : 0;
	warpScan(value, threadIdx.x % warpSize);
	__syncthreads();
	// Shift the inclusive result right by one slot to make it exclusive.
	if (threadIdx.x < 15)
		ptr[threadIdx.x + 1] = value;
	if (threadIdx.x == 15)
		ptr[0] = 0;
}
|
||||
|
||||
// Per-block scan over int4x4 elements: 16 integer channels scanned in four
// int4 passes. Exclusive shift is handled like in scanPerBlockKernelShared.
// When this is the only block (gridDim.x == 1) and totalSum is given, the 16
// channel totals are additionally converted into an exclusive prefix sum in
// place (presumably used as bucket offsets - verify against callers).
// Launch with dynamic shared memory for scanPerBlock's warp sums.
extern "C" __global__ __launch_bounds__(512, 1) void scanPerBlockKernel4x4(
	const int4x4* data,
	int4x4* result,
	int4x4* partialSums,
	const PxU32 length,
	const PxU32 exclusiveScan,
	int4x4* totalSum)
{
	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
	// Destination element of this thread (shifted by one slot for exclusive scans).
	int4x4* r = NULL;
	if (id < length && result)
	{
		if (exclusiveScan == 0)
			r = &result[id];
		else
		{
			if (threadIdx.x + 1 < blockDim.x && id + 1 < length)
				r = &result[id + 1];
			if (threadIdx.x == 0)
			{
				// The block's first element of an exclusive scan is zero.
				result[id].data[0] = zero<int4>();
				result[id].data[1] = zero<int4>();
				result[id].data[2] = zero<int4>();
				result[id].data[3] = zero<int4>();
			}
		}
	}

	int4 value;
#pragma unroll
	for (PxI32 i = 0; i < 4; ++i)
	{
		// All threads must execute all four scanPerBlock calls (they contain barriers),
		// hence the zero substitute for out-of-range ids rather than an early out.
		value = id < length ? data[id].data[i] : zero<int4>();
		value = scanPerBlock(value, id, &partialSums[blockIdx.x].data[i]);
		if (r)
			r->data[i] = value;
		if (totalSum && id == length - 1)
			totalSum->data[i] = value;
	}

	// Single-block launches can finalize the channel totals right here.
	if (totalSum && gridDim.x == 1)
	{
		exclusiveSumInt16(totalSum);
	}
}
|
||||
|
||||
|
||||
// Second phase of a two-pass scan: adds this block's (already scanned)
// partial sum to every element the block owns, and folds the final block's
// partial into *totalSum via the thread owning the last element.
// Both `data` and `totalSum` may be NULL to skip the corresponding update.
template<typename T>
__device__ void addBlockSumsKernelShared(const T* partialSums, T* data, const PxU32 len, T* totalSum)
{
	const int globalIdx = (blockIdx.x * blockDim.x) + threadIdx.x;
	if (globalIdx >= len)
		return;

	const T blockOffset = partialSums[blockIdx.x];

	if (totalSum && globalIdx == len - 1)
		*totalSum = *totalSum + blockOffset;

	if (data)
		data[globalIdx] = data[globalIdx] + blockOffset;
}
|
||||
|
||||
// Entry point: adds per-block partial sums to a block-scanned PxU32 array
// (presumably the fix-up pass following scanPerBlockKernel). One thread per
// element; forwards directly to the shared templated implementation.
extern "C" __global__ __launch_bounds__(1024, 1) void addBlockSumsKernel(const PxU32* partialSums, PxU32* data, const PxU32 length, PxU32* totalSum)
{
	addBlockSumsKernelShared<PxU32>(partialSums, data, length, totalSum);
}
|
||||
|
||||
// int4x4 variant of the add-block-sums fix-up pass.
// NOTE(review): partialSums[blockIdx.x] is assumed to already hold the
// exclusive prefix over preceding blocks — confirm against the host-side
// scan pipeline.
// Deliberately has NO early-out on id >= len: exclusiveSumInt16 contains
// __syncthreads() and must be reached by every thread of the last block.
extern "C" __global__ __launch_bounds__(1024, 1) void addBlockSumsKernel4x4(const int4x4* partialSums, int4x4* data, const PxU32 len, int4x4* totalSum)
{
	const int id = ((blockIdx.x * blockDim.x) + threadIdx.x);

	// the thread owning the last element folds its block's partial into the grand total
	if (totalSum && id == len - 1)
	{
		(*totalSum).data[0] = (*totalSum).data[0] + partialSums[blockIdx.x].data[0];
		(*totalSum).data[1] = (*totalSum).data[1] + partialSums[blockIdx.x].data[1];
		(*totalSum).data[2] = (*totalSum).data[2] + partialSums[blockIdx.x].data[2];
		(*totalSum).data[3] = (*totalSum).data[3] + partialSums[blockIdx.x].data[3];
	}

	// add the per-block offset to every in-range element
	if (data && id < len)
	{
		data[id].data[0] = data[id].data[0] + partialSums[blockIdx.x].data[0];
		data[id].data[1] = data[id].data[1] + partialSums[blockIdx.x].data[1];
		data[id].data[2] = data[id].data[2] + partialSums[blockIdx.x].data[2];
		data[id].data[3] = data[id].data[3] + partialSums[blockIdx.x].data[3];
	}

	// last block finalizes: convert the 16 bucket totals to an exclusive
	// prefix sum (whole block participates — see note above)
	if (totalSum && blockIdx.x == gridDim.x - 1)
	{
		exclusiveSumInt16(totalSum);
	}
}
|
||||
|
||||
// Counting phase of a 4-bit (16-bucket) radix sort pass.
// Each thread extracts the 4-bit digit of its element for this pass, then the
// block computes, per digit, (a) each element's rank among equal digits within
// the block (written to offsetsPerWarp), (b) the block's digit histogram
// (written to partialSums[blockIdx.x]), and (c) optionally the running totals
// for the last block (written to totalSum).
// The 16 digit counters are packed as 10-bit lanes (mask 0x3FF) into PxU64
// scan values — at most 6 lanes fit in 64 bits, so three scan rounds cover
// digits 0-5, 6-11, 12-15. 10-bit lanes cap per-block counts at 1023; the
// launching kernel uses <=512 threads per block, which fits.
template<typename T>
__device__ void radixFourBitCountPerBlock(const T* data, PxU16* offsetsPerWarp, PxU32 passIndex, int4x4* partialSums, const PxU32 length, int4x4* totalSum)
{
	// view the int4x4 outputs as flat arrays of 16 ints (one per digit)
	int* totalSum1 = reinterpret_cast<int*>(totalSum);
	int* partialSums1 = reinterpret_cast<int*>(&partialSums[blockIdx.x]);

	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
	int slot = 0;
	if (id < length)
		slot = (data[id] >> (passIndex * 4)) & 15;

	PxU64 value;
	PxU64 partial;
#pragma unroll
	for (int i = 0; i < 3; ++i)
	{
		// contribute a 1 in this digit's 10-bit lane if the digit falls in
		// the current window of 6 (slot is shifted down by 6 each round)
		value = 0;
		if (id < length && slot < 6 && slot >= 0)
		{
			value = ((PxU64)1) << (slot * 10);
		}

		value = scanPerBlock<PxU64>(value, id, &partial);
		// last thread unpacks the block-wide histogram lanes for this window
		// (round 2 covers only 4 digits: 12..15)
		if (threadIdx.x == blockDim.x - 1)
		{
			partialSums1[6 * i] = partial & 0x000003FF;
			partialSums1[6 * i + 1] = (partial >> 10) & 0x000003FF;
			partialSums1[6 * i + 2] = (partial >> 20) & 0x000003FF;
			partialSums1[6 * i + 3] = (partial >> 30) & 0x000003FF;
			if (i < 2)
			{
				partialSums1[6 * i + 4] = (partial >> 40) & 0x000003FF;
				partialSums1[6 * i + 5] = (partial >> 50) & 0x000003FF;
			}
		}

		// the globally last element's inclusive scan holds the running totals
		if (totalSum && id == length - 1)
		{
			totalSum1[6 * i] = value & 0x000003FF;
			totalSum1[6 * i + 1] = (value >> 10) & 0x000003FF;
			totalSum1[6 * i + 2] = (value >> 20) & 0x000003FF;
			totalSum1[6 * i + 3] = (value >> 30) & 0x000003FF;
			if (i < 2)
			{
				totalSum1[6 * i + 4] = (value >> 40) & 0x000003FF;
				totalSum1[6 * i + 5] = (value >> 50) & 0x000003FF;
			}
		}

		// inclusive count - 1 == this element's rank among equal digits in the block
		if (id < length && slot < 6 && slot >= 0)
			offsetsPerWarp[id] = ((value >> (slot * 10)) & 0x000003FF) - 1;
		slot -= 6;
	}
}
|
||||
|
||||
// Entry point for the radix-sort counting phase on PxU32 keys.
// One key per thread; see radixFourBitCountPerBlock for the output layout.
extern "C" __global__ __launch_bounds__(512, 1) void radixFourBitCountPerBlockKernel(const PxU32* data, PxU16* offsetsPerWarp, PxU32 passIndex, int4x4* partialSums, const PxU32 length, int4x4* totalSum)
{
	radixFourBitCountPerBlock<PxU32>(data, offsetsPerWarp, passIndex, partialSums, length, totalSum);
}
|
||||
|
||||
// Scatter phase of the 4-bit radix sort pass: moves each key (and optionally
// a dependent payload) to its sorted position for this pass.
// The destination index is assembled from three scan products:
//   cumulativeSum  - exclusive prefix over the 16 digit totals (global base
//                    of each digit's output range),
//   offsetsPerWarp - the element's rank among equal digits within its block,
//   partialSums    - per-block, per-digit offsets; presumably the scanned
//                    block histograms — confirm against the host pipeline.
// On pass 0 the payload is seeded with the identity permutation (id) so the
// final payload maps sorted position -> original index.
template<typename T, typename U>
__device__ void radixFourBitReorder(const T* data, const PxU16* offsetsPerWarp, T* reordered, PxU32 passIndex, int4x4* partialSums, const PxU32 length, int4x4* cumulativeSum, U* dependentData = NULL, U* dependentDataReordered = NULL)
{
	int* partialSums1 = reinterpret_cast<int*>(partialSums);

	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
	if (id >= length)
		return;
	int* ptr = reinterpret_cast<int*>(cumulativeSum);
	int slot = (data[id] >> (passIndex * 4)) & 15;

	int newIndex = ptr[slot] + offsetsPerWarp[id] + partialSums1[16 * blockIdx.x + slot];

	if (newIndex < length) //This condition should always be met but in case everything goes wrong, it ensures that no out of bounds access happens
	{
		reordered[newIndex] = data[id];

		if (dependentData && dependentDataReordered)
			dependentDataReordered[newIndex] = passIndex == 0 ? id : dependentData[id];
	}
}
|
||||
|
||||
// Entry point for the radix-sort scatter phase on PxU32 keys with a PxU32
// payload. One key per thread; see radixFourBitReorder for index assembly.
extern "C" __global__ __launch_bounds__(1024, 1) void radixFourBitReorderKernel(const PxU32* data, const PxU16* offsetsPerWarp, PxU32* reordered, PxU32 passIndex, int4x4* partialSums, const PxU32 length, int4x4* cumulativeSum, PxU32* dependentData, PxU32* dependentDataReordered)
{
	radixFourBitReorder<PxU32, PxU32>(data, offsetsPerWarp, reordered, passIndex, partialSums, length, cumulativeSum, dependentData, dependentDataReordered);
}
|
||||
|
||||
// Gather kernel: reordered[i] = data[reorderedToOriginalMap[i]].
// One element per thread; reads are gathered (indirect), writes are coalesced.
extern "C" __global__ __launch_bounds__(1024, 1) void reorderKernel(const float4* data, float4* reordered, const PxU32 length, const PxU32* reorderedToOriginalMap)
{
	const int sortedIndex = (blockIdx.x * blockDim.x) + threadIdx.x;
	if (sortedIndex < length)
	{
		const PxU32 sourceIndex = reorderedToOriginalMap[sortedIndex];
		reordered[sortedIndex] = data[sourceIndex];
	}
}
|
||||
|
||||
629
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/anisotropy.cu
vendored
Normal file
629
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/anisotropy.cu
vendored
Normal file
@@ -0,0 +1,629 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#include "vector_types.h"
|
||||
#include "foundation/PxVec3.h"
|
||||
#include "foundation/PxVec4.h"
|
||||
|
||||
#include "matrixDecomposition.cuh"
|
||||
#include "PxParticleGpu.h"
|
||||
#include "PxgAnisotropyData.h"
|
||||
#include "sparseGridStandalone.cuh"
|
||||
|
||||
#define ENABLE_KERNEL_LAUNCH_ERROR_CHECK 0
|
||||
|
||||
// No-op host entry point; presumably referenced to force this module's
// kernels to be linked/registered — TODO confirm against kernel setup code.
extern "C" __host__ void initAnisotropyKernels0() {}
|
||||
|
||||
// Reads a float4 and returns its xyz as a PxVec3. The local copy keeps the
// access as a single full-width read — presumably one vectorized load; verify
// against generated SASS before changing.
__device__ inline PxVec3 PxLoad3(const float4& v) { const float4 loaded = v; return PxVec3(loaded.x, loaded.y, loaded.z); }
// Reads a float4 and returns all four components as a PxVec4 (same
// single-full-width-read pattern as PxLoad3).
__device__ inline PxVec4 PxLoad4(const float4& v) { const float4 loaded = v; return PxVec4(loaded.x, loaded.y, loaded.z, loaded.w); }
|
||||
|
||||
// x^3 without a powf call.
__device__ inline PxReal cube(PxReal x) { return x * x * x; }
// Smoothing/anisotropy weight kernel: W(x) = 1 - (x * invr)^3 with
// invr = 1 / r. Positive for x < r and zero at x == r; callers gate on
// distance before invoking it.
__device__ inline PxReal Wa(PxReal x, PxReal invr)
{
	const PxReal normalized = x * invr;
	return 1.f - cube(normalized);
}
|
||||
// Linear interpolation from start (t == 0) to end (t == 1); t is not clamped,
// so values outside [0, 1] extrapolate.
template <typename V, typename T>
__device__ inline V Lerp(const V& start, const V& end, const T& t)
{
	const V delta = end - start;
	return start + delta * t;
}
|
||||
// Clamps each of the first three components of `a` into [s, t] and returns
// them as a new V (V must be constructible from three T values and indexable).
template <typename V, typename T>
__device__ inline V Clamp(const V& a, const T s, const T t)
{
	const T c0 = PxMin(t, PxMax(s, a[0]));
	const T c1 = PxMin(t, PxMax(s, a[1]));
	const T c2 = PxMin(t, PxMax(s, a[2]));
	return V(c0, c1, c2);
}
|
||||
|
||||
//
|
||||
//extern "C" __global__ void smoothPositionsLaunch(PxU32* sortedOrder, PxU32* cellEnds, PxU float4* pose, PxU32* phase, float particleContactDistance)
|
||||
//{
|
||||
// const float4* const PX_RESTRICT sortedPose = reinterpret_cast<float4*>(particleSystem.mSortedPositions_InvMass);
|
||||
// const PxU32* const PX_RESTRICT sortedPhases = particleSystem.mSortedPhaseArray;
|
||||
//
|
||||
// const PxU32* const PX_RESTRICT cellStart = particleSystem.mCellStart;
|
||||
// const PxU32* const PX_RESTRICT cellEnd = particleSystem.mCellEnd;
|
||||
//
|
||||
// const PxReal particleContactDistanceSq = particleContactDistance * particleContactDistance;
|
||||
// const PxReal particleContactDistanceInv = 1.0f / particleContactDistance;
|
||||
//
|
||||
// // calculated the sum of weights and weighted avg position for particle neighborhood
|
||||
// PxVec3 xs(0.0f); //sum of positions
|
||||
// PxReal ws = 0.0f; //sum of weights
|
||||
// for (int z = -1; z <= 1; z++)
|
||||
// {
|
||||
// for (int y = -1; y <= 1; y++)
|
||||
// {
|
||||
// for (int x = -1; x <= 1; x++)
|
||||
// {
|
||||
// int3 neighbourPos = make_int3(gridPos.x + x, gridPos.y + y, gridPos.z + z);
|
||||
// PxU32 gridHash = calcGridHash(neighbourPos, gridSize);
|
||||
// PxU32 startIndex = cellStart[gridHash];
|
||||
// PxU32 endIndex = cellEnd[gridHash];
|
||||
//
|
||||
// if (startIndex != EMPTY_CELL)
|
||||
// {
|
||||
// PxU32 nextPhase = sortedPhases[startIndex];
|
||||
// float4 nextPos = fetch(&sortedPose[startIndex]);
|
||||
//
|
||||
// for (PxU32 particleIndex1 = startIndex; particleIndex1 < endIndex /*&& numCollidedParticles < maxNeighborhood*/; particleIndex1++)
|
||||
// {
|
||||
// const PxU32 phase2 = nextPhase;
|
||||
// const float4 pos2 = nextPos;
|
||||
//
|
||||
// if ((particleIndex1 + 1) < endIndex)
|
||||
// {
|
||||
// nextPhase = sortedPhases[particleIndex1 + 1];
|
||||
// nextPos = fetch(&sortedPose[particleIndex1 + 1]);
|
||||
// }
|
||||
//
|
||||
// if (phase2 & validPhaseMask)
|
||||
// {
|
||||
// const PxVec3 xj = PxLoad3(pos2);
|
||||
// const PxVec3 xij = xi - xj;
|
||||
//
|
||||
// const PxReal dsq = xij.magnitudeSquared();
|
||||
//
|
||||
// if (0.0f < dsq && dsq < particleContactDistanceSq)
|
||||
// {
|
||||
// const PxReal w = Wa(sqrtf(dsq), particleContactDistanceInv);
|
||||
// ws += w;
|
||||
// xs += xj * w;
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// if (ws > 0.f)
|
||||
// {
|
||||
// PxReal f = 4.0f*Wa(particleContactDistance*0.5f, particleContactDistanceInv);
|
||||
// PxReal smooth = PxMin(1.0f, ws / f)*smoothing;
|
||||
// xs /= ws;
|
||||
// xi = Lerp(xi, xs, smooth);
|
||||
// }
|
||||
//
|
||||
// //write smoothed positions back in API order
|
||||
// smoothedPositions[origIdx] = make_float4(xi.x, xi.y, xi.z, xi4.w);
|
||||
//}
|
||||
|
||||
// Smooths fluid-particle positions by blending each particle toward the
// weighted average position of its neighborhood ("Laplacian" smoothing).
// One thread per particle (1D launch). Reads the sorted simulation buffers of
// particleSystems[id]; writes smoothed positions in original (API) order to
// smoothingDataPerParticleSystem[id].mPositions. Non-fluid particles are
// passed through unchanged.
// Fix vs. previous revision: the non-fluid path reused to re-read
// sortedNewPositions[p] from global memory although xi4 already held that
// exact value; it now reuses xi4.
extern "C" __global__ void smoothPositionsLaunch(PxGpuParticleSystem* particleSystems, const PxU32 id, PxSmoothedPositionData* smoothingDataPerParticleSystem)
{
	PxGpuParticleSystem& particleSystem = particleSystems[id];

	const PxU32 numParticles = particleSystem.mCommonData.mNumParticles;

	PxSmoothedPositionData& data = smoothingDataPerParticleSystem[id];
	const PxReal smoothing = data.mSmoothing;

	//pointers to global memory buffers -- inputs
	const float4* PX_RESTRICT sortedNewPositions = particleSystem.mSortedPositions_InvMass;
	const PxU32* PX_RESTRICT phases = particleSystem.mSortedPhaseArray;
	const PxU32* PX_RESTRICT collisionIndex = particleSystem.mCollisionIndex;
	const PxU32* PX_RESTRICT gridParticleIndices = particleSystem.mSortedToUnsortedMapping;

	//pointers to smoothed position buffers -- outputs (original/API order)
	float4* PX_RESTRICT smoothPosOrig = reinterpret_cast<float4*>(data.mPositions);

	const PxU32 globalThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;

	if (globalThreadIdx >= numParticles)
		return; //skip thread if it's past the end of particle list

	const PxU32 p = globalThreadIdx; //index of particle in sorted order
	const PxU32 origIdx = gridParticleIndices[globalThreadIdx];

	const PxVec4 xi4 = PxLoad4(sortedNewPositions[p]);

	//non-fluid particles are not smoothed: pass the position through unchanged
	//(reuses xi4 instead of re-reading global memory)
	if (!PxGetFluid(phases[p]))
	{
		smoothPosOrig[origIdx] = make_float4(xi4.x, xi4.y, xi4.z, xi4.w);
		return;
	}

	PxVec3 xi = PxVec3(xi4.x, xi4.y, xi4.z);

	PxU32 contactCount = particleSystem.mParticleSelfCollisionCount[p];

	// accumulate the weighted sum of fluid-neighbor positions; neighbor i of
	// particle p lives at collisionIndex[p + i * numParticles] (strided layout)
	PxVec3 xs(0.0f); //sum of weighted positions
	PxReal ws = 0.0f; //sum of weights
	for (PxU32 i = 0, offset = p; i < contactCount; ++i, offset += numParticles)
	{
		const PxU32 q = collisionIndex[offset];
		if (PxGetFluid(phases[q])) //ignore non-fluid particles
		{
			const PxVec3 xj = PxLoad3(sortedNewPositions[q]);
			const PxVec3 xij = xi - xj;

			const PxReal dsq = xij.magnitudeSquared();

			// dsq > 0 also skips self/duplicate positions
			if (0.0f < dsq && dsq < particleSystem.mCommonData.mParticleContactDistanceSq)
			{
				const PxReal w = Wa(sqrtf(dsq), particleSystem.mCommonData.mParticleContactDistanceInv);
				ws += w;
				xs += xj * w;
			}
		}
	}

	if (ws > 0.f)
	{
		// normalize and blend toward the neighborhood average; the blend
		// factor fades out for sparse neighborhoods (ws below ~4 weights at
		// half the contact distance)
		PxReal f = 4.0f*Wa(particleSystem.mCommonData.mParticleContactDistance*0.5f, particleSystem.mCommonData.mParticleContactDistanceInv);
		PxReal smooth = PxMin(1.0f, ws / f)*smoothing;
		xs /= ws;
		xi = Lerp(xi, xs, smooth);
	}

	//write smoothed positions back in API order; w keeps the inverse mass
	smoothPosOrig[origIdx] = make_float4(xi.x, xi.y, xi.z, xi4.w);
}
|
||||
|
||||
// Calculates Eigen-decomposition of the particle covariance matrix according
// to "Reconstructing Surfaces of Particle-Based Fluids Using Anisotropic Kernels".
// One thread per particle (1D launch). Two passes over the neighbor list:
// (1) weighted mean position, (2) weighted covariance about that mean; the
// eigenvectors/eigenvalues become the per-particle anisotropy axes q1/q2/q3
// (xyz = axis, w = scaled eigenvalue), written in original (API) order.
// Non-fluid and isolated particles get an isotropic fallback of radius
// anisotropyMin * contactDistance.
// Both neighbor loops are software-pipelined: nextQ/nextNextQ prefetch the
// next two neighbor records to hide global-memory latency; when
// contactCount <= 1 the unused prefetch variables stay uninitialized but are
// never read (guarded by the (i + 1) < contactCount checks).
extern "C" __global__ void calculateAnisotropyLaunch(PxGpuParticleSystem* particleSystems, const PxU32 id, PxAnisotropyData* anisotropyDataPerParticleSystem)
{
	PxGpuParticleSystem& particleSystem = particleSystems[id];

	const PxU32 numParticles = particleSystem.mCommonData.mNumParticles;

	//pointers to global memory buffers -- inputs
	const float4* PX_RESTRICT sortedNewPositions = particleSystem.mSortedPositions_InvMass;
	const PxU32* PX_RESTRICT phases = particleSystem.mSortedPhaseArray;
	const PxU32* PX_RESTRICT collisionIndex = particleSystem.mCollisionIndex;
	const PxU32* PX_RESTRICT gridParticleIndices = particleSystem.mSortedToUnsortedMapping;

	const PxAnisotropyData& anisotropyData = anisotropyDataPerParticleSystem[id];
	float4* PX_RESTRICT q1 = reinterpret_cast<float4*>(anisotropyData.mAnisotropy_q1);
	float4* PX_RESTRICT q2 = reinterpret_cast<float4*>(anisotropyData.mAnisotropy_q2);
	float4* PX_RESTRICT q3 = reinterpret_cast<float4*>(anisotropyData.mAnisotropy_q3);

	const PxU32 globalThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;

	if (globalThreadIdx >= numParticles)
		return; //skip thread if it's past the end of particle list

	const PxU32 p = globalThreadIdx; //index of particle in sorted order
	const PxU32 origIdx = gridParticleIndices[globalThreadIdx];

	//ignore anisotropy for non-fluid particles: isotropic identity axes
	if (!PxGetFluid(phases[p]))
	{
		float r = anisotropyData.mAnisotropyMin * particleSystem.mCommonData.mParticleContactDistance;
		q1[origIdx] = make_float4(1.0f, 0.0f, 0.0f, r);
		q2[origIdx] = make_float4(0.0f, 1.0f, 0.0f, r);
		q3[origIdx] = make_float4(0.0f, 0.0f, 1.0f, r);
		/*if (globalThreadIdx == 0)
		printf("PxGetFluid\n");*/
		return;
	}

	const PxVec3 xi = PxLoad3(sortedNewPositions[p]);

	PxU32 contactCount = particleSystem.mParticleSelfCollisionCount[p];

	// pass 1: sum of weights and weighted avg position for the neighborhood
	PxVec3 xs(0.f); //sum of positions
	float ws = 0.f; //sum of weights

	// prefetch pipeline state (see header comment)
	PxU32 nextQ;
	PxU32 nextNextQ;
	PxVec4 xj4Next;
	PxU32 nextPhase;

	// neighbor i of particle p lives at collisionIndex[p + i * numParticles]
	PxU32 offset = p;

	if (contactCount > 0)
	{
		nextQ = collisionIndex[offset];
		xj4Next = PxLoad4(sortedNewPositions[nextQ]);
		nextPhase = phases[nextQ];

		offset += numParticles;
	}
	if (contactCount > 1)
	{
		nextNextQ = collisionIndex[offset];
		offset += numParticles;
	}

	for (PxU32 i = 0; i < contactCount; ++i, offset += numParticles)
	{
		const PxVec4 xj4 = xj4Next;
		const PxU32 phase2 = nextPhase;

		// advance the prefetch pipeline before consuming this neighbor
		if ((i + 1) < contactCount)
		{
			xj4Next = PxLoad4(sortedNewPositions[nextNextQ]);
			nextPhase = phases[nextNextQ];

			nextQ = nextNextQ;

			if ((i + 2) < contactCount)
				nextNextQ = collisionIndex[offset];

		}
		if (PxGetFluid(phase2)) //ignore non-fluid particles
		{

			const PxVec3 xj(xj4.x, xj4.y, xj4.z);
			const PxVec3 xij = xi - xj;

			const PxReal dsq = xij.magnitudeSquared();

			if (0.f < dsq && dsq < particleSystem.mCommonData.mParticleContactDistanceSq)
			{
				const PxReal w = Wa(sqrtf(dsq), particleSystem.mCommonData.mParticleContactDistanceInv);
				ws += w;
				xs += xj * w;
			}
		}
	}

	// set to radial and exit early in case of isolated particles
	if (ws == 0.0f)
	{
		float r = anisotropyData.mAnisotropyMin * particleSystem.mCommonData.mParticleContactDistance;
		q1[origIdx] = make_float4(1.0f, 0.0f, 0.0f, r);
		q2[origIdx] = make_float4(0.0f, 1.0f, 0.0f, r);
		q3[origIdx] = make_float4(0.0f, 0.0f, 1.0f, r);
		//if(globalThreadIdx==0)
		//printf("%i, ws\n", contactCount);
		return;
	}

	//compute inverse sum weight and weight the average position
	float invWs = 1.f / ws;
	xs *= invWs;

	PxMat33 covariance(PxVec3(0.0f), PxVec3(0.0f), PxVec3(0.0f));

	// pass 2: restart the prefetch pipeline to accumulate the covariance
	offset = p;

	if (contactCount > 0)
	{
		nextQ = collisionIndex[offset];
		xj4Next = PxLoad4(sortedNewPositions[nextQ]);
		nextPhase = phases[nextQ];

		offset += numParticles;
	}
	if (contactCount > 1)
	{
		nextNextQ = collisionIndex[offset];
		offset += numParticles;
	}

	// use weighted average position to calculate the covariance matrix
	for (PxU32 i = 0; i < contactCount; ++i, offset += numParticles)
	{
		const PxVec4 xj4 = xj4Next;
		const PxU32 phase2 = nextPhase;

		if ((i + 1) < contactCount)
		{
			xj4Next = PxLoad4(sortedNewPositions[nextNextQ]);
			nextPhase = phases[nextNextQ];

			nextQ = nextNextQ;

			if ((i + 2) < contactCount)
				nextNextQ = collisionIndex[offset];

		}
		if (PxGetFluid(phase2)) //ignore non-fluid particles
		{
			const PxVec3 xj(xj4.x, xj4.y, xj4.z);
			const PxVec3 xij = xi - xj;

			const PxReal dsq = xij.magnitudeSquared();

			if (0.f < dsq && dsq < particleSystem.mCommonData.mParticleContactDistanceSq)
			{
				const PxReal w = Wa(sqrtf(dsq), particleSystem.mCommonData.mParticleContactDistanceInv);
				const PxVec3 xjs = xj - xs;
				covariance += PxMat33::outer(w*xjs, xjs);
			}
		}
	}

	covariance *= invWs;

	//calculate the eigen decomposition (covariance ends up diagonalized, r holds the eigenvectors)
	PxMat33 r;
	eigenDecomposition(covariance, r);

	//sanitize the eigen values (diagonal of covariance matrix)
	covariance[0][0] = max(covariance[0][0], 0.f);
	covariance[1][1] = max(covariance[1][1], 0.f);
	covariance[2][2] = max(covariance[2][2], 0.f);

	PxVec3 lambda(sqrtf(covariance[0][0]), sqrtf(covariance[1][1]), sqrtf(covariance[2][2]));
	//PxVec3 lambda(covariance[0][0], covariance[1][1], covariance[2][2]);

	// scale and clamp the axis lengths relative to the contact distance
	const float ks = anisotropyData.mAnisotropy;
	const float kmin = anisotropyData.mAnisotropyMin * particleSystem.mCommonData.mParticleContactDistance;
	const float kmax = anisotropyData.mAnisotropyMax * particleSystem.mCommonData.mParticleContactDistance;

	lambda *= ks;
	lambda = Clamp(lambda, kmin, kmax);

	//write out the anisotropy vectors (xyz = eigenvector, w = scaled eigenvalue)
	q1[origIdx] = make_float4(r.column0.x, r.column0.y, r.column0.z, lambda.x);
	q2[origIdx] = make_float4(r.column1.x, r.column1.y, r.column1.z, lambda.y);
	q3[origIdx] = make_float4(r.column2.x, r.column2.y, r.column2.z, lambda.z);
}
|
||||
|
||||
|
||||
|
||||
|
||||
// Standalone-sparse-grid variant of the anisotropy computation (same math as
// calculateAnisotropyLaunch, but neighbors come from a 3x3x3 subgrid stencil
// instead of a precomputed contact list). One thread per particle, 256
// threads per block. Outputs q1/q2/q3 per particle in original order
// (xyz = eigenvector of the neighborhood covariance, w = scaled eigenvalue).
// Particles outside a valid subgrid, failing the phase mask, or with no
// neighbors get isotropic fallback axes of radius anisotropyMin * contactDistance.
extern "C" __global__ __launch_bounds__(256, 1) void anisotropyKernel(const float4* const PX_RESTRICT deviceParticlePos,
	const PxU32* const PX_RESTRICT sortedToOriginalParticleIndex, const PxU32* const PX_RESTRICT sortedParticleToSubgrid, PxU32 maxNumSubgrids,
	const PxU32* const PX_RESTRICT subgridNeighbors, const PxU32* const PX_RESTRICT subgridEndIndices, int numParticles, PxU32* phases, PxU32 validPhaseMask,
	float4* q1, float4* q2, float4* q3, PxReal anisotropy, PxReal anisotropyMin, PxReal anisotropyMax, PxReal particleContactDistance)
{
	PxI32 threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
	if (threadIndex >= numParticles)
		return;

	PxI32 pNr = sortedToOriginalParticleIndex[threadIndex];

	// invalid subgrid or filtered phase: isotropic fallback
	PxU32 subgridIndex = sortedParticleToSubgrid[threadIndex];
	if (subgridIndex >= maxNumSubgrids || (phases && !(phases[pNr] & validPhaseMask)))
	{
		float r = anisotropyMin * particleContactDistance;
		q1[pNr] = make_float4(1.0f, 0.0f, 0.0f, r);
		q2[pNr] = make_float4(0.0f, 1.0f, 0.0f, r);
		q3[pNr] = make_float4(0.0f, 0.0f, 1.0f, r);
		return;
	}

	PxVec3 xi = PxLoad3(deviceParticlePos[pNr]);

	const PxReal particleContactDistanceSq = particleContactDistance * particleContactDistance;
	const PxReal particleContactDistanceInv = 1.0f / particleContactDistance;

	// pass 1: sum of weights and weighted avg position over the 3x3x3
	// neighboring subgrids (each subgrid holds a contiguous sorted-index
	// range delimited by subgridEndIndices)
	PxVec3 xs(0.f); //sum of positions
	float ws = 0.f; //sum of weights
	for (int z = -1; z <= 1; z++)
	{
		for (int y = -1; y <= 1; y++)
		{
			for (int x = -1; x <= 1; x++)
			{
				PxU32 n = subgridNeighborOffset(subgridNeighbors, subgridIndex, x, y, z);
				if (n == EMPTY_SUBGRID)
					continue;

				int start = n == 0 ? 0 : subgridEndIndices[n - 1];
				int end = subgridEndIndices[n];
				for (int i = start; i < end; ++i)
				{
					int j = sortedToOriginalParticleIndex[i];
					if (phases && !(phases[j] & validPhaseMask))
						continue;

					PxVec3 xj = PxLoad3(deviceParticlePos[j]);

					const PxVec3 xij = xi - xj;

					const PxReal dsq = xij.magnitudeSquared();

					// dsq > 0 also skips the particle itself
					if (0.f < dsq && dsq < particleContactDistanceSq)
					{
						const PxReal w = Wa(sqrtf(dsq), particleContactDistanceInv);
						ws += w;
						xs += xj * w;
					}
				}
			}
		}
	}


	// set to radial and exit early in case of isolated particles
	if (ws == 0.0f)
	{
		float r = anisotropyMin * particleContactDistance;
		q1[pNr] = make_float4(1.0f, 0.0f, 0.0f, r);
		q2[pNr] = make_float4(0.0f, 1.0f, 0.0f, r);
		q3[pNr] = make_float4(0.0f, 0.0f, 1.0f, r);
		//if(globalThreadIdx==0)
		//printf("%i, ws\n", contactCount);
		return;
	}

	//compute inverse sum weight and weight the average position
	float invWs = 1.f / ws;
	xs *= invWs;

	PxMat33 covariance(PxVec3(0.0f), PxVec3(0.0f), PxVec3(0.0f));

	// pass 2: weighted covariance about the mean, same stencil traversal
	for (int z = -1; z <= 1; z++)
	{
		for (int y = -1; y <= 1; y++)
		{
			for (int x = -1; x <= 1; x++)
			{
				PxU32 n = subgridNeighborOffset(subgridNeighbors, subgridIndex, x, y, z);
				if (n == EMPTY_SUBGRID)
					continue;

				int start = n == 0 ? 0 : subgridEndIndices[n - 1];
				int end = subgridEndIndices[n];
				for (int i = start; i < end; ++i)
				{
					int j = sortedToOriginalParticleIndex[i];
					if (phases && !(phases[j] & validPhaseMask))
						continue;

					PxVec3 xj = PxLoad3(deviceParticlePos[j]);
					const PxVec3 xij = xi - xj;

					const PxReal dsq = xij.magnitudeSquared();

					if (0.f < dsq && dsq < particleContactDistanceSq)
					{
						const PxReal w = Wa(sqrtf(dsq), particleContactDistanceInv);
						const PxVec3 xjs = xj - xs;
						covariance += PxMat33::outer(w*xjs, xjs);
					}
				}
			}
		}
	}


	covariance *= invWs;

	//calculate the eigen decomposition (r receives the eigenvectors)
	PxMat33 r;
	eigenDecomposition(covariance, r);

	//sanitize the eigen values (diagonal of covariance matrix)
	covariance[0][0] = max(covariance[0][0], 0.f);
	covariance[1][1] = max(covariance[1][1], 0.f);
	covariance[2][2] = max(covariance[2][2], 0.f);

	PxVec3 lambda(sqrtf(covariance[0][0]), sqrtf(covariance[1][1]), sqrtf(covariance[2][2]));
	//PxVec3 lambda(covariance[0][0], covariance[1][1], covariance[2][2]);

	// scale and clamp the axis lengths relative to the contact distance
	const float ks = anisotropy;
	const float kmin = anisotropyMin * particleContactDistance;
	const float kmax = anisotropyMax * particleContactDistance;

	lambda *= ks;
	lambda = Clamp(lambda, kmin, kmax);

	//write out the anisotropy vectors
	q1[pNr] = make_float4(r.column0.x, r.column0.y, r.column0.z, lambda.x);
	q2[pNr] = make_float4(r.column1.x, r.column1.y, r.column1.z, lambda.y);
	q3[pNr] = make_float4(r.column2.x, r.column2.y, r.column2.z, lambda.z);
}
|
||||
|
||||
// Standalone-sparse-grid variant of position smoothing: blends each valid
// particle toward the weighted average position of its 3x3x3-subgrid
// neighborhood. One thread per particle. Particles in an invalid subgrid or
// failing the phase mask are passed through unchanged.
// Fix vs. previous revision: xi was rebuilt by re-reading
// deviceParticlePos[pNr] from global memory although xi4 already held that
// value; it now reuses xi4.
extern "C" __global__ void smoothPositionsKernel(float4* deviceParticlePos, PxU32* sortedToOriginalParticleIndex, PxU32* sortedParticleToSubgrid, PxU32 maxNumSubgrids,
	PxU32* subgridNeighbors, PxU32* subgridEndIndices, int numParticles, PxU32* phases, PxU32 validPhaseMask, float4* smoothPos, PxReal smoothing, PxReal particleContactDistance)
{
	PxI32 threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
	if (threadIndex >= numParticles)
		return;

	PxI32 pNr = sortedToOriginalParticleIndex[threadIndex];
	float4 xi4 = deviceParticlePos[pNr];

	// invalid subgrid or filtered phase: pass the position through unchanged
	PxU32 subgridIndex = sortedParticleToSubgrid[threadIndex];
	if (subgridIndex >= maxNumSubgrids || (phases && !(phases[pNr] & validPhaseMask)))
	{
		smoothPos[pNr] = xi4;
		return;
	}

	// reuse the already-loaded position instead of re-reading global memory
	PxVec3 xi = PxVec3(xi4.x, xi4.y, xi4.z);

	const PxReal particleContactDistanceSq = particleContactDistance * particleContactDistance;
	const PxReal particleContactDistanceInv = 1.0f / particleContactDistance;

	// accumulate the weighted sum of neighbor positions over the 3x3x3
	// neighboring subgrids (each subgrid is a contiguous sorted-index range
	// delimited by subgridEndIndices)
	PxVec3 xs(0.0f); //sum of weighted positions
	PxReal ws = 0.0f; //sum of weights
	for (int z = -1; z <= 1; z++)
	{
		for (int y = -1; y <= 1; y++)
		{
			for (int x = -1; x <= 1; x++)
			{
				PxU32 n = subgridNeighborOffset(subgridNeighbors, subgridIndex, x, y, z);
				if (n == EMPTY_SUBGRID)
					continue;

				int start = n == 0 ? 0 : subgridEndIndices[n - 1];
				int end = subgridEndIndices[n];
				for (int i = start; i < end; ++i)
				{
					int j = sortedToOriginalParticleIndex[i];
					if (phases && !(phases[j] & validPhaseMask))
						continue;

					PxVec3 xj = PxLoad3(deviceParticlePos[j]);

					const PxVec3 xij = xi - xj;

					const PxReal dsq = xij.magnitudeSquared();

					// dsq > 0 also skips the particle itself
					if (0.0f < dsq && dsq < particleContactDistanceSq)
					{
						const PxReal w = Wa(sqrtf(dsq), particleContactDistanceInv);
						ws += w;
						xs += xj * w;
					}
				}
			}
		}
	}

	if (ws > 0.f)
	{
		// normalize and blend toward the neighborhood average; the blend
		// factor fades out for sparse neighborhoods
		PxReal f = 4.0f*Wa(particleContactDistance*0.5f, particleContactDistanceInv);
		PxReal smooth = PxMin(1.0f, ws / f)*smoothing;
		xs /= ws;
		xi = Lerp(xi, xs, smooth);
	}

	//write smoothed positions back in API order; w is preserved from the input
	smoothPos[pNr] = make_float4(xi.x, xi.y, xi.z, xi4.w);
}
|
||||
|
||||
|
||||
213
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/attachments.cuh
vendored
Normal file
213
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/attachments.cuh
vendored
Normal file
@@ -0,0 +1,213 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef __ATTACHMENTS_CUH__
|
||||
#define __ATTACHMENTS_CUH__
|
||||
|
||||
#include "foundation/PxVecMath.h"
|
||||
|
||||
using namespace physx;
|
||||
|
||||
// A cone limit is considered active as soon as any one of its three parameters
// (opening angle, min distance, max distance) is set to a non-negative value.
__device__ inline bool isConeLimitedEnabled(float maxAngle, float minDist, float maxDist)
{
	const bool angleSet = maxAngle >= 0.f;
	const bool minDistSet = minDist >= 0.f;
	const bool maxDistSet = maxDist >= 0.f;
	return angleSet || minDistSet || maxDistSet;
}
|
||||
|
||||
/**
 * Computes the minimum translation vector that moves relPos into the allowed
 * cone volume (ideal position = clampedLen * clampedDir; the error is the difference).
 *
 * \param maxAngle The cone opening angle measured from the axis (< 0 disables the angle limit,
 *                 == 0 forces the direction onto the axis).
 * \param minDist  Minimum distance measured from the cone tip.
 * \param maxDist  Maximum distance measured from the cone tip (< 0 disables the upper bound).
 * \param coneAxis The cone axis direction.
 * \param relPos   Position relative to the cone tip for which the error is computed.
 */
__device__ inline PxVec3 computeConeLimitedError(float maxAngle, float minDist, float maxDist, const PxVec3& coneAxis, const PxVec3& relPos)
{
	const PxReal dist = relPos.magnitude();

	// Direction from the cone tip towards relPos; fall back to the axis for degenerate lengths.
	PxVec3 clampedDir = (dist > 1.0e-6f) ? (relPos / dist) : coneAxis;

	if(maxAngle == 0.f)
	{
		// Zero opening angle: the only admissible direction is the axis itself.
		clampedDir = coneAxis;
	}
	else if(maxAngle > 0.f)
	{
		PxReal cosMaxAngle;
		PxReal sinMaxAngle;
		PxSinCos(maxAngle, sinMaxAngle, cosMaxAngle); // could be precomputed
		const PxReal cosAngle = clampedDir.dot(coneAxis);
		if(cosAngle < cosMaxAngle) // theta > maxAngle: outside the cone
		{
			// Project the direction back onto the cone surface: rotate it so it is
			// exactly "maxAngle" away from the axis, in the plane spanned by both.
			const PxVec3 tangent = clampedDir.cross(coneAxis);
			const PxVec3 bitangent = coneAxis.cross(tangent).getNormalized();
			clampedDir = cosMaxAngle * coneAxis + sinMaxAngle * bitangent;
		}
	}
	// maxAngle < 0.f: angle limit disabled, keep the raw direction.

	// Distance limits measured from the cone tip; a negative maxDist means "unbounded".
	const PxReal upperBound = (maxDist >= 0.f) ? maxDist : FLT_MAX;
	const PxReal clampedLen = PxClamp(dist, minDist, upperBound);

	return relPos - clampedLen * clampedDir;
}
|
||||
|
||||
// Solves one attachment constraint against a rigid body for the PGS solver.
// The float4 w components carry per-axis data: raXn*_biasW.w is the positional
// error ((rigidBodyCoM + comToPoint) - attachedPointLocation) per axis, and
// velMultiplierXYZ_invMassW.w is the rigid body's inverse mass.
// Outputs the rigid-body linear/angular velocity deltas and returns the impulse.
PX_FORCE_INLINE __device__ PxVec3 calculateAttachmentDeltaImpulsePGS(const float4& raXn0_biasW, const float4& raXn1_biasW,
	const float4& raXn2_biasW, const float4& velMultiplierXYZ_invMassW,
	const float4& low_high_limits, const float4& worldAxis_angle,
	const PxgVelocityPackPGS& vel0, const PxVec3& linVel1, PxReal invDt,
	PxReal biasFactor, PxVec3& deltaLinVel, PxVec3& deltaAngVel)
{
	// Per-axis angular response vectors (r cross n) of the rigid body.
	const PxVec3 raXn0(raXn0_biasW.x, raXn0_biasW.y, raXn0_biasW.z);
	const PxVec3 raXn1(raXn1_biasW.x, raXn1_biasW.y, raXn1_biasW.z);
	const PxVec3 raXn2(raXn2_biasW.x, raXn2_biasW.y, raXn2_biasW.z);

	// Velocity of the rigid body at the attachment point, projected per axis.
	const PxVec3 rigidVelAtPoint(vel0.linVel.x + vel0.angVel.dot(raXn0),
		vel0.linVel.y + vel0.angVel.dot(raXn1),
		vel0.linVel.z + vel0.angVel.dot(raXn2));

	// Bias impulse formulation; see ErinCatto_SequentialImpulses_GDC2006, slide 22
	// (https://box2d.org/files/ErinCatto_SequentialImpulses_GDC2006.pdf).
	PxVec3 velError(linVel1.x - raXn0_biasW.w * biasFactor * invDt,
		linVel1.y - raXn1_biasW.w * biasFactor * invDt,
		linVel1.z - raXn2_biasW.w * biasFactor * invDt);

	if(isConeLimitedEnabled(worldAxis_angle.w, low_high_limits.x, low_high_limits.y))
	{
		// NOTE: it is not understood why the biasFactor scaling is needed to get the
		// right error offset for the cone (kept from the original implementation).
		const PxReal weirdScale = invDt * biasFactor;
		PxVec3 posError = velError * (1.0f / weirdScale);
		const PxVec3 worldAxis(worldAxis_angle.x, worldAxis_angle.y, worldAxis_angle.z);
		posError = computeConeLimitedError(worldAxis_angle.w, low_high_limits.x, low_high_limits.y, worldAxis, posError);
		velError = posError * weirdScale;
	}

	// PGS deltaF: remaining velocity error times the per-axis velocity multiplier.
	const PxReal deltaF0 = (velError.x - rigidVelAtPoint.x) * velMultiplierXYZ_invMassW.x;
	const PxReal deltaF1 = (velError.y - rigidVelAtPoint.y) * velMultiplierXYZ_invMassW.y;
	const PxReal deltaF2 = (velError.z - rigidVelAtPoint.z) * velMultiplierXYZ_invMassW.z;

	const PxVec3 deltaImpulse(deltaF0, deltaF1, deltaF2);

	deltaLinVel = deltaImpulse * velMultiplierXYZ_invMassW.w; // invMass lives in w
	deltaAngVel = raXn0 * deltaF0 + raXn1 * deltaF1 + raXn2 * deltaF2;

	return deltaImpulse;
}
|
||||
|
||||
// Convenience overload: gathers the per-constraint float4 data at `offset`
// from an SoA constraint buffer and forwards to the scalar PGS version.
template <typename ConstraintType>
PX_FORCE_INLINE __device__ PxVec3 calculateAttachmentDeltaImpulsePGS(PxU32 offset, const ConstraintType& constraint,
	const PxgVelocityPackPGS& vel0, const PxVec3& linVel1, PxReal invDt,
	PxReal biasFactor, PxVec3& deltaLinVel, PxVec3& deltaAngVel)
{
	const float4& r0 = constraint.raXn0_biasW[offset];
	const float4& r1 = constraint.raXn1_biasW[offset];
	const float4& r2 = constraint.raXn2_biasW[offset];
	const float4& mult = constraint.velMultiplierXYZ_invMassW[offset];
	const float4& limits = constraint.low_high_limits[offset];
	const float4& axisAngle = constraint.axis_angle[offset];
	return calculateAttachmentDeltaImpulsePGS(r0, r1, r2, mult, limits, axisAngle,
		vel0, linVel1, invDt, biasFactor, deltaLinVel, deltaAngVel);
}
|
||||
|
||||
// TGS flavour of the attachment constraint solve. Unlike PGS this works on
// accumulated position deltas rather than velocities; biasCoefficient is
// expected to already include the dt scaling (it acts somewhat like damping).
// Outputs the rigid-body linear/angular velocity deltas and returns the impulse.
PX_FORCE_INLINE __device__ PxVec3 calculateAttachmentDeltaImpulseTGS(const float4& raXn0_biasW, const float4& raXn1_biasW,
	const float4& raXn2_biasW, const float4& velMultiplierXYZ_invMassW,
	const float4& low_high_limits, const float4& worldAxis_angle,
	const PxgVelocityPackTGS& vel0, const PxVec3& linDelta1, PxReal dt,
	PxReal biasCoefficient, bool isVelocityIteration, PxVec3& deltaLinVel,
	PxVec3& deltaAngVel)
{
	// Per-axis angular response vectors (r cross n) of the rigid body.
	const PxVec3 raXn0(raXn0_biasW.x, raXn0_biasW.y, raXn0_biasW.z);
	const PxVec3 raXn1(raXn1_biasW.x, raXn1_biasW.y, raXn1_biasW.z);
	const PxVec3 raXn2(raXn2_biasW.x, raXn2_biasW.y, raXn2_biasW.z);

	// Velocity of the rigid body at the attachment point, projected per axis.
	const PxVec3 rigidVelAtPoint(vel0.linVel.x + vel0.angVel.dot(raXn0),
		vel0.linVel.y + vel0.angVel.dot(raXn1),
		vel0.linVel.z + vel0.angVel.dot(raXn2));

	// Relative accumulated linear motion between the attached point and the rigid body.
	const PxVec3 relDelta = linDelta1 - vel0.linDelta;

	// The float4 w components carry the positional error
	// ((rigidBodyCoM + comToPoint) - attachedPointLocation) per axis.
	// This is a position-space error (as opposed to a velocity error in PGS).
	PxVec3 posError(relDelta.x - raXn0_biasW.w - vel0.angDelta.dot(raXn0),
		relDelta.y - raXn1_biasW.w - vel0.angDelta.dot(raXn1),
		relDelta.z - raXn2_biasW.w - vel0.angDelta.dot(raXn2));

	if(isConeLimitedEnabled(worldAxis_angle.w, low_high_limits.x, low_high_limits.y))
	{
		const PxVec3 worldAxis(worldAxis_angle.x, worldAxis_angle.y, worldAxis_angle.z);
		posError = computeConeLimitedError(worldAxis_angle.w, low_high_limits.x, low_high_limits.y, worldAxis, posError);
	}

	// During velocity iterations the velocity term is dropped (velDt == 0).
	const PxReal velDt = isVelocityIteration ? 0.f : dt;

	// deltaF for TGS: position delta multiplied by the effective inertia.
	const PxReal deltaF0 = (posError.x - rigidVelAtPoint.x * velDt) * velMultiplierXYZ_invMassW.x * biasCoefficient;
	const PxReal deltaF1 = (posError.y - rigidVelAtPoint.y * velDt) * velMultiplierXYZ_invMassW.y * biasCoefficient;
	const PxReal deltaF2 = (posError.z - rigidVelAtPoint.z * velDt) * velMultiplierXYZ_invMassW.z * biasCoefficient;

	const PxVec3 deltaImpulse(deltaF0, deltaF1, deltaF2);

	deltaLinVel = deltaImpulse * velMultiplierXYZ_invMassW.w; // invMass lives in w
	deltaAngVel = raXn0 * deltaF0 + raXn1 * deltaF1 + raXn2 * deltaF2;

	return deltaImpulse;
}
|
||||
|
||||
// Convenience overload: gathers the per-constraint float4 data at `offset`
// from an SoA constraint buffer and forwards to the scalar TGS version.
template <typename ConstraintType>
PX_FORCE_INLINE __device__ PxVec3 calculateAttachmentDeltaImpulseTGS(PxU32 offset, const ConstraintType& constraint,
	const PxgVelocityPackTGS& vel0, const PxVec3& linDelta1, PxReal dt,
	PxReal biasCoefficient, bool isVelocityIteration, PxVec3& deltaLinVel,
	PxVec3& deltaAngVel)
{
	const float4& r0 = constraint.raXn0_biasW[offset];
	const float4& r1 = constraint.raXn1_biasW[offset];
	const float4& r2 = constraint.raXn2_biasW[offset];
	const float4& mult = constraint.velMultiplierXYZ_invMassW[offset];
	const float4& limits = constraint.low_high_limits[offset];
	const float4& axisAngle = constraint.axis_angle[offset];
	return calculateAttachmentDeltaImpulseTGS(r0, r1, r2, mult, limits, axisAngle,
		vel0, linDelta1, dt, biasCoefficient, isVelocityIteration, deltaLinVel, deltaAngVel);
}
|
||||
|
||||
#endif // __ATTACHMENTS_CUH__
|
||||
763
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/bvh.cuh
vendored
Normal file
763
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/bvh.cuh
vendored
Normal file
@@ -0,0 +1,763 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef __CU_BVH_CUH__
|
||||
#define __CU_BVH_CUH__
|
||||
|
||||
#include "foundation/PxVec3.h"
|
||||
#include "foundation/PxBounds3.h"
|
||||
#include "PxgBVH.h"
|
||||
#include "GuDistancePointTriangle.h"
|
||||
#include "foundation/PxMath.h"
|
||||
|
||||
using namespace physx;
|
||||
|
||||
// Spreads the low 10 bits of n so that bit k moves to bit position 3k,
// leaving two zero bits between consecutive source bits. Building block
// for 30-bit 3D Morton codes (see morton3 below).
PX_FORCE_INLINE __device__ PxU32 part1by2(PxU32 n)
{
	PxU32 bits = n;
	bits = (bits ^ (bits << 16)) & 0xff0000ff;
	bits = (bits ^ (bits << 8)) & 0x0300f00f;
	bits = (bits ^ (bits << 4)) & 0x030c30c3;
	bits = (bits ^ (bits << 2)) & 0x09249249;
	return bits;
}
|
||||
|
||||
// Maps a point with coordinates in [0, 1] to a Morton code of length
// 3*log2(Dim) bits. Coordinates outside the unit range are clamped to the grid.
template <PxI32 Dim>
PX_FORCE_INLINE __device__ PxU32 morton3(PxReal x, PxReal y, PxReal z)
{
	// Quantize each coordinate to a cell index in [0, Dim-1].
	const PxU32 cellX = PxClamp(PxI32(x*Dim), 0, Dim - 1);
	const PxU32 cellY = PxClamp(PxI32(y*Dim), 0, Dim - 1);
	const PxU32 cellZ = PxClamp(PxI32(z*Dim), 0, Dim - 1);

	// Interleave the bits as z|y|x per level.
	return (part1by2(cellZ) << 2) | (part1by2(cellY) << 1) | part1by2(cellX);
}
|
||||
|
||||
|
||||
// Return codes a traversal functor uses to steer queryBVH.
struct BvhTraversalControl
{
	enum Enum
	{
		eDontGoDeeper,       // skip this node's subtree and continue with the stack
		eGoDeeper,           // descend; the upper child is visited next, the lower child is deferred
		eGoDeeperLowerFirst, // descend; the lower child is visited next, the upper child is deferred
		eAbort               // stop the whole traversal immediately
	};
};
|
||||
|
||||
// Generic stack-based BVH traversal. The functor f is invoked for every visited
// node (receiving its packed lower/upper halves and its index) and returns a
// BvhTraversalControl value deciding how to proceed. `stack`/`stackSize` is
// caller-owned scratch space; when the stack is full, deferred siblings are
// silently dropped.
template <typename Func>
PX_FORCE_INLINE __device__ void queryBVH(const PxgBVH& bvh, Func& f, PxI32* stack, PxU32 stackSize)
{
	if (bvh.mNumNodes == 0)
		return;

	PxI32 nodeIndex = *bvh.mRootNode;
	PxI32 stackCount = 0;

	// Bounded loop instead of while(true) as a safety net against malformed trees.
	const PxU32 maxIter = bvh.mMaxNodes;
	for(PxU32 iter = 0; iter < maxIter; ++iter)
	{
		const PxgPackedNodeHalf lower = bvh.mNodeLowers[nodeIndex];
		const PxgPackedNodeHalf upper = bvh.mNodeUppers[nodeIndex];

		const BvhTraversalControl::Enum control = f(lower, upper, nodeIndex);

		if (control == BvhTraversalControl::eAbort)
			break;

		// lower.b marks leaf nodes; only internal nodes can be descended into.
		const bool wantsDescend = control == BvhTraversalControl::eGoDeeper || control == BvhTraversalControl::eGoDeeperLowerFirst;
		if (!lower.b && wantsDescend)
		{
			const bool lowerFirst = control == BvhTraversalControl::eGoDeeperLowerFirst;
			// Visit one child next and defer its sibling on the stack.
			const PxI32 visitNext = lowerFirst ? PxI32(lower.i) : PxI32(upper.i);
			const PxI32 deferred = lowerFirst ? PxI32(upper.i) : PxI32(lower.i);
			if (PxU32(stackCount) < stackSize)
				stack[stackCount++] = deferred;
			nodeIndex = visitNext;
			continue;
		}

		// Subtree done: pop the next pending node, or finish when none remain.
		if (stackCount == 0)
			break;
		nodeIndex = stack[--stackCount];
	}
}
|
||||
|
||||
// Packs one AABB corner plus a child index and a leaf flag into a PxgPackedNodeHalf.
PX_FORCE_INLINE __device__ PxgPackedNodeHalf makeNode(const PxVec3& bound, PxI32 child, bool leaf)
{
	PxgPackedNodeHalf node;
	node.x = bound.x;
	node.y = bound.y;
	node.z = bound.z;
	node.i = (PxU32)child;
	node.b = leaf ? 1u : 0u;
	return node;
}
|
||||
|
||||
// Variation of makeNode writing through a volatile pointer, used by
// buildHierarchy where nodes are published across threads.
PX_FORCE_INLINE __device__ void makeNode(volatile PxgPackedNodeHalf* dst, const PxVec3& bound, PxI32 child, bool leaf)
{
	dst->x = bound.x;
	dst->y = bound.y;
	dst->z = bound.z;
	dst->i = (PxU32)child;
	dst->b = leaf ? 1u : 0u;
}
|
||||
|
||||
// Packs a position into xyz and stores an arbitrary float payload in the fourth
// 32-bit slot by reinterpreting the node as a float4 (aliases the i/b fields).
PX_FORCE_INLINE __device__ PxgPackedNodeHalf makeNode(const PxVec3& bound, PxReal w)
{
	PxgPackedNodeHalf node;
	node.x = bound.x;
	node.y = bound.y;
	node.z = bound.z;
	reinterpret_cast<float4&>(node).w = w;
	return node;
}
|
||||
|
||||
// This bottom-up pass assigns left/right children and combines bounds to form the
// internal nodes. One thread is launched per leaf node; each thread computes its
// parent node and claims either the left or right child slot. The second child to
// arrive at a parent (detected via atomicAdd on numChildren) completes the parent
// and continues walking up the hierarchy, so exactly one thread finishes each
// internal node. The functor f allows computing additional per-node data.
template <typename Func>
PX_FORCE_INLINE __device__ void buildHierarchy(PxI32 n, PxI32* root, PxU32* maxTreeDepth, const PxReal* PX_RESTRICT deltas, PxI32* PX_RESTRICT numChildren,
	volatile PxI32* PX_RESTRICT rangeLefts, volatile PxI32* PX_RESTRICT rangeRights, volatile PxgPackedNodeHalf* PX_RESTRICT lowers, volatile PxgPackedNodeHalf* PX_RESTRICT uppers, Func& f)
{
	PxI32 index = blockDim.x*blockIdx.x + threadIdx.x;

	// Number of parent-completion steps this thread performed (its path length up the tree).
	PxU32 maxDepth = 0;

	if (index < n)
	{
		// Internal nodes are stored after the n leaf nodes.
		const PxI32 internalOffset = n;

		for (;;)
		{
			PxI32 left = rangeLefts[index];
			PxI32 right = rangeRights[index];

			// check if we are the root node, if so then store out our index and terminate
			if (left == 0 && right == n - 1)
			{
				*root = index;
				*maxTreeDepth = maxDepth;
				break;
			}

			PxI32 childCount = 0;

			PxI32 parent;

			// Choose the parent by comparing the split deltas adjacent to this node's range.
			if (left == 0 || (right != n - 1 && deltas[right] < deltas[left - 1]))
			{
				parent = right + internalOffset;

				// set parent left child
				lowers[parent].i = index;
				rangeLefts[parent] = left;

				childCount = atomicAdd(&numChildren[parent], 1);
			}
			else
			{
				parent = left + internalOffset - 1;

				// set parent right child
				uppers[parent].i = index;
				rangeRights[parent] = right;

				childCount = atomicAdd(&numChildren[parent], 1);
			}

			// ensure above writes are visible to all threads
			__threadfence();

			// if we are the last thread (such that the parent node is now complete)
			// then update its bounds and move onto the next parent in the hierarchy
			if (childCount == 1)
			{
				++maxDepth;

				const PxI32 leftChild = lowers[parent].i;
				const PxI32 rightChild = uppers[parent].i;

				//TODO: float4 loads as in queries?
				volatile PxgPackedNodeHalf& lowerLeft = lowers[leftChild];
				PxVec3 leftLower = PxVec3(lowerLeft.x,
					lowerLeft.y,
					lowerLeft.z);

				PxVec3 leftUpper = PxVec3(uppers[leftChild].x,
					uppers[leftChild].y,
					uppers[leftChild].z);

				volatile PxgPackedNodeHalf& lowerRight = lowers[rightChild];
				PxVec3 rightLower = PxVec3(lowerRight.x,
					lowerRight.y,
					lowerRight.z);

				PxVec3 rightUpper = PxVec3(uppers[rightChild].x,
					uppers[rightChild].y,
					uppers[rightChild].z);

				// union of child bounds
				PxVec3 lower = leftLower.minimum(rightLower);
				PxVec3 upper = leftUpper.maximum(rightUpper);

				// write new BVH nodes
				makeNode(lowers + parent, lower, leftChild, false);
				makeNode(uppers + parent, upper, rightChild, false);

				//Allows to compute additional data per node
				f(parent, leftChild, lowerLeft, rightChild, lowerRight);

				// move onto processing the parent
				index = parent;
			}
			else
			{
				// parent not ready (we are the first child), terminate thread
				break;
			}
		}
	}
}
|
||||
|
||||
// No-op per-node callback for buildHierarchy, used when no auxiliary
// per-node data needs to be computed alongside the BVH.
struct EmptyBuilder
{
	PX_FORCE_INLINE __device__ EmptyBuilder() {}
	PX_FORCE_INLINE __device__ void operator()(PxI32 parentId, PxI32 childLeftId, volatile PxgPackedNodeHalf& childLeft, PxI32 childRightId, volatile PxgPackedNodeHalf& childRight)
	{}
};
|
||||
|
||||
// Overload of buildHierarchy that computes no additional per-node data.
PX_FORCE_INLINE __device__ void buildHierarchy(PxI32 n, PxI32* root, PxU32* maxTreeDepth, const PxReal* PX_RESTRICT deltas, PxI32* PX_RESTRICT numChildren,
	volatile PxI32* PX_RESTRICT rangeLefts, volatile PxI32* PX_RESTRICT rangeRights, volatile PxgPackedNodeHalf* PX_RESTRICT lowers, volatile PxgPackedNodeHalf* PX_RESTRICT uppers)
{
	EmptyBuilder noOp;
	buildHierarchy(n, root, maxTreeDepth, deltas, numChildren, rangeLefts, rangeRights, lowers, uppers, noOp);
}
|
||||
|
||||
|
||||
|
||||
// Branchless slab test of a ray against an AABB. rcp_dir must hold the
// componentwise reciprocal of the ray direction. Outputs the entry/exit
// parameters lmin/lmax; the ray hits when the interval is non-empty and the
// exit point is not behind the origin.
__device__ inline bool intersectRayAABBFast(const PxVec3& pos, const PxVec3& rcp_dir, const PxVec3& min, const PxVec3& max, PxReal& lmin, PxReal& lmax)
{
	// X slab initializes the parameter interval.
	PxReal t0 = (min.x - pos.x) * rcp_dir.x;
	PxReal t1 = (max.x - pos.x) * rcp_dir.x;
	lmin = PxMin(t0, t1);
	lmax = PxMax(t0, t1);

	// Y slab narrows it.
	t0 = (min.y - pos.y) * rcp_dir.y;
	t1 = (max.y - pos.y) * rcp_dir.y;
	lmin = PxMax(PxMin(t0, t1), lmin);
	lmax = PxMin(PxMax(t0, t1), lmax);

	// Z slab narrows it further.
	t0 = (min.z - pos.z) * rcp_dir.z;
	t1 = (max.z - pos.z) * rcp_dir.z;
	lmin = PxMax(PxMin(t0, t1), lmin);
	lmax = PxMin(PxMax(t0, t1), lmax);

	// Bitwise & on the two bools keeps the final test branchless.
	const bool hit = (lmax >= 0.f) & (lmax >= lmin);
	return hit;
}
|
||||
|
||||
// Returns the index (0/1/2 for x/y/z) of the component of dir with the
// largest absolute value. Ties are broken in favour of the lower index.
PX_FORCE_INLINE __device__ PxU32 maxAbsDim(PxVec3 dir)
{
	const PxReal ax = PxAbs(dir.x);
	const PxReal ay = PxAbs(dir.y);
	const PxReal az = PxAbs(dir.z);
	if (ax >= ay && ax >= az)
		return 0;
	return (ay >= az) ? 1 : 2;
}
|
||||
|
||||
//Specialized implementation guaranteeing watertightness taken from paper "Watertight Ray/Triangle Intersection"
//https://jcgt.org/published/0002/01/05/paper.pdf
//NOTE(review): the exact floating-point evaluation order below is what provides the
//watertightness guarantee - do not reorder or algebraically "simplify" the arithmetic.
//Two-sided: hits are reported regardless of triangle winding. Outputs the hit
//distance t and the barycentric coordinates u/v/w on success.
__device__ inline bool intersectRayTriTwoSidedWatertight(const PxVec3& org, const PxVec3& dir, const PxVec3& a,
	const PxVec3& b, const PxVec3& c, PxReal& t, PxReal& u, PxReal& v, PxReal& w)
{
	//Calculate the dimension where the ray direction is maximal
	PxU32 kz = maxAbsDim(dir);
	PxU32 kx = kz + 1; if (kx == 3) kx = 0;
	PxU32 ky = kx + 1; if (ky == 3) ky = 0;

	//Swap kx and ky dimension to preserve winding direction of triangles
	if (dir[kz] < 0.0f)
		PxSwap(kx, ky);

	//Calculate shear constants
	PxReal Sx = dir[kx] / dir[kz];
	PxReal Sy = dir[ky] / dir[kz];
	PxReal Sz = 1.0f / dir[kz];


	//Calculate vertices relative to ray origin
	const PxVec3 A = a - org;
	const PxVec3 B = b - org;
	const PxVec3 C = c - org;

	//Perform shear and scale of vertices
	const PxReal Ax = A[kx] - Sx * A[kz];
	const PxReal Ay = A[ky] - Sy * A[kz];
	const PxReal Bx = B[kx] - Sx * B[kz];
	const PxReal By = B[ky] - Sy * B[kz];
	const PxReal Cx = C[kx] - Sx * C[kz];
	const PxReal Cy = C[ky] - Sy * C[kz];

	//Calculate scaled barycentric coordinates
	PxReal U = Cx * By - Cy * Bx;
	PxReal V = Ax * Cy - Ay * Cx;
	PxReal W = Bx * Ay - By * Ax;

	//Fallback to test against edges using double precision
	//Happens only in about 1 case out of 1mio tests according to the paper "Watertight Ray/Triangle Intersection"
	if (U == 0.0f || V == 0.0f || W == 0.0f)
	{
		double CxBy = (double)Cx*(double)By;
		double CyBx = (double)Cy*(double)Bx;
		U = (PxReal)(CxBy - CyBx);
		double AxCy = (double)Ax*(double)Cy;
		double AyCx = (double)Ay*(double)Cx;
		V = (PxReal)(AxCy - AyCx);
		double BxAy = (double)Bx*(double)Ay;
		double ByAx = (double)By*(double)Ax;
		W = (PxReal)(BxAy - ByAx);
	}

	//Perform edge tests. Moving this test before and at the end of the previous conditional gives higher performance
	//(mixed signs mean the shear-projected origin lies outside the triangle)
	if ((U < 0.0f || V < 0.0f || W < 0.0f) &&
		(U > 0.0f || V > 0.0f || W > 0.0f))
		return false;

	//Calculate determinant
	PxReal det = U + V + W;
	if (det == 0.0f)
		return false;

	//Calculate scaled z-coordinates of vertices and use them to calculate the hit distance
	const PxReal Az = Sz * A[kz];
	const PxReal Bz = Sz * B[kz];
	const PxReal Cz = Sz * C[kz];
	const PxReal T = U * Az + V * Bz + W * Cz;

	//Normalize U, V, W and T
	const PxReal rcpDet = 1.0f / det;
	u = U * rcpDet;
	v = V * rcpDet;
	w = W * rcpDet;
	t = T * rcpDet;

	return true;
}
|
||||
|
||||
// Winding-number contribution of triangle (triA, triB, triC) seen from
// queryPoint: the triangle's signed solid angle divided by 4*pi, computed via
// an atan2 formulation, so a closed mesh sums to an integer winding number.
PX_FORCE_INLINE PxReal __device__ windingNumberForTriangle(const PxVec3& triA, const PxVec3& triB, const PxVec3& triC, const PxVec3& queryPoint)
{
	// Vertices relative to the query point.
	const PxVec3 a = triA - queryPoint;
	const PxVec3 b = triB - queryPoint;
	const PxVec3 c = triC - queryPoint;

	const PxReal la = a.magnitude();
	const PxReal lb = b.magnitude();
	const PxReal lc = c.magnitude();

	const PxReal y = a.dot(b.cross(c)); // signed volume term
	const PxReal x = (la * lb * lc + a.dot(b) * lc + b.dot(c) * la + c.dot(a) * lb);

	const PxReal omega = PxAtan2(y, x);
	return (0.5f / PxPi) * omega;
}
|
||||
|
||||
// First-order far-field approximation of the winding number contributed by a
// whole cluster of triangles, using its area-weighted centroid and summed normal.
PX_FORCE_INLINE PxReal __device__ firstOrderClusterApproximation(const PxVec3& weightedCentroid, const PxVec3& weightedNormalSum,
	const PxVec3& evaluationPoint)
{
	const PxVec3 toCentroid = weightedCentroid - evaluationPoint;
	const PxReal dist = toCentroid.magnitude();
	const PxReal scale = (0.25f / PxPi) / (dist * dist * dist);
	return scale * weightedNormalSum.dot(toCentroid);
}
|
||||
|
||||
// Radius of a sphere centered at newSphereCenter that fully encloses both input spheres.
PX_FORCE_INLINE PxReal __device__ radiusOfSphereContainingSubSpheres(const PxVec3& newSphereCenter, const PxVec3& centerA, PxReal radiusA, const PxVec3& centerB, PxReal radiusB)
{
	const PxReal reachA = (centerA - newSphereCenter).magnitude() + radiusA;
	const PxReal reachB = (centerB - newSphereCenter).magnitude() + radiusB;
	return PxMax(reachA, reachB);
}
|
||||
|
||||
// Non-normalized triangle normal (edge cross product); its length is twice the triangle area.
PX_FORCE_INLINE PxVec3 __device__ triangleNormal(const PxVec3& triA, const PxVec3& triB, const PxVec3& triC)
{
	const PxVec3 edgeAB = triB - triA;
	const PxVec3 edgeAC = triC - triA;
	return edgeAB.cross(edgeAC);
}
|
||||
|
||||
// Arithmetic mean of the three triangle vertices.
PX_FORCE_INLINE PxVec3 __device__ triangleCentroid(const PxVec3& triA, const PxVec3& triB, const PxVec3& triC)
{
	const PxReal third = 1.0f / 3.0f;
	return third * (triA + triB + triC);
}
|
||||
|
||||
// Builds the leaf-level winding-number cluster data for a single triangle:
// area-weighted normal, total area, area-weighted centroid, and the radius of
// a centroid-centered sphere containing all three vertices.
PX_FORCE_INLINE __device__ PxgWindingClusterApproximation createWindingClusterApproximation(const PxVec3* PX_RESTRICT vertices, const PxU32* PX_RESTRICT triangle)
{
	const PxVec3& a = vertices[triangle[0]];
	const PxVec3& b = vertices[triangle[1]];
	const PxVec3& c = vertices[triangle[2]];

	PxgWindingClusterApproximation result;
	// Half the edge cross product: a normal whose length equals the triangle area.
	result.mWeightedNormalSum = 0.5f * triangleNormal(a, b, c);
	result.mAreaSum = result.mWeightedNormalSum.magnitude();

	const PxVec3 centroid = triangleCentroid(a, b, c);
	result.mRadius = PxSqrt(PxMax(PxMax((a - centroid).magnitudeSquared(),
		(b - centroid).magnitudeSquared()), (c - centroid).magnitudeSquared()));
	result.mCentroidTimesArea = result.mAreaSum * centroid;
	return result;
}
|
||||
|
||||
// Recovers the cluster centroid from the area-weighted centroid sum.
PX_FORCE_INLINE __device__ PxVec3 clusterCentroid(const PxgWindingClusterApproximation& c)
{
	const PxReal invArea = 1.0f / c.mAreaSum;
	return c.mCentroidTimesArea * invArea;
}
|
||||
|
||||
// Merges two child clusters into result. The sums are simply additive; the
// radius is a conservative bound on a sphere containing both child spheres
// (possibly slightly too large, which is fine for the winding number algorithm).
PX_FORCE_INLINE __device__ void combineClusters(const PxgWindingClusterApproximation& a, const PxgWindingClusterApproximation& b, PxgWindingClusterApproximation& result)
{
	result.mWeightedNormalSum = a.mWeightedNormalSum + b.mWeightedNormalSum;
	result.mAreaSum = a.mAreaSum + b.mAreaSum;
	result.mCentroidTimesArea = a.mCentroidTimesArea + b.mCentroidTimesArea;
	result.mRadius = radiusOfSphereContainingSubSpheres(clusterCentroid(result),
		clusterCentroid(a), a.mRadius, clusterCentroid(b), b.mRadius);
}
|
||||
|
||||
// Maps a BVH internal-node index to an index into the winding-cluster array.
// Clusters are not stored for leaf nodes: the tree is built such that the
// leaf nodes occupy the first numTriangles slots, so the cluster array only
// covers internal nodes.
PX_FORCE_INLINE __device__ PxI32 getClusterIndex(PxI32 bvhNodeIndex, PxU32 numTriangles)
{
	// Explicit cast keeps the subtraction in signed arithmetic instead of
	// wrapping through unsigned (PxI32 - PxU32 promotes to unsigned).
	const PxI32 result = bvhNodeIndex - PxI32(numTriangles);
	assert(result >= 0);
	// Cast avoids a signed/unsigned comparison: a negative result would convert
	// to a huge unsigned value and silently satisfy `result < numTriangles`.
	assert(result < PxI32(numTriangles));
	return result;
}
|
||||
|
||||
// Callback for buildHierarchy that constructs the winding-number cluster
// hierarchy at the same time as the BVH: leaf children are evaluated on the
// fly from their triangle, internal children read their previously stored cluster.
struct WindingClusterBuilder
{
	PxgWindingClusterApproximation* PX_RESTRICT clusters;
	const PxVec3* PX_RESTRICT vertices;
	const PxU32* PX_RESTRICT indices;
	PxU32 numTriangles;

	PX_FORCE_INLINE __device__ WindingClusterBuilder(PxgWindingClusterApproximation* PX_RESTRICT clusters, const PxVec3* PX_RESTRICT vertices, const PxU32* PX_RESTRICT indices, PxU32 numTriangles)
		: clusters(clusters), vertices(vertices), indices(indices), numTriangles(numTriangles)
	{
	}

	PX_FORCE_INLINE __device__ void operator()(PxI32 parentId, PxI32 childLeftId, volatile PxgPackedNodeHalf& childLeft, PxI32 childRightId, volatile PxgPackedNodeHalf& childRight)
	{
		// b marks leaves: build the single-triangle cluster directly; otherwise
		// fetch the cluster already computed for the internal child.
		const PxgWindingClusterApproximation approxLeft = childLeft.b ?
			createWindingClusterApproximation(vertices, &indices[3 * childLeft.i]) :
			clusters[getClusterIndex(childLeftId, numTriangles)];
		const PxgWindingClusterApproximation approxRight = childRight.b ?
			createWindingClusterApproximation(vertices, &indices[3 * childRight.i]) :
			clusters[getClusterIndex(childRightId, numTriangles)];
		combineClusters(approxLeft, approxRight, clusters[getClusterIndex(parentId, numTriangles)]);
	}
};
|
||||
|
||||
|
||||
struct WindingNumberTraversal
|
||||
{
|
||||
public:
|
||||
PxReal mWindingNumber = 0;
|
||||
const PxU32* PX_RESTRICT mTriangles;
|
||||
PxU32 mNumTriangles;
|
||||
const PxVec3* PX_RESTRICT mPoints;
|
||||
const PxgWindingClusterApproximation* PX_RESTRICT mClusters;
|
||||
PxVec3 mQueryPoint;
|
||||
PxReal mDistanceThresholdBeta;
|
||||
|
||||
__device__ WindingNumberTraversal()
|
||||
{
|
||||
}
|
||||
|
||||
__device__ WindingNumberTraversal(const PxU32* PX_RESTRICT triangles, PxU32 numTriangles, const PxVec3* PX_RESTRICT points,
|
||||
const PxgWindingClusterApproximation* PX_RESTRICT clusters, const PxVec3& queryPoint, PxReal distanceThresholdBeta = 2.0f)
|
||||
: mTriangles(triangles), mNumTriangles(numTriangles), mPoints(points), mClusters(clusters), mQueryPoint(queryPoint), mDistanceThresholdBeta(distanceThresholdBeta)
|
||||
{
|
||||
}
|
||||
|
||||
__device__ inline BvhTraversalControl::Enum operator()(const PxgPackedNodeHalf& lower, const PxgPackedNodeHalf& upper, PxI32 nodeIndex)
|
||||
{
|
||||
if (lower.b)
|
||||
{
|
||||
const PxU32* tri = &mTriangles[3 * lower.i];
|
||||
mWindingNumber += windingNumberForTriangle(mPoints[tri[0]], mPoints[tri[1]], mPoints[tri[2]], mQueryPoint);
|
||||
return BvhTraversalControl::eDontGoDeeper;
|
||||
}
|
||||
const PxgWindingClusterApproximation& cluster = mClusters[getClusterIndex(nodeIndex, mNumTriangles)];
|
||||
const PxReal distSquared = (mQueryPoint - clusterCentroid(cluster)).magnitudeSquared();
|
||||
const PxReal threshold = mDistanceThresholdBeta * cluster.mRadius;
|
||||
if (distSquared > threshold * threshold)
|
||||
{
|
||||
mWindingNumber += firstOrderClusterApproximation(clusterCentroid(cluster), cluster.mWeightedNormalSum, mQueryPoint);
|
||||
return BvhTraversalControl::eDontGoDeeper;
|
||||
}
|
||||
return BvhTraversalControl::eGoDeeper;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// Returns the negated dot product of the ray direction with the triangle's
// geometric normal (b - a) x (c - a): positive when the ray direction points
// against the normal, negative when it points along it. When 'normalize' is
// set, the normal is scaled to unit length first (skipped for a degenerate
// triangle with zero-area normal).
PX_FORCE_INLINE __device__ PxReal rayTriangleSign(const PxVec3& dir, const PxVec3& a,
	const PxVec3& b, const PxVec3& c, bool normalize)
{
	const PxVec3 edge0 = b - a;
	const PxVec3 edge1 = c - a;
	PxVec3 triNormal = edge0.cross(edge1);

	if (normalize)
	{
		const PxReal lenSq = triNormal.magnitudeSquared();
		if (lenSq > 0.0f)
			triNormal = triNormal * (1.0f / PxSqrt(lenSq));
	}

	return -dir.dot(triNormal);
}
|
||||
|
||||
// BVH traversal functor that finds the closest ray/triangle-mesh intersection
// along 'dir' from 'origin'. Besides the hit parameter t, it records the sign
// of dir against the hit triangle's normal (closestDotProduct) and whether the
// hit landed exactly on a triangle edge (closestPointOnTriangleEdge). When
// includeNegativeRayDirection is set, hits behind the origin are considered
// too, folded onto positive t with a flipped sign.
struct ClosestRayIntersectionTraversal
{
	const PxVec3* PX_RESTRICT meshVertices;     // Mesh vertex positions.
	const PxU32* PX_RESTRICT meshIndices;       // Triangle index buffer, 3 indices per triangle.

	const PxVec3 origin;                        // Ray origin.
	const PxVec3 dir;                           // Ray direction (not required to be unit length here).
	const PxVec3 rcpDir;                        // Componentwise reciprocal of dir, for the AABB slab test.

	PxReal closestT;                            // Ray parameter of the closest accepted hit; FLT_MAX if none.
	PxReal closestDotProduct;                   // Sign of dir vs. normal at the closest hit (see rayTriangleSign).
	bool includeNegativeRayDirection;           // Accept hits at negative t (mirrored to positive t, sign flipped).
	bool closestPointOnTriangleEdge;            // True if the closest hit's barycentric coords put it on an edge.

	// Note: rcpDir components are infinite for zero direction components; the
	// watertight slab/triangle tests are expected to handle that.
	__device__ inline ClosestRayIntersectionTraversal(const PxVec3* PX_RESTRICT meshVertices, const PxU32* PX_RESTRICT meshIndices, const PxVec3& start, const PxVec3& dir, bool includeNegativeRayDirection) :
		meshVertices(meshVertices), meshIndices(meshIndices),
		origin(start),
		dir(dir),
		rcpDir(1.0f / dir.x, 1.0f / dir.y, 1.0f / dir.z),
		closestT(FLT_MAX),
		closestDotProduct(0.0f),
		includeNegativeRayDirection(includeNegativeRayDirection),
		closestPointOnTriangleEdge(false)
	{
	}

	// True once any intersection has been accepted.
	PX_FORCE_INLINE __device__ bool hasHit()
	{
		return closestT < FLT_MAX;
	}

	// Per-node visitor callback; returns whether the traversal should descend.
	__device__ inline BvhTraversalControl::Enum operator()(const PxgPackedNodeHalf& lower, const PxgPackedNodeHalf& upper, PxI32 nodeIndex)
	{
		PxReal t;
		if (lower.b)
		{
			// test each element of the rigid body mesh
			PxU32 tri = lower.i;
			PxVec3 a = meshVertices[meshIndices[tri * 3 + 0]];
			PxVec3 b = meshVertices[meshIndices[tri * 3 + 1]];
			PxVec3 c = meshVertices[meshIndices[tri * 3 + 2]];

			PxReal u, v, w, s;
			PxVec3 n; // NOTE(review): unused in this body.

			if (intersectRayTriTwoSidedWatertight(origin, dir, a, b, c, t, u, v, w))
			{
				s = rayTriangleSign(dir, a, b, c, true);
				if (includeNegativeRayDirection)
				{
					// Mirror back-side hits onto positive t; flip the facing
					// sign so it stays consistent with the mirrored direction.
					if (t < 0.0f)
					{
						t = -t;
						s = -s;
					}
				}
				if (t > 0.0f && t < closestT)
				{
					closestT = t;
					closestDotProduct = s;
					// Any zero barycentric coordinate means the hit is on an edge (or vertex).
					closestPointOnTriangleEdge = u == 0.0f || v == 0.0f || w == 0.0f;
				}
			}

			return BvhTraversalControl::eDontGoDeeper;
		}

		//TODO: Does intersectRayAABBFast work for negative t?
		PxReal tMax;
		if (intersectRayAABBFast(origin, rcpDir, PxVec3(lower.x, lower.y, lower.z), PxVec3(upper.x, upper.y, upper.z), t, tMax))
		{
			if (includeNegativeRayDirection)
			{
				// Box entirely behind the origin: use the mirrored exit distance
				// as the conservative entry estimate.
				if (tMax < 0.0f)
					t = -tMax;
			}
			// Only descend if this box could still contain a closer hit.
			if (t < closestT)
				return BvhTraversalControl::eGoDeeper;
		}
		return BvhTraversalControl::eDontGoDeeper;
	}
};
|
||||
|
||||
// BVH traversal functor computing the squared distance from mQueryPoint to the
// closest point on a triangle mesh. Subtrees whose bounding box is already
// farther than the best distance found so far are culled.
struct ClosestDistanceToTriangleMeshTraversal
{
public:
	const PxU32* PX_RESTRICT mTriangles;    // Triangle index buffer, 3 indices per triangle.
	const PxVec3* PX_RESTRICT mPoints;      // Mesh vertex positions.
	PxVec3 mQueryPoint;                     // Point whose distance to the mesh is sought.
	PxReal mClosestDistanceSquared;         // Best (smallest) squared distance found so far.

	__device__ inline ClosestDistanceToTriangleMeshTraversal()
	{
	}

	// mClosestDistanceSquared starts at a huge sentinel (1e11) so the first
	// candidate triangle always wins.
	__device__ inline ClosestDistanceToTriangleMeshTraversal(const PxU32* PX_RESTRICT triangles, const PxVec3* PX_RESTRICT points, const PxVec3& queryPoint)
		: mTriangles(triangles), mPoints(points), mQueryPoint(queryPoint), mClosestDistanceSquared(100000000000.0f)
	{
	}

	// Squared distance from 'point' to the AABB [minimum, maximum]
	// (zero when the point is inside the box).
	PX_FORCE_INLINE __device__ PxReal distancePointBoxSquared(const PxVec3& minimum, const PxVec3& maximum, const PxVec3& point)
	{
		// Clamp the point into the box to get the closest point on/in it.
		PxVec3 closestPt = minimum.maximum(maximum.minimum(point));
		return (closestPt - point).magnitudeSquared();
	}

	// Per-node visitor callback; returns whether the traversal should descend.
	__device__ inline BvhTraversalControl::Enum operator()(const PxgPackedNodeHalf& lower, const PxgPackedNodeHalf& upper, PxI32 nodeIndex)
	{
		// Cull: this node's box cannot contain anything closer than the current best.
		if (distancePointBoxSquared(PxVec3(lower.x, lower.y, lower.z), PxVec3(upper.x, upper.y, upper.z), mQueryPoint) >= mClosestDistanceSquared)
			return BvhTraversalControl::eDontGoDeeper;

		if (lower.b)
		{
			// Leaf node: exact point-triangle distance.
			const PxU32* tri = &mTriangles[3 * lower.i];
			const PxVec3 a = mPoints[tri[0]];
			const PxVec3 b = mPoints[tri[1]];
			const PxVec3 c = mPoints[tri[2]];

			//PxReal s, t;
			PxVec3 closestPt = Gu::closestPtPointTriangle2UnitBox(mQueryPoint, a, b, c); // closestPtPointTriangle(mQueryPoint, a, b, c, s, t);
			PxReal distSq = (closestPt - mQueryPoint).magnitudeSquared();
			if (distSq < mClosestDistanceSquared)
			{
				mClosestDistanceSquared = distSq;
			}

			return BvhTraversalControl::eDontGoDeeper;
		}

		return BvhTraversalControl::eGoDeeper;
	}
};
|
||||
|
||||
//Evaluates the winding number and the closest distance in a single query. Might be faster in some scenarios than two separate queries.
struct WindingNumberAndDistanceTraversal
{
public:
	const PxU32* PX_RESTRICT mTriangles;                // Triangle index buffer, 3 indices per triangle.
	PxU32 mNumTriangles;                                // Number of triangles in the mesh.
	PxReal mWindingNumber;                              // Accumulated winding number (traversal result).
	const PxVec3* PX_RESTRICT mPoints;                  // Mesh vertex positions.
	const PxgWindingClusterApproximation* mClusters;    // Per-branch-node cluster approximations.
	PxVec3 mQueryPoint;                                 // Point at which both quantities are evaluated.
	PxReal mDistanceThresholdBeta;                      // Winding-number accuracy/speed trade-off.

	PxReal mClosestDistance;                            // Best (smallest) distance found so far. Note: a distance, not squared.

	__device__ WindingNumberAndDistanceTraversal()
	{
	}

	// mClosestDistance starts at a huge sentinel (1e7) so the first candidate
	// triangle always wins.
	__device__ WindingNumberAndDistanceTraversal(const PxU32* PX_RESTRICT triangles, PxU32 numTriangles, const PxVec3* PX_RESTRICT points,
		const PxgWindingClusterApproximation* clusters, const PxVec3& queryPoint, PxReal distanceThresholdBeta = 2.0f)
		: mTriangles(triangles), mNumTriangles(numTriangles), mWindingNumber(0), mPoints(points), mClusters(clusters), mQueryPoint(queryPoint), mDistanceThresholdBeta(distanceThresholdBeta),
		mClosestDistance(10000000)
	{
	}

	// Leaf handling: exact triangle winding number plus exact point-triangle distance.
	__device__ inline void evaluateLeaf(PxU32 payloadIndex)
	{
		const PxU32* tri = &mTriangles[3 * payloadIndex];
		const PxVec3 a = mPoints[tri[0]];
		const PxVec3 b = mPoints[tri[1]];
		const PxVec3 c = mPoints[tri[2]];
		mWindingNumber += windingNumberForTriangle(a, b, c, mQueryPoint);

		//PxReal s, t;
		PxVec3 closestPt = Gu::closestPtPointTriangle2UnitBox(mQueryPoint, a, b, c); //closestPtPointTriangle(mQueryPoint, a, b, c, s, t);
		PxReal distSq = (closestPt - mQueryPoint).magnitudeSquared();
		// Compare squared values to avoid the sqrt unless we actually improve.
		if (distSq < mClosestDistance * mClosestDistance)
		{
			mClosestDistance = PxSqrt(distSq);
		}
	}

	//Do not pass leaf nodes into this function
	__device__ inline BvhTraversalControl::Enum evaluateBranchNode(const PxgPackedNodeHalf& lower, const PxgPackedNodeHalf& upper, PxI32 nodeIndex)
	{
		const PxgWindingClusterApproximation& cluster = mClusters[getClusterIndex(nodeIndex, mNumTriangles)];
		const PxReal dist = (mQueryPoint - clusterCentroid(cluster)).magnitude();
		// The cluster's bounding sphere might contain a triangle closer than
		// the best distance so far, so the subtree cannot be skipped.
		if (dist - cluster.mRadius < mClosestDistance)
		{
			//Deeper traversal is required
			return BvhTraversalControl::eGoDeeper;
		}
		else if (dist > mDistanceThresholdBeta * cluster.mRadius)
		{
			// Far enough away: the first-order approximation replaces the whole subtree.
			mWindingNumber += firstOrderClusterApproximation(clusterCentroid(cluster), cluster.mWeightedNormalSum, mQueryPoint);
			return BvhTraversalControl::eDontGoDeeper;
		}
		return BvhTraversalControl::eGoDeeper;
	}

	// Per-node visitor callback; returns whether the traversal should descend.
	__device__ inline BvhTraversalControl::Enum operator()(const PxgPackedNodeHalf& lower, const PxgPackedNodeHalf& upper, PxI32 nodeIndex)
	{
		if (lower.b)
		{
			evaluateLeaf(lower.i);
			return BvhTraversalControl::eDontGoDeeper;
		}
		return evaluateBranchNode(lower, upper, nodeIndex);
	}
};
|
||||
|
||||
|
||||
#endif
|
||||
640
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/deformableUtils.cuh
vendored
Normal file
640
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/deformableUtils.cuh
vendored
Normal file
@@ -0,0 +1,640 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef __DEFORMABLE_UTILS_CUH__
|
||||
#define __DEFORMABLE_UTILS_CUH__
|
||||
|
||||
#include "foundation/PxMathUtils.h"
|
||||
#include "PxsMaterialCombiner.h"
|
||||
#include "PxgFEMCore.h"
|
||||
#include "PxgFEMCloth.h"
|
||||
#include "PxgSoftBody.h"
|
||||
#include "PxgArticulation.h"
|
||||
#include "PxgBodySim.h"
|
||||
#include "dataReadWriteHelper.cuh"
|
||||
|
||||
using namespace physx;
|
||||
|
||||
//This code is based on Matthias Muller's paper: "A robust method to extract the rotational part of deformations".
//It iteratively refines the quaternion q so that the rotation it represents best matches the
//deformation gradient A - an alternative to extracting the rotation via polar decomposition.
//q must hold a reasonable initial guess (it is refined in place); maxIter bounds the iteration count.
__device__ inline void extractRotation(const PxMat33 &A, PxQuat& q, int maxIter)
{
	const PxReal eps = 1.0e-6f;
	for (int iter = 0; iter < maxIter; iter++)
	{
		PxMat33 R(q);
		// Rotation residual: sum of cross products between the current rotation's
		// columns and the target matrix's columns.
		PxVec3 omega = R.column0.cross(A.column0) + R.column1.cross(A.column1) + R.column2.cross(A.column2);

		// Normalize by the aligned components (plus eps to avoid division by zero).
		omega *= 1.0f / (PxAbs(R.column0.dot(A.column0) + R.column1.dot(A.column1) + R.column2.dot(A.column2)) + eps);

		// Apply the correction as an axis-angle rotation of magnitude w.
		const float w = omega.normalize();
		const PxQuat tempQ = PxQuat(w, omega);
		q = tempQ * q;
		q = q.getNormalized();

		// early-exit after one update (instead of before) since we've already done the expensive computations to find w
		if (w < eps)
			break;
	}
}
|
||||
|
||||
// Extracts the rotational part of the deformation gradient F into the
// quaternion q (refined in place) using an APD-style Newton iteration: each
// step builds the rotation-error gradient and its symmetric 3x3 Hessian in the
// frame of the current estimate, solves for the axis-angle step omega, and
// applies it as an incremental quaternion. Converged when the incremental
// rotation's scalar part w exceeds 1 - eps (i.e. the step is negligible).
// q must hold a reasonable initial guess; maxIter bounds the iteration count.
//
// Fix vs. previous revision: all floating-point literals are now single
// precision (the original used double constants such as 1.0e-6, 1e-16 and
// 1.0 - l_omega2, silently promoting parts of the loop to double), and the
// double-precision fabs() was replaced by PxAbs(), matching extractRotation.
__device__ inline void sb_extractRotationAPD(const PxMat33 &F, PxQuat& q, int maxIter)
{
	const PxReal eps = 1.0e-6f;
	const PxReal threshold = 1.0f - eps;
	//Use properties of Rodrigues' formula to detect the degenerate case of an exact 180 deg rotation:
	//for rotation matrices, trace(R) = 1 + 2*cos(theta), so the trace approaches -1 there.
	for (int i = 0; i < maxIter; ++i)
	{
		// Express F in the frame of the current rotation estimate.
		PxMat33 B = PxMat33(q.getConjugate()) * F;
		// Gradient of the rotation error (skew-symmetric part of B).
		PxVec3 gradient = PxVec3(B.column2.y - B.column1.z, B.column0.z - B.column2.x, B.column1.x - B.column0.y);
		if (i == 0 && gradient.magnitudeSquared() < 1e-16f)
		{
			//If the loop got stuck already in the first iteration (e.g. a rotation of 180 deg around an
			//arbitrary axis), distort the gradient. Any non-zero gradient should work; this one
			//corresponds to a 90 degree rotation around the x axis.
			gradient = PxVec3(-2.0f, 0.0f, 0.0f);
		}
		// Entries of the symmetric 3x3 Hessian of the error function.
		PxReal h00 = B.column1.y + B.column2.z;
		PxReal h11 = B.column0.x + B.column2.z;
		PxReal h22 = B.column0.x + B.column1.y;
		PxReal h01 = -0.5f * (B.column1.x + B.column0.y);
		PxReal h02 = -0.5f * (B.column2.x + B.column0.z);
		PxReal h12 = -0.5f * (B.column2.y + B.column1.z);
		PxReal detH = -h02 * h02 * h11 + 2.0f * h01 * h02 * h12 - h00 * h12 * h12 - h01 * h01 * h22 + h00 * h11 * h22;
		// Newton step omega = -H^-1 * gradient, with H inverted via its adjugate.
		PxVec3 omega;
		PxReal factor = -0.25f / detH;
		omega.x = factor * ((h11 * h22 - h12 * h12) * gradient.x + (h02 * h12 - h01 * h22) * gradient.y + (h01 * h12 - h02 * h11) * gradient.z);
		omega.y = factor * ((h02 * h12 - h01 * h22) * gradient.x + (h00 * h22 - h02 * h02) * gradient.y + (h01 * h02 - h00 * h12) * gradient.z);
		omega.z = factor * ((h01 * h12 - h02 * h11) * gradient.x + (h01 * h02 - h00 * h12) * gradient.y + (h00 * h11 - h01 * h01) * gradient.z);
		// Fall back to plain gradient descent when the Hessian is (nearly) singular.
		if (PxAbs(detH) < 1e-9f)
			omega = -gradient;
		// Keep omega a descent direction; otherwise take a damped gradient step.
		if (omega.dot(gradient) > 0.0f)
			omega = gradient * -0.125f;
		// Map the axis-angle step to a unit quaternion (Cayley-style rational map).
		PxReal l_omega2 = omega.magnitudeSquared();
		PxReal w = (1.0f - l_omega2) / (1.0f + l_omega2);
		PxVec3 vec = omega * (2.0f / (1.0f + l_omega2));
		q = q * PxQuat(vec.x, vec.y, vec.z, w);
		if (w > threshold)
			break;
	}
}
|
||||
|
||||
// Removes from v its component along planeNormal, i.e. projects v onto the
// plane through the origin with the given (not necessarily unit-length)
// normal. Note: no guard against a zero-length normal - the caller must not
// pass one.
PX_FORCE_INLINE __device__ PxVec3 projectVectorOntoPlane(PxVec3 v, PxVec3 planeNormal)
{
	const PxReal normalComponent = planeNormal.dot(v) / planeNormal.magnitudeSquared();
	return v - planeNormal * normalComponent;
}
|
||||
|
||||
// Converts Young's modulus and Poisson's ratio into the Lame parameters,
// returned as (first = lambda, second = mu). Poisson must stay below 0.5,
// otherwise the lambda denominator hits zero.
PX_FORCE_INLINE __device__ PxPair<PxReal, PxReal> lameParameters(PxReal Young, PxReal Poisson)
{
	const PxReal onePlusPoisson = 1.0f + Poisson;
	const PxReal mu = Young / (2.0f * onePlusPoisson);
	const PxReal lambda = Young * Poisson / (onePlusPoisson * (1.0f - 2.0f * Poisson));

	return PxPair<PxReal, PxReal>(lambda, mu);
}
|
||||
|
||||
// Prepares one FEM-vs-rigid contact constraint and writes it into the
// warp-slot of the shared constraint block:
// - builds a friction basis (t0, t1) around the contact normal,
// - computes the rigid body's angular response terms (raXn and friction
//   variants, with unit responses in the w components) for either an
//   articulation link or a rigid/static/kinematic body,
// - stores normal+error, responses, friction tangent+invMass, the combined
//   max penetration bias, and the contact barycentric coordinates.
// 'pen' is the penetration depth, 'delta' an additional positional offset
// folded into the stored error term, 'p' the contact point in world space.
// NOTE(review): invDt and isTGS are unused in this body.
PX_FORCE_INLINE __device__ void prepareFEMContacts(PxgFemRigidConstraintBlock& constraint, const PxVec3& normal,
	PxgSolverSharedDescBase* sharedDesc, const PxVec3& p, PxReal pen, const PxVec3& delta,
	const PxNodeIndex& rigidId, const float4& barycentric,
	PxgConstraintPrepareDesc* prepareDesc, PxU32* solverBodyIndices, PxReal penBiasClampFEM,
	PxReal invDt, bool isTGS)
{
	// Constraint block arrays are sized 32: one slot per lane of the warp.
	const PxU32 threadIndexInWarp = threadIdx.x & 31;

	PxAlignedTransform* bodyFrames = prepareDesc->body2WorldPool;

	PxgBodySim* bodySims = sharedDesc->mBodySimBufferDeviceData;

	PxgSolverBodyData* solverBodyData = prepareDesc->solverBodyDataPool;
	PxgSolverTxIData* solverDataTxIPool = prepareDesc->solverBodyTxIDataPool;

	// Select two tangent vectors to the normal.
	// Note that the friction behavior may vary depending on the chosen tangent vectors.

	PxVec3 t0, t1;
	PxComputeBasisVectors(normal, t0, t1);

	PxReal penBiasClampRigid;
	float4 raXn_resp;     // Angular response axis (xyz) + unit response (w) for the normal direction.
	float4 raXnF0_resp;   // Same for friction tangent t0.
	float4 raXnF1_resp;   // Same for friction tangent t1.
	PxReal invMass0;

	if(rigidId.isArticulation())
	{
		// Articulation link: responses come from the link's spatial response matrix.
		PxU32 nodeIndexA = rigidId.index();
		PxU32 artiId = bodySims[nodeIndexA].articulationRemapId;

		PxgArticulation& articulation = sharedDesc->articulations[artiId];

		const PxU32 linkID = rigidId.articulationLinkId();
		const PxTransform body2World = articulation.linkBody2Worlds[linkID];
		penBiasClampRigid = articulation.links[linkID].initialAngVelXYZ_penBiasClamp.w;

		const PxVec3 bodyFrame0p(body2World.p.x, body2World.p.y, body2World.p.z);

		// Contact arm and its cross products with normal/tangents.
		PxVec3 ra = p - bodyFrame0p;
		PxVec3 raXn = ra.cross(normal);
		PxVec3 raXF0 = ra.cross(t0);
		PxVec3 raXF1 = ra.cross(t1);

		PxSpatialMatrix& spatialResponse = articulation.spatialResponseMatrixW[linkID];

		// Unit response along the normal: velocity change produced by a unit impulse.
		const Cm::UnAlignedSpatialVector deltaV0 = spatialResponse * Cm::UnAlignedSpatialVector(normal, raXn);
		const PxReal resp0 = deltaV0.top.dot(raXn) + deltaV0.bottom.dot(normal);

		// Unit responses along the two friction tangents.
		const Cm::UnAlignedSpatialVector deltaFV0 = spatialResponse * Cm::UnAlignedSpatialVector(t0, raXF0);
		const Cm::UnAlignedSpatialVector deltaFV1 = spatialResponse * Cm::UnAlignedSpatialVector(t1, raXF1);

		const PxReal respF0 = deltaFV0.top.dot(raXF0) + deltaFV0.bottom.dot(t0);
		const PxReal respF1 = deltaFV1.top.dot(raXF1) + deltaFV1.bottom.dot(t1);

		raXn_resp = make_float4(raXn.x, raXn.y, raXn.z, resp0);
		raXnF0_resp = make_float4(raXF0.x, raXF0.y, raXF0.z, respF0);
		raXnF1_resp = make_float4(raXF1.x, raXF1.y, raXF1.z, respF1);

		// Articulations don't use invMass0. We set it to 1 so we get the linear impulse rather than velocity change.
		invMass0 = 1.f;
	}
	else
	{
		// Rigid body path. Static bodies use solver body slot 0.
		PxU32 idx = 0;
		if(!rigidId.isStaticBody())
		{
			idx = solverBodyIndices[rigidId.index()];
		}

		PxMat33 invSqrtInertia0 = solverDataTxIPool[idx].sqrtInvInertia;
		const float4 linVel_invMass0 = solverBodyData[idx].initialLinVelXYZ_invMassW;
		penBiasClampRigid = solverBodyData[idx].initialAngVelXYZ_penBiasClamp.w;
		invMass0 = linVel_invMass0.w;

		// both static and kinematic object have invMass = 0.f
		const bool isKinematic = (invMass0 == 0.f) && (!rigidId.isStaticBody());

		PxAlignedTransform bodyFrame0 = bodyFrames[idx];
		const PxVec3 bodyFrame0p(bodyFrame0.p.x, bodyFrame0.p.y, bodyFrame0.p.z);

		// Contact arm and its cross products with normal/tangents.
		PxVec3 ra = p - bodyFrame0p;
		PxVec3 raXn = ra.cross(normal);
		PxVec3 raXF0 = ra.cross(t0);
		PxVec3 raXF1 = ra.cross(t1);

		// Unit responses: |I^-1/2 * (ra x axis)|^2 + invMass.
		const PxVec3 raXnSqrtInertia = invSqrtInertia0 * raXn;
		const float resp0 = (raXnSqrtInertia.dot(raXnSqrtInertia)) + invMass0;

		const PxVec3 raXF0SqrtInertia = invSqrtInertia0 * raXF0;
		const PxVec3 raXF1SqrtInertia = invSqrtInertia0 * raXF1;

		const float respF0 = (raXF0SqrtInertia.dot(raXF0SqrtInertia)) + invMass0;
		const float respF1 = (raXF1SqrtInertia.dot(raXF1SqrtInertia)) + invMass0;

		if(isKinematic)
		{
			// Kinematic bodies store the raw cross products (velocities are
			// prescribed, no inertia scaling applied).
			raXn_resp = make_float4(raXn.x, raXn.y, raXn.z, resp0);
			raXnF0_resp = make_float4(raXF0.x, raXF0.y, raXF0.z, respF0);
			raXnF1_resp = make_float4(raXF1.x, raXF1.y, raXF1.z, respF1);
		}
		else
		{
			// Dynamic bodies store the inertia-scaled axes.
			raXn_resp = make_float4(raXnSqrtInertia.x, raXnSqrtInertia.y, raXnSqrtInertia.z, resp0);
			raXnF0_resp = make_float4(raXF0SqrtInertia.x, raXF0SqrtInertia.y, raXF0SqrtInertia.z, respF0);
			raXnF1_resp = make_float4(raXF1SqrtInertia.x, raXF1SqrtInertia.y, raXF1SqrtInertia.z, respF1);
		}
	}

	// Combined (least negative wins via fmaxf) penetration bias clamp.
	PxReal maxPenBias = fmaxf(penBiasClampRigid, penBiasClampFEM);
	PxReal error = pen + delta.dot(normal);

	// KS - TODO - split these into 5 separate vectors to promote coalesced memory accesses!
	constraint.normal_errorW[threadIndexInWarp] = make_float4(normal.x, normal.y, normal.z, error);
	constraint.raXn_resp[threadIndexInWarp] = raXn_resp;
	constraint.raXnF0_resp[threadIndexInWarp] = raXnF0_resp;
	constraint.raXnF1_resp[threadIndexInWarp] = raXnF1_resp;
	constraint.fricTan0_invMass0[threadIndexInWarp] = make_float4(t0.x, t0.y, t0.z, invMass0);
	constraint.maxPenBias[threadIndexInWarp] = maxPenBias;
	constraint.barycentric[threadIndexInWarp] = barycentric;
}
|
||||
|
||||
// Vec: PxVec3 for triangles and PxVec4 for tetrahedra.
|
||||
template <typename Vec>
|
||||
struct FEMCollision
|
||||
{
|
||||
bool isTGS = true;
|
||||
|
||||
// Rigid body
|
||||
PxU32 rigidBodyReferenceCount = 1;
|
||||
PxReal rigidBodyFriction = 0.0f;
|
||||
PxI32 frictionCombineMode;
|
||||
|
||||
// Deformable body
|
||||
PxReal deformableFriction = 0.0f;
|
||||
PxVec3 deformableLinDelta;
|
||||
Vec deformableVertexInvMasses; // After mass-splitting.
|
||||
|
||||
// Constraint
|
||||
PxVec3 normal = PxVec3(0.0f);
|
||||
PxVec3 tangent = PxVec3(0.0f);
|
||||
PxVec3 raXn = PxVec3(0.0f);
|
||||
PxVec3 raXt = PxVec3(0.0f);
|
||||
PxReal deltaLambdaN = 0.0f;
|
||||
PxReal deltaLambdaT = 0.0f;
|
||||
PxReal accumulatedDeltaLambdaN = 0.0f;
|
||||
|
||||
	// Initializes this contact's solve state from the prepared constraint data
	// and computes the normal and friction impulse magnitudes (deltaLambdaN /
	// deltaLambdaT) for this iteration.
	// - fricTan0_invMass0: first friction tangent (xyz) and rigid invMass (w),
	//   as written by prepareFEMContacts.
	// - appliedForceRef: previously accumulated normal impulse, used to seed
	//   accumulatedDeltaLambdaN and to clamp the total impulse to stay >= 0.
	// - wasActive: once a constraint has been active it stays active here.
	// - checkOnlyActivity: when true, only the activity test is performed and
	//   no impulses are computed.
	// Returns true when the constraint is active.
	// deformableLinDelta / deformableVertexInvMasses must have been filled in
	// (e.g. by readCloth) before calling this.
	PX_FORCE_INLINE __device__ bool initialize(const float4& fricTan0_invMass0, PxgFemRigidConstraintBlock& constraint,
		PxReal appliedForceRef, PxNodeIndex rigidId, PxgVelocityReader& velocityReader, PxReal dt,
		const Vec& bc, bool wasActive, bool checkOnlyActivity)
	{
		// PxgFemRigidConstraintBlock is packed with arrays with size 32.
		assert(blockDim.x % 32 == 0 && blockDim.y == 1 && blockDim.z == 1);

		// PBD way of applying constraints
		const PxU32 threadIndexInWarp = threadIdx.x & 31;

		accumulatedDeltaLambdaN = appliedForceRef;

		const PxReal threshold = 1.0e-14f;
		const PxReal invDt = 1.0f / dt;

		// Per-contact data prepared by prepareFEMContacts (one warp slot each).
		const float4 raXn_resp = constraint.raXn_resp[threadIndexInWarp];
		const float4 normal_biasW = constraint.normal_errorW[threadIndexInWarp];
		const float4 raXnF0_resp = constraint.raXnF0_resp[threadIndexInWarp];
		const float4 raXnF1_resp = constraint.raXnF1_resp[threadIndexInWarp];
		const PxReal maxPenBias = constraint.maxPenBias[threadIndexInWarp];

		normal = PxVec3(normal_biasW.x, normal_biasW.y, normal_biasW.z);
		// Friction basis: fric0 comes from the prep stage; fric1 completes the frame.
		const PxVec3 fric0 = PxVec3(fricTan0_invMass0.x, fricTan0_invMass0.y, fricTan0_invMass0.z);
		const PxVec3 fric1 = normal.cross(fric0);

		const float initPen = normal_biasW.w;

		raXn = PxVec3(raXn_resp.x, raXn_resp.y, raXn_resp.z);

		PxReal CN;				// Normal constraint violation: biased error + relative normal velocity.
		PxReal normalVel;
		PxVec3 relLinDelta;		// Rigid-minus-deformable accumulated linear delta.
		PxVec3 angDelta(0.0f);	// Rigid angular delta (stays zero on the PGS path).
		PxVec3 linVel;
		PxVec3 angVel;

		if(isTGS)
		{
			// TGS path: positional deltas accumulated within the step contribute
			// to the bias term.
			PxgVelocityPackTGS rigidStateVec;
			velocityReader.readVelocitiesTGS(rigidId, rigidStateVec);

			linVel = rigidStateVec.linVel;
			angVel = rigidStateVec.angVel;

			normalVel = linVel.dot(normal) + angVel.dot(raXn);
			relLinDelta = rigidStateVec.linDelta - deformableLinDelta;
			angDelta = rigidStateVec.angDelta;

			const PxReal error = (initPen + relLinDelta.dot(normal) + angDelta.dot(raXn)) * invDt;

			// maxPenBias is negative.
			const PxReal errorBiased = PxMax(maxPenBias, error);

			CN = errorBiased + normalVel;
		}
		else
		{
			// PGS path: only the deformable's accumulated delta enters the bias.
			PxgVelocityPackPGS rigidStateVec;
			velocityReader.readVelocitiesPGS(rigidId, rigidStateVec);

			linVel = rigidStateVec.linVel;
			angVel = rigidStateVec.angVel;

			normalVel = linVel.dot(normal) + angVel.dot(raXn);
			relLinDelta = -deformableLinDelta;

			const PxReal error = (initPen + relLinDelta.dot(normal)) * invDt;

			// maxPenBias is negative.
			const PxReal errorBiased = PxMax(maxPenBias, error);

			CN = errorBiased + normalVel;
		}

		// Once triggered, the constraint stays active for the remaining passes.
		const bool isActive = wasActive || CN < 0.0f;
		deltaLambdaN = 0.0f;

		if(checkOnlyActivity)
		{
			return isActive;
		}

		if(!isActive)
		{
			return false;
		}

		// Deformable body term in the denominator of the impulse calculation. Also, refer to delta lambda in the XPBD paper.
		const PxReal deformableInvMass_massSplitting = bc.multiply(bc).dot(deformableVertexInvMasses);

		const PxReal rigidRefCount = static_cast<PxReal>(rigidBodyReferenceCount);
		const PxReal unitResponse = rigidRefCount * raXn_resp.w + deformableInvMass_massSplitting;
		const PxReal invDenom = (unitResponse > 0.0f) ? (1.0f / unitResponse) : 0.0f;

		// Clamp so the accumulated normal impulse never goes negative.
		deltaLambdaN = PxMax(-CN * invDenom, -accumulatedDeltaLambdaN);
		accumulatedDeltaLambdaN += deltaLambdaN;

		// Friction constraint in the tangent direction.
		const PxVec3 raXnF0 = PxVec3(raXnF0_resp.x, raXnF0_resp.y, raXnF0_resp.z);
		const PxVec3 raXnF1 = PxVec3(raXnF1_resp.x, raXnF1_resp.y, raXnF1_resp.z);

		const float tanVel0 = linVel.dot(fric0) + angVel.dot(raXnF0);
		const float tanVel1 = linVel.dot(fric1) + angVel.dot(raXnF1);

		// Tangential violations along both basis directions, combined into a
		// single slip direction.
		const PxReal CT0 = (fric0.dot(relLinDelta) + angDelta.dot(raXnF0)) * invDt + tanVel0;
		const PxReal CT1 = (fric1.dot(relLinDelta) + angDelta.dot(raXnF1)) * invDt + tanVel1;
		const PxVec3 relTanDelta = CT0 * fric0 + CT1 * fric1;
		const PxReal tanMagSq = relTanDelta.magnitudeSquared();

		if(tanMagSq > threshold)
		{
			const PxReal CT = PxSqrt(tanMagSq);
			const PxReal invTanMag = 1.0f / CT;
			tangent = relTanDelta * invTanMag;

			// Project the slip direction back onto the precomputed basis to
			// build the matching angular term.
			const PxReal frac0 = tangent.dot(fric0);
			const PxReal frac1 = tangent.dot(fric1);
			raXt = frac0 * raXnF0 + frac1 * raXnF1;

			// Using two precomputed orthonormal tangent directions.
			const PxReal unitResponseT0 = rigidRefCount * raXnF0_resp.w + deformableInvMass_massSplitting;
			const PxReal invTanDenom0 = (unitResponseT0 > 0.0f) ? (1.0f / unitResponseT0) : 0.0f;

			const PxReal unitResponseT1 = rigidRefCount * raXnF1_resp.w + deformableInvMass_massSplitting;
			const PxReal invTanDenom1 = (unitResponseT1 > 0.0f) ? (1.0f / unitResponseT1) : 0.0f;

			PxReal deltaLambdaT0 = CT0 * invTanDenom0;
			PxReal deltaLambdaT1 = CT1 * invTanDenom1;

			// Coulomb cone: friction impulse magnitude is capped by mu * |normal impulse|.
			deltaLambdaT = PxSqrt(deltaLambdaT0 * deltaLambdaT0 + deltaLambdaT1 * deltaLambdaT1);
			deltaLambdaT = -PxMin(deltaLambdaT, getCombinedFriction() * PxAbs(deltaLambdaN));

			assert(deltaLambdaT <= 0.0f);
		}

		return true;
	}
|
||||
|
||||
PX_FORCE_INLINE __device__ PxReal computeRigidChange(PxVec3& deltaLinVel0, PxVec3& deltaAngVel0, const PxNodeIndex& rigidId,
|
||||
PxReal rigidInvMass)
|
||||
{
|
||||
const PxReal rigidRefCount = static_cast<PxReal>(rigidBodyReferenceCount);
|
||||
|
||||
deltaAngVel0 = rigidId.isArticulation() ? raXn * deltaLambdaN + raXt * deltaLambdaT
|
||||
: (raXn * deltaLambdaN + raXt * deltaLambdaT) * rigidRefCount;
|
||||
|
||||
deltaLinVel0 = (normal * deltaLambdaN + tangent * deltaLambdaT) * rigidInvMass * rigidRefCount;
|
||||
|
||||
return accumulatedDeltaLambdaN;
|
||||
}
|
||||
|
||||
PX_FORCE_INLINE __device__ PxReal computeFEMChange(PxVec3& deltaPos, PxReal dt)
|
||||
{
|
||||
deltaPos = -(deltaLambdaN * normal + deltaLambdaT * tangent) * dt;
|
||||
return accumulatedDeltaLambdaN;
|
||||
}
|
||||
|
||||
	// Combines the rigid-body and deformable dynamic friction coefficients
	// using the rigid material's friction combine mode.
	PX_FORCE_INLINE __device__ PxReal getCombinedFriction()
	{
		return PxsCombinePxReal(rigidBodyFriction, deformableFriction, frictionCombineMode);
	}
|
||||
|
||||
	// Maps a node index to a global slot in the reference-count table:
	// -1 for static bodies, the solver body index for rigid bodies, and
	// numSolverBodies + solver body index for articulations.
	PX_FORCE_INLINE __device__ int getGlobalRigidBodyId(const PxgPrePrepDesc* const prePrepDesc, const PxNodeIndex& rigidId,
		PxU32 numSolverBodies)
	{
		// Following PxgVelocityReader style to read rigid body indices.
		if(rigidId.isStaticBody())
		{
			return -1;
		}

		const PxU32 solverBodyIdx = prePrepDesc->solverBodyIndices[rigidId.index()];

		// Placing articulation indices at the end of rigid body indices to distinguish between rigid body reference counts and
		// articulation reference counts.
		return rigidId.isArticulation() ? static_cast<int>(numSolverBodies + solverBodyIdx) : static_cast<int>(solverBodyIdx);
	}
|
||||
|
||||
PX_FORCE_INLINE __device__ void readRigidBody(const PxNodeIndex& rigidId, int globalRigidBodyId, PxReal rigidInvMass,
|
||||
const PxU32* const rigidBodyReferenceCounts, const PxsMaterialData* rigidMaterial)
|
||||
{
|
||||
rigidBodyReferenceCount = 1;
|
||||
|
||||
// Query the reference count for the rigid body.
|
||||
if(rigidBodyReferenceCounts && globalRigidBodyId != -1 && rigidInvMass != 0.0f)
|
||||
{
|
||||
rigidBodyReferenceCount = rigidBodyReferenceCounts[globalRigidBodyId];
|
||||
}
|
||||
|
||||
if(rigidMaterial != NULL)
|
||||
{
|
||||
rigidBodyFriction = rigidMaterial->dynamicFriction;
|
||||
frictionCombineMode = rigidMaterial->fricCombineMode;
|
||||
}
|
||||
else
|
||||
{
|
||||
rigidBodyFriction = 0.0f;
|
||||
frictionCombineMode = PxCombineMode::eMAX;
|
||||
}
|
||||
}
|
||||
|
||||
PX_FORCE_INLINE __device__ void writeRigidBody(float4* rigidDeltaVel, const PxVec3& deltaLinVel0, const PxVec3& deltaAngVel0,
|
||||
PxU32 workIndex0, PxU32 workIndex1, PxReal count)
|
||||
{
|
||||
rigidDeltaVel[workIndex0] = make_float4(deltaLinVel0.x, deltaLinVel0.y, deltaLinVel0.z, count);
|
||||
rigidDeltaVel[workIndex1] = make_float4(deltaAngVel0.x, deltaAngVel0.y, deltaAngVel0.z, 0.f);
|
||||
}
|
||||
|
||||
	// Loads the cloth side of the contact for the given element: the
	// accumulated position delta (deformableLinDelta) and the per-vertex
	// inverse masses (deformableVertexInvMasses, returned). bc.w selects the
	// element type: 0 means a triangle (barycentric contact), non-zero means a
	// single vertex. Unless countReferenceOnly is set, also reads the friction
	// coefficient and applies mass splitting by scaling the inverse masses
	// with the per-vertex reference counts stored in mDeltaPos[..].w.
	// The sentinel elementId PX_MAX_NB_DEFORMABLE_SURFACE_TRI yields all-zero
	// inverse masses.
	PX_FORCE_INLINE __device__ PxVec3 readCloth(const PxgFEMCloth& cloth, PxU32 elementId, const float4& bc,
		const PxsDeformableSurfaceMaterialData* const materials, bool countReferenceOnly)
	{
		// Note: PX_MAX_NB_DEFORMABLE_SURFACE_TRI == PX_MAX_NB_DEFORMABLE_SURFACE_VTX
		if(elementId == PX_MAX_NB_DEFORMABLE_SURFACE_TRI)
		{
			deformableVertexInvMasses = PxVec3(0.0f);
			return deformableVertexInvMasses;
		}

		const float4* const PX_RESTRICT clothPosDeltas = cloth.mAccumulatedDeltaPos;
		float4 clothDelta;

		if(bc.w == 0) // Cloth triangle
		{
			const uint4 triVertId = cloth.mTriangleVertexIndices[elementId];
			const float4 pd0 = clothPosDeltas[triVertId.x];
			const float4 pd1 = clothPosDeltas[triVertId.y];
			const float4 pd2 = clothPosDeltas[triVertId.z];

			// Barycentric interpolation of the accumulated deltas; the w
			// components carry the vertex inverse masses.
			clothDelta = pd0 * bc.x + pd1 * bc.y + pd2 * bc.z;
			deformableVertexInvMasses = PxVec3(pd0.w, pd1.w, pd2.w);

			if(!countReferenceOnly)
			{
				const PxU16 globalMaterialIndex = cloth.mMaterialIndices[elementId];
				deformableFriction = materials ? materials[globalMaterialIndex].dynamicFriction : 0.0f;

				// Query the reference count for the cloth.
				PxVec3 deformableVertexReferenceCount;
				deformableVertexReferenceCount.x = cloth.mDeltaPos[triVertId.x].w;
				deformableVertexReferenceCount.y = cloth.mDeltaPos[triVertId.y].w;
				deformableVertexReferenceCount.z = cloth.mDeltaPos[triVertId.z].w;

				// Mass-splitting
				deformableVertexInvMasses = deformableVertexInvMasses.multiply(deformableVertexReferenceCount);
			}
		}
		else // Cloth vertex
		{
			clothDelta = clothPosDeltas[elementId];
			// Only the x component is used on the vertex path.
			deformableVertexInvMasses = PxVec3(clothDelta.w, 0.0f, 0.0f);

			if(!countReferenceOnly)
			{
				deformableFriction = materials ? cloth.mDynamicFrictions[elementId] : 0.0f;

				// Query the reference count for the cloth.
				const PxReal deformableVertexReferenceCount = cloth.mDeltaPos[elementId].w;

				// Mass-splitting
				deformableVertexInvMasses.x *= deformableVertexReferenceCount;
			}
		}

		deformableLinDelta = PxVec3(clothDelta.x, clothDelta.y, clothDelta.z);

		return deformableVertexInvMasses;
	}
|
||||
|
||||
// Scatters a position correction to a cloth element. For a triangle
// (bc.w == 0) each of its three vertices receives deltaPos scaled by its
// (mass-split) inverse mass times its barycentric weight; for a vertex
// element the full inverse-mass-scaled delta goes to that single vertex.
// Accumulation into cloth.mDeltaPos is atomic because several constraints
// may write the same vertex concurrently. Reads deformableVertexInvMasses
// (member state set up by readCloth).
PX_FORCE_INLINE __device__ void writeCloth(PxgFEMCloth& cloth, PxU32 elementId, const float4& bc, PxVec3 deltaPos)
{
	if(bc.w == 0.f) // Cloth triangle
	{
		// Treat bc.xyz and the uint4 index fields as small arrays so the
		// per-vertex loop below can index them uniformly.
		const float* bcPtr = reinterpret_cast<const float*>(&bc.x);
		const uint4 triVertInds = cloth.mTriangleVertexIndices[elementId];
		const PxU32* triVertices = reinterpret_cast<const PxU32*>(&triVertInds.x);

#pragma unroll
		for(PxU32 it = 0; it < 3; ++it)
		{
			// Skip fixed vertices (zero inverse mass).
			if(deformableVertexInvMasses[it] > 0.0f)
			{
				const PxVec3 dP = deltaPos * (deformableVertexInvMasses[it] * bcPtr[it]);
				atomicAdd(&cloth.mDeltaPos[triVertices[it]].x, dP.x);
				atomicAdd(&cloth.mDeltaPos[triVertices[it]].y, dP.y);
				atomicAdd(&cloth.mDeltaPos[triVertices[it]].z, dP.z);
			}
		}
	}
	else // Cloth vertex
	{
		if(deformableVertexInvMasses.x > 0.0f)
		{
			const PxVec3 dP = deltaPos * deformableVertexInvMasses.x;
			atomicAdd(&cloth.mDeltaPos[elementId].x, dP.x);
			atomicAdd(&cloth.mDeltaPos[elementId].y, dP.y);
			atomicAdd(&cloth.mDeltaPos[elementId].z, dP.z);
		}
	}
}
|
||||
|
||||
// Soft-body counterpart of readCloth: gathers the barycentrically interpolated
// accumulated position delta and the four vertex inverse masses for a
// simulation tetrahedron, storing results in member state of the enclosing
// helper struct (deformableLinDelta, deformableVertexInvMasses,
// deformableFriction — struct header outside this excerpt; note the
// inverse-mass member is a PxVec4 here, one lane per tet vertex).
// tetId == PX_MAX_NB_DEFORMABLE_VOLUME_TET marks an inactive element.
// When checkOnlyActivity is true, friction lookup and reference-count
// scaling are skipped. Returns the (possibly scaled) inverse masses.
PX_FORCE_INLINE __device__ PxVec4 readSoftBody(const PxgSoftBody& softbody, PxU32 tetId, const float4& bc,
	const PxsDeformableVolumeMaterialData* const materials, bool checkOnlyActivity)
{
	if(tetId == PX_MAX_NB_DEFORMABLE_VOLUME_TET)
	{
		// Inactive element: zero inverse masses mean no contribution downstream.
		deformableVertexInvMasses = PxVec4(0.0f);
		return deformableVertexInvMasses;
	}

	const float4* const PX_RESTRICT posDeltas = softbody.mSimDeltaPos;

	const uint4 tetrahedronId = softbody.mSimTetIndices[tetId];
	// The w component of each accumulated delta carries the vertex inverse mass.
	const float4 pd0 = posDeltas[tetrahedronId.x];
	const float4 pd1 = posDeltas[tetrahedronId.y];
	const float4 pd2 = posDeltas[tetrahedronId.z];
	const float4 pd3 = posDeltas[tetrahedronId.w];

	// Barycentric interpolation over the four tet vertices.
	const float4 softBodyDelta = pd0 * bc.x + pd1 * bc.y + pd2 * bc.z + pd3 * bc.w;
	deformableLinDelta = PxVec3(softBodyDelta.x, softBodyDelta.y, softBodyDelta.z);
	deformableVertexInvMasses = PxVec4(pd0.w, pd1.w, pd2.w, pd3.w);

	if(!checkOnlyActivity)
	{
		// Per-tet material lookup; materials may be null (friction 0).
		const PxU16 globalMaterialIndex = softbody.mMaterialIndices[tetId];
		deformableFriction = materials ? materials[globalMaterialIndex].dynamicFriction : 0.0f;

		// Query the reference count for soft body.
		// mSimDelta[...].w holds how many constraints reference each vertex.
		PxVec4 deformableVertexReferenceCount;
		deformableVertexReferenceCount.x = softbody.mSimDelta[tetrahedronId.x].w;
		deformableVertexReferenceCount.y = softbody.mSimDelta[tetrahedronId.y].w;
		deformableVertexReferenceCount.z = softbody.mSimDelta[tetrahedronId.z].w;
		deformableVertexReferenceCount.w = softbody.mSimDelta[tetrahedronId.w].w;

		// Mass-splitting
		deformableVertexInvMasses = deformableVertexInvMasses.multiply(deformableVertexReferenceCount);
	}

	return deformableVertexInvMasses;
}
|
||||
|
||||
// Scatters a position correction to the four vertices of a soft-body
// simulation tetrahedron. Each vertex receives deltaPos scaled by its
// (mass-split) inverse mass times its barycentric weight; vertices with zero
// inverse mass (fixed) are skipped. Accumulation into mSimDelta is atomic
// because several constraints may write the same vertex concurrently.
// Reads deformableVertexInvMasses (member state set up by readSoftBody).
PX_FORCE_INLINE __device__ void writeSoftBody(const PxgSoftBody& softbody, PxU32 tetId, const float4& bc, PxVec3 deltaPos)
{
	// Treat bc and the uint4 index fields as small arrays for uniform indexing.
	const float* bcPtr = reinterpret_cast<const float*>(&bc.x);
	const uint4 tetrahedronId = softbody.mSimTetIndices[tetId];
	const PxU32* tetVertices = reinterpret_cast<const PxU32*>(&tetrahedronId.x);

#pragma unroll
	for(PxU32 it = 0; it < 4; ++it)
	{
		if(deformableVertexInvMasses[it] > 0.0f)
		{
			const PxVec3 dP = deltaPos * (deformableVertexInvMasses[it] * bcPtr[it]);
			atomicAdd(&softbody.mSimDelta[tetVertices[it]].x, dP.x);
			atomicAdd(&softbody.mSimDelta[tetVertices[it]].y, dP.y);
			atomicAdd(&softbody.mSimDelta[tetVertices[it]].z, dP.z);
		}
	}
}
|
||||
};
|
||||
|
||||
#endif // __DEFORMABLE_UTILS_CUH__
|
||||
131
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/denseGridStandalone.cuh
vendored
Normal file
131
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/denseGridStandalone.cuh
vendored
Normal file
@@ -0,0 +1,131 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#include "vector_types.h"
|
||||
#include "foundation/PxSimpleTypes.h"
|
||||
#include "PxgDenseGridData.h"
|
||||
|
||||
//Should have the same value as the define of the same name in sparseGrid.cuh
|
||||
#define EMPTY_SUBGRID 0xffffffff
|
||||
|
||||
using namespace physx;
|
||||
|
||||
// Flattens (xi, yi, zi) into a linear cell id using x-fastest ordering.
PX_FORCE_INLINE __device__ __host__ int getCellNr(int numCellsX, int numCellsY, int xi, int yi, int zi)
{
	const int sliceNr = zi * numCellsY + yi;
	return sliceNr * numCellsX + xi;
}
|
||||
// Overload taking the grid dimensions packed in an int3.
// Only the x and y extents participate in the linearization.
PX_FORCE_INLINE __device__ __host__ int getCellNr(const int3& gridSize, int xi, int yi, int zi)
{
	const int nx = gridSize.x;
	const int ny = gridSize.y;
	return getCellNr(nx, ny, xi, yi, zi);
}
|
||||
|
||||
// Inverse of getCellNr: recovers the (x, y, z) cell coordinates from a linear
// cell id. The w component is unused and set to -1.
PX_FORCE_INLINE __device__ __host__ int4 getCellCoords(int numCellsX, int numCellsY, int cellNr)
{
	const int x = cellNr % numCellsX;
	const int rest = cellNr / numCellsX;
	const int y = rest % numCellsY;
	const int z = rest / numCellsY;

	int4 result;
	result.x = x;
	result.y = y;
	result.z = z;
	result.w = -1;
	return result;
}
|
||||
// Overload taking the grid dimensions packed in an int3.
// Only the x and y extents are needed to invert the linearization.
PX_FORCE_INLINE __device__ __host__ int4 getCellCoords(const int3& gridSize, int cellNr)
{
	const int nx = gridSize.x;
	const int ny = gridSize.y;
	return getCellCoords(nx, ny, cellNr);
}
|
||||
|
||||
//Functions for the PxDenseGridData class - make sure they have the same name and arguments as their counterparts of the sparse grid to simplify templating
|
||||
// Maps a linear thread index to 3D grid coordinates (dense grid: one thread per cell).
PX_FORCE_INLINE __device__ int4 getGridCoordinates(const PxDenseGridData& data, int threadIndex)
{
	const int nx = data.mGridParams.numCellsX;
	const int ny = data.mGridParams.numCellsY;
	return getCellCoords(nx, ny, threadIndex);
}
|
||||
|
||||
// Linear cell index for a dense grid. applySubgridOrder exists only to mirror
// the sparse-grid API and is ignored here.
PX_FORCE_INLINE __device__ PxU32 getCellIndex(PxDenseGridData& data, const int4& index, bool applySubgridOrder = true)
{
	const int nx = data.mGridParams.numCellsX;
	const int ny = data.mGridParams.numCellsY;
	return getCellNr(nx, ny, index.x, index.y, index.z);
}
|
||||
|
||||
// Linear cell index of the neighbor at (offsetX, offsetY, offsetZ) relative to
// index. No bounds checking — see getCellIndexSafe for the checked variant.
// applySubgridOrder exists only to mirror the sparse-grid API and is ignored.
PX_FORCE_INLINE __device__ PxU32 getCellIndex(PxgDenseGridData& data, const int4& index, PxI32 offsetX, PxI32 offsetY, PxI32 offsetZ, bool applySubgridOrder = true)
{
	const PxI32 x = index.x + offsetX;
	const PxI32 y = index.y + offsetY;
	const PxI32 z = index.z + offsetZ;
	return getCellNr(data.mGridParams.numCellsX, data.mGridParams.numCellsY, x, y, z);
}
|
||||
|
||||
// Bounds-checked variant of getCellIndex: returns EMPTY_SUBGRID when the
// offset cell lies outside the grid. applySubgridOrder mirrors the
// sparse-grid API and is ignored here.
PX_FORCE_INLINE __device__ PxU32 getCellIndexSafe(PxDenseGridData& data, const int4& index, PxI32 offsetX, PxI32 offsetY, PxI32 offsetZ, bool applySubgridOrder = true)
{
	const PxI32 x = index.x + offsetX;
	const PxI32 y = index.y + offsetY;
	const PxI32 z = index.z + offsetZ;
	const bool inside = x >= 0 && y >= 0 && z >= 0 &&
		x < data.mGridParams.numCellsX && y < data.mGridParams.numCellsY && z < data.mGridParams.numCellsZ;
	if (!inside)
		return EMPTY_SUBGRID;
	return getCellNr(data.mGridParams.numCellsX, data.mGridParams.numCellsY, x, y, z);
}
|
||||
|
||||
// Reads the grid value at the cell offset by (offsetX, offsetY, offsetZ) from
// index. No bounds checking — see getGridValueSafe for the checked variant.
PX_FORCE_INLINE __device__ PxReal getGridValue(PxDenseGridData& data, const PxReal* dataSource, const int4& index, PxI32 offsetX, PxI32 offsetY, PxI32 offsetZ)
{
	const PxU32 cellIndex = getCellIndex(data, index, offsetX, offsetY, offsetZ);
	return dataSource[cellIndex];
}
|
||||
|
||||
//Assumes that 0.0 is a valid value for access outside of the grid
// Bounds-checked grid read: returns 0.0f when the offset cell lies outside
// the grid. Reuses getCellIndexSafe so the bounds test is implemented in
// exactly one place instead of being duplicated here.
PX_FORCE_INLINE __device__ PxReal getGridValueSafe(PxDenseGridData& data, const PxReal* dataSource, int4 index, PxI32 offsetX, PxI32 offsetY, PxI32 offsetZ)
{
	const PxU32 cellIndex = getCellIndexSafe(data, index, offsetX, offsetY, offsetZ);
	if (cellIndex == EMPTY_SUBGRID)
		return 0.0f;
	return dataSource[cellIndex];
}
|
||||
|
||||
// True when threadIndex does not correspond to any cell of the dense grid.
PX_FORCE_INLINE __device__ bool outOfRange(PxDenseGridData& data, const int threadIndex)
{
	const bool inRange = threadIndex < data.maxNumCells();
	return !inRange;
}
|
||||
|
||||
// On a dense grid every cell is always active, so this is identical to
// outOfRange; the separate name mirrors the sparse-grid API.
PX_FORCE_INLINE __device__ bool outOfActiveCells(PxDenseGridData& data, const int threadIndex)
{
	return outOfRange(data, threadIndex);
}
|
||||
|
||||
// True when index lies outside [0, numCells-1) in any dimension — note the
// upper bound excludes the last cell layer in each dimension.
PX_FORCE_INLINE __device__ bool outOfBounds(PxDenseGridData& data, const int4& index)
{
	const bool belowMin = index.x < 0 || index.y < 0 || index.z < 0;
	const bool aboveMax = index.x >= data.mGridParams.numCellsX - 1 ||
		index.y >= data.mGridParams.numCellsY - 1 ||
		index.z >= data.mGridParams.numCellsZ - 1;
	return belowMin || aboveMax;
}
|
||||
|
||||
// True for the thread handling the final cell of the (numCells-1)^3 interior
// range in each dimension.
PX_FORCE_INLINE __device__ bool isLastCell(PxDenseGridData& data, const int threadIndex)
{
	const int innerCellCount = (data.mGridParams.numCellsX - 1) * (data.mGridParams.numCellsY - 1) * (data.mGridParams.numCellsZ - 1);
	return threadIndex == innerCellCount - 1;
}
|
||||
|
||||
// World-space position of the grid node at the given cell coordinates.
PX_FORCE_INLINE __device__ PxVec3 getLocation(PxDenseGridData& data, const int4& index)
{
	const PxVec3 cellOffset = PxVec3(index.x, index.y, index.z) * data.mGridParams.gridSpacing;
	return data.mGridParams.origin + cellOffset;
}
|
||||
|
||||
// Computes the cell coordinates containing particle position p.
// Side effect: p is shifted into grid-local coordinates (origin subtracted).
// The returned w component is unused and set to -1.
PX_FORCE_INLINE __device__ int4 getCellIndexFromParticleAndTransformToLocalCoordinates(PxDenseGridData& data, PxVec3& p)
{
	p = p - data.mGridParams.origin;
	const PxReal invDx = 1.0f / data.mGridParams.gridSpacing;
	const PxI32 xi = (int)PxFloor(p.x * invDx);
	const PxI32 yi = (int)PxFloor(p.y * invDx);
	const PxI32 zi = (int)PxFloor(p.z * invDx);
	return make_int4(xi, yi, zi, -1);
}
|
||||
552
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/diffuseParticles.cu
vendored
Normal file
552
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/diffuseParticles.cu
vendored
Normal file
@@ -0,0 +1,552 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
|
||||
#include "vector_types.h"
|
||||
#include "foundation/PxVec3.h"
|
||||
#include "foundation/PxVec4.h"
|
||||
#include "foundation/PxBounds3.h"
|
||||
#include "PxgParticleSystemCore.h"
|
||||
#include "PxgParticleSystem.h"
|
||||
#include "PxgParticleSystemCoreKernelIndices.h"
|
||||
#include "PxgBodySim.h"
|
||||
#include "PxgCommonDefines.h"
|
||||
#include "reduction.cuh"
|
||||
#include "shuffle.cuh"
|
||||
#include "stdio.h"
|
||||
#include "PxgSolverBody.h"
|
||||
#include "PxgSolverCoreDesc.h"
|
||||
#include "PxParticleSystem.h"
|
||||
#include "assert.h"
|
||||
#include "copy.cuh"
|
||||
#include "PxgSimulationCoreDesc.h"
|
||||
#include "gridCal.cuh"
|
||||
#include "particleSystem.cuh"
|
||||
#include "atomic.cuh"
|
||||
#include "utils.cuh"
|
||||
|
||||
using namespace physx;
|
||||
|
||||
// Simple linear hat weight for diffuse-particle neighbor averaging:
// 1 at distance 0, falling linearly to 0 where h * invR == 1
// (invR is the reciprocal of the interaction radius).
__device__ inline PxReal WDiffuse(const PxReal h, const PxReal invR)
{
	const PxReal normalizedDist = h * invR;
	return 1.0f - normalizedDist;
}
|
||||
|
||||
extern "C" __host__ void initDiffuseParticlesKernels0() {}
|
||||
|
||||
// Copies each diffuse buffer's particles into the particle system's contiguous
// unsorted arrays (mDiffusePosition_LifeTime / mDiffuseVelocity), and records
// each buffer's start offset within those arrays.
// Launch layout: blockIdx.z = active particle system, blockIdx.y = diffuse
// buffer, (blockIdx.x, threadIdx.x) = particle within that buffer.
extern "C" __global__ void ps_updateUnsortedDiffuseArrayLaunch(
	const PxgParticleSystem * PX_RESTRICT particleSystems,
	const PxU32 * PX_RESTRICT activeParticleSystems)
{
	const PxU32 particleId = activeParticleSystems[blockIdx.z];

	const PxgParticleSystem& particleSystem = particleSystems[particleId];

	const PxU32 bufferIndex = blockIdx.y;

	if (bufferIndex < particleSystem.mNumDiffuseBuffers)
	{
		const PxU32 threadIndexInWarp = threadIdx.x & 31;

		float4* PX_RESTRICT unsortedPositions = reinterpret_cast<float4*>(particleSystem.mDiffusePosition_LifeTime);
		float4* PX_RESTRICT unsortedVels = reinterpret_cast<float4*>(particleSystem.mDiffuseVelocity);

		// Each warp cooperatively sums the particle counts of all buffers that
		// precede bufferIndex; lanes stride over the buffer list.
		PxU32 localSum = 0;

		for (PxU32 i = threadIndexInWarp; i < bufferIndex; i += WARP_SIZE)
		{
			localSum += particleSystem.mDiffuseSimBuffers[i].mNumDiffuseParticles[0];
		}

		// NOTE(review): assumed to combine localSum across the whole warp so every
		// lane holds this buffer's start offset — confirm against reduction.cuh.
		// Runs before the per-particle early return below, so the full warp participates.
		PxU32 bufferOffset = warpReduction<AddOpPxU32, PxU32>(FULL_MASK, localSum);

		PxgParticleDiffuseSimBuffer& buffer = particleSystem.mDiffuseSimBuffers[bufferIndex];

		int numDiffuseParticles = buffer.mNumDiffuseParticles[0];

		const float4* particles = buffer.mDiffusePositions_LifeTime;
		const float4* vels = buffer.mDiffuseVelocities;

		const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
		if (globalThreadIndex >= numDiffuseParticles)
			return;

		// Thread 0 publishes the start offset for later kernels (e.g. compaction).
		if (globalThreadIndex == 0)
		{
			buffer.mStartIndex = bufferOffset;
		}

		// Scatter this buffer's particle into its slot in the combined arrays.
		const PxU32 ind = bufferOffset + globalThreadIndex;
		unsortedPositions[ind] = particles[globalThreadIndex];
		unsortedVels[ind] = vels[globalThreadIndex];
	}
}
|
||||
|
||||
// Applies accumulated one-way contact corrections to each sorted diffuse
// particle: for every recorded contact the particle is pushed back along the
// contact normal by the penetration depth stored in mNormal_PenW.w.
// Launch layout: blockIdx.y = active particle system,
// (blockIdx.x, threadIdx.x) = diffuse particle. The count parameter is unused
// in this kernel body.
extern "C" __global__ void ps_diffuseParticleOneWayCollision(
	PxgParticleSystem * PX_RESTRICT particleSystems,
	const PxU32* PX_RESTRICT activeParticleSystems,
	const PxU32 count
)
{
	// Cache the whole PxgParticleSystem struct in shared memory; the copy is a
	// block-wide cooperative uint2 copy, synchronized before first use.
	__shared__ __align__(16) PxU8 particleSystemMemory[sizeof(PxgParticleSystem)];
	PxgParticleSystem& shParticleSystem = *(reinterpret_cast<PxgParticleSystem*>(particleSystemMemory));

	const PxU32 id = activeParticleSystems[blockIdx.y];

	const PxgParticleSystem& particleSystem = particleSystems[id];

	const uint2* sParticleSystem = reinterpret_cast<const uint2*>(&particleSystem);
	uint2* dParticleSystem = reinterpret_cast<uint2*>(&shParticleSystem);

	blockCopy<uint2>(dParticleSystem, sParticleSystem, sizeof(PxgParticleSystem));
	__syncthreads();

	const PxU32 pi = threadIdx.x + blockIdx.x * blockDim.x;

	const PxU32 numParticles = *shParticleSystem.mNumDiffuseParticles;

	if (pi >= numParticles)
		return;

	float4* PX_RESTRICT newPos = reinterpret_cast<float4*>(shParticleSystem.mDiffuseSortedPos_LifeTime);

	const PxgParticleContactInfo* PX_RESTRICT contacts = shParticleSystem.mDiffuseOneWayContactInfos;
	const PxU32* PX_RESTRICT contactCounts = shParticleSystem.mDiffuseOneWayContactCount;

	const PxU32 contactCount = PxMin(PxgParticleContactInfo::MaxStaticContactsPerParticle, contactCounts[pi]);

	if (contactCount)
	{
		PxVec3 posCorr = PxLoad3(newPos[pi]);
		// Contacts are laid out SoA-style: contact c of particle pi lives at
		// index pi + c * numParticles.
		for (PxU32 c = 0, offset = pi; c < contactCount; ++c, offset += numParticles)
		{
			const PxgParticleContactInfo& contact = contacts[offset];

			const PxVec3 surfaceNormal = PxLoad3(contact.mNormal_PenW);

			// mNormal_PenW.w is the penetration; push the particle out along the normal.
			const PxVec3 deltaP = -surfaceNormal * contact.mNormal_PenW.w;
			posCorr += deltaP;
		}

		// Preserve the lifetime stored in w.
		newPos[pi] = make_float4(posCorr.x, posCorr.y, posCorr.z, newPos[pi].w);
	}
}
|
||||
|
||||
|
||||
// For each sorted diffuse particle, computes the weighted average velocity of
// nearby fluid particles (up to 16 neighbors within the contact distance,
// weighted with WDiffuse) and writes it — with the neighbor count in w — into
// mDiffuseSortedOriginPos_LifeTime, which is repurposed here as a scratch
// velocity buffer for the subsequent compaction kernel.
// Launch layout: blockIdx.y = active particle system,
// (blockIdx.x, threadIdx.x) = diffuse particle.
// NOTE(review): the gravity and dt parameters are unused in this kernel body.
extern "C" __global__ void ps_diffuseParticleUpdatePBF(
	PxgParticleSystem* PX_RESTRICT particleSystems,
	const PxU32* activeParticleSystems,
	const PxVec3 gravity,
	const PxReal dt)
{
	// Cache the whole PxgParticleSystem struct in shared memory.
	__shared__ __align__(16) PxU8 particleSystemMemory[sizeof(PxgParticleSystem)];
	PxgParticleSystem& shParticleSystem = *(reinterpret_cast<PxgParticleSystem*>(particleSystemMemory));

	// Neighbor-cell offsets {0, -1, +1}; written by thread 0 and made visible to
	// the block by the __syncthreads() after blockCopy below.
	__shared__ int offset[3];

	if (threadIdx.x == 0)
	{
		offset[0] = 0; offset[1] = -1; offset[2] = 1;
	}

	const PxU32 id = activeParticleSystems[blockIdx.y];

	const PxgParticleSystem& particleSystem = particleSystems[id];

	const uint2* sParticleSystem = reinterpret_cast<const uint2*>(&particleSystem);
	uint2* dParticleSystem = reinterpret_cast<uint2*>(&shParticleSystem);

	blockCopy<uint2>(dParticleSystem, sParticleSystem, sizeof(PxgParticleSystem));
	__syncthreads();

	{
		int numDiffuse = *shParticleSystem.mNumDiffuseParticles;

		const PxU32 pi = threadIdx.x + blockIdx.x * blockDim.x;

		if (pi >= numDiffuse)
			return;

		// Hash-grid cell ranges of the (sorted) fluid particles.
		const PxU32* const PX_RESTRICT cellStarts = shParticleSystem.mCellStart;
		const PxU32* const PX_RESTRICT cellEnds = shParticleSystem.mCellEnd;

		// per-particle data
		const float4* const PX_RESTRICT sortedPose = reinterpret_cast<float4*>(shParticleSystem.mSortedPositions_InvMass);
		const float4* const PX_RESTRICT sortedVel = reinterpret_cast<float4*>(shParticleSystem.mSortedVelocities);

		float4* PX_RESTRICT diffusePositions = reinterpret_cast<float4*>(shParticleSystem.mDiffuseSortedPos_LifeTime);

		//Overloading this buffer to store the new velocity...
		float4* PX_RESTRICT newVel = reinterpret_cast<float4*>(shParticleSystem.mDiffuseSortedOriginPos_LifeTime);

		// get elements
		const float4 xi4 = diffusePositions[pi];
		const PxVec3 pos = PxLoad3(xi4);

		// interpolate
		PxVec3 velAvg(PxZero);
		PxU32 numNeighbors = 0;

		const PxReal cellWidth = shParticleSystem.mCommonData.mGridCellWidth;
		const PxReal contactDistanceSq = shParticleSystem.mCommonData.mParticleContactDistanceSq;
		const PxReal invContactDistance = shParticleSystem.mCommonData.mParticleContactDistanceInv;
		const int3 gridPos = calcGridPos(xi4, cellWidth);
		const uint3 gridSize = make_uint3(shParticleSystem.mCommonData.mGridSizeX, shParticleSystem.mCommonData.mGridSizeY, shParticleSystem.mCommonData.mGridSizeZ);

		// Iterate over cell
		PxReal weightSum = 0.0f;
		PxVec3 velocitySum(0.f);

		// Neighbor search stops once this many contributors have been found.
		const PxU32 maxNeighbors = 16;

		// Full advection visits the 3x3x3 cell neighborhood; otherwise only the
		// particle's own cell (end == 1 uses offset[0] == 0 only).
		const PxU32 end = (shParticleSystem.mData.mFlags & PxParticleFlag::eFULL_DIFFUSE_ADVECTION) ? 3 : 1;


		for (int z = 0; z < end; ++z)
			for (int y = 0; y < end; ++y)
				for (int x = 0; x < end; ++x)
				{
					const int3 neighbourPos = make_int3(gridPos.x + offset[x], gridPos.y + offset[y], gridPos.z + offset[z]);
					const PxU32 gridHash = calcGridHash(neighbourPos, gridSize);
					const PxU32 startIndex = cellStarts[gridHash];

					if (startIndex != EMPTY_CELL)
					{
						const PxU32 endIndex = cellEnds[gridHash];
						for (PxU32 q = startIndex; q < endIndex; ++q)
						{
							const PxVec3 xj = PxLoad3(sortedPose[q]);
							const PxVec3 xij = pos - xj;

							const PxReal dSq = xij.dot(xij);

							if (dSq < contactDistanceSq)
							{
								const PxVec3 vj = PxLoad3(sortedVel[q]);
								const PxReal w = WDiffuse(sqrtf(dSq), invContactDistance);

								weightSum += w;
								velocitySum += vj * w;

								++numNeighbors;
								// Early exit from the triple-nested loop once enough
								// neighbors have contributed.
								if (numNeighbors == maxNeighbors)
									goto weight_sum;
							}
						}
					}
				}

	weight_sum:
		if (weightSum > 0)
			velAvg = velocitySum / weightSum;

		// w carries the neighbor count — used downstream to classify spray/foam/bubble.
		newVel[pi] = make_float4(velAvg.x, velAvg.y, velAvg.z, PxReal(numNeighbors));
	}
}
|
||||
|
||||
// Integrates each surviving diffuse particle and stream-compacts the results
// back into the owning buffer, dropping particles whose lifetime has expired.
// Behavior is selected by the neighbor count stored in velAvg.w (written by
// ps_diffuseParticleUpdatePBF): < 4 neighbors = spray (ballistic with air
// drag), < 8 = foam (follows the averaged fluid velocity), otherwise bubble
// (buoyancy + drag toward the fluid velocity).
// Launch layout: blockIdx.z = active particle system, blockIdx.y = diffuse
// buffer, (blockIdx.x, threadIdx.x) = particle within that buffer.
// NOTE(review): the ballot/shuffle below use FULL_MASK after the divergent
// early return on (pi >= numDiffuse); for a tail warp some lanes named in the
// mask have exited, which the CUDA docs treat as undefined — confirm the
// launch guarantees warp-aligned counts, or switch to a computed mask.
extern "C" __global__ void ps_diffuseParticleCompact(
	PxgParticleSystem* PX_RESTRICT particleSystems,
	const PxU32* activeParticleSystems,
	const PxVec3 gravity,
	const PxReal dt)
{
	// Cache the whole PxgParticleSystem struct in shared memory.
	__shared__ __align__(16) PxU8 particleSystemMemory[sizeof(PxgParticleSystem)];
	PxgParticleSystem& shParticleSystem = *(reinterpret_cast<PxgParticleSystem*>(particleSystemMemory));

	const PxU32 id = activeParticleSystems[blockIdx.z];

	const PxgParticleSystem& particleSystem = particleSystems[id];

	const uint2* sParticleSystem = reinterpret_cast<const uint2*>(&particleSystem);
	uint2* dParticleSystem = reinterpret_cast<uint2*>(&shParticleSystem);

	blockCopy<uint2>(dParticleSystem, sParticleSystem, sizeof(PxgParticleSystem));
	__syncthreads();

	const PxU32 bufferIndex = blockIdx.y;
	if (bufferIndex < shParticleSystem.mNumDiffuseBuffers)
	{

		PxgParticleDiffuseSimBuffer& buffer = shParticleSystem.mDiffuseSimBuffers[bufferIndex];

		// numDiffuseParticles[0] = current count, [1] = compacted-count accumulator.
		int* numDiffuseParticles = buffer.mNumDiffuseParticles;
		int numDiffuse = numDiffuseParticles[0];

		const PxU32 pi = threadIdx.x + blockIdx.x * blockDim.x;
		const PxU32 threadIndexInWarp = threadIdx.x & 31;

		if (pi >= numDiffuse)
			return;

		// Compaction output: back into the buffer's own (unsorted) arrays.
		float4* PX_RESTRICT diffusePositionsNew = buffer.mDiffusePositions_LifeTime;
		float4* PX_RESTRICT diffuseVelocitiesNew = buffer.mDiffuseVelocities;

		// Scratch buffer holding the averaged fluid velocity (w = neighbor count).
		float4* PX_RESTRICT velAvgs = reinterpret_cast<float4*>(shParticleSystem.mDiffuseSortedOriginPos_LifeTime);

		float4* PX_RESTRICT diffusePositions = reinterpret_cast<float4*>(shParticleSystem.mDiffuseSortedPos_LifeTime);
		float4* PX_RESTRICT diffusePositionsOld = reinterpret_cast<float4*>(shParticleSystem.mDiffuseOriginPos_LifeTime);

		const PxU32* reverseLookup = shParticleSystem.mDiffuseUnsortedToSortedMapping;

		// mStartIndex locates this buffer's particles in the combined arrays.
		const PxU32 index = pi + buffer.mStartIndex;
		const PxU32 sortedInd = reverseLookup[index];

		// get elements
		const float4 xi4 = diffusePositions[sortedInd];
		const float4 vi4Old = diffusePositionsOld[index];
		const float4 xiva4 = velAvgs[sortedInd];
		const PxVec3 pos = PxLoad3(xi4);
		const PxVec3 oldPos = PxLoad3(vi4Old);
		const PxVec3 velAvg = PxLoad3(xiva4);

		const PxReal lifeDelta = dt;

		// Implicit velocity from the position change over this step.
		PxVec3 vel = (pos - oldPos)*(1.f / dt);

		// integrate diffuse particle
		PxVec3 newVel;
		if (xiva4.w < 4.f)
		{
			// spray (ballistic)
			newVel = vel * (1.0f - buffer.mParams.airDrag * dt);
		}
		else if (xiva4.w < 8.f)
		{
			// foam
			newVel = velAvg;
		}
		else
		{
			// bubble
			newVel = vel - (1.f + buffer.mParams.buoyancy) * gravity * dt + buffer.mParams.bubbleDrag * (velAvg - vel);
		}

		// Clamp speed to the system's max velocity, preserving direction.
		const float maxVel = shParticleSystem.mData.mMaxVelocity;
		if (newVel.magnitudeSquared() > 0)
		{
			newVel = PxMin(newVel.magnitude(), maxVel) * newVel.getNormalized();
		}

		// Apply the velocity change as a position correction for this step.
		PxVec3 newPosCorr = pos + (newVel - vel) * dt;
		PxVec3 newVelCorr = newVel;

		__syncwarp();

		const PxReal lifeTime = fmaxf(xi4.w - lifeDelta, 0.0f);

		// Warp-cooperative compaction: ballot the survivors, lane 0 reserves a
		// contiguous range with one atomic, then each survivor computes its slot
		// via an exclusive scan over the ballot mask.
		PxU32 res = __ballot_sync(FULL_MASK, lifeTime > 0.f);

		PxU32 offset = 0;

		if (threadIndexInWarp == 0)
			offset = atomicAdd(&numDiffuseParticles[1], __popc(res));

		offset = __shfl_sync(FULL_MASK, offset, 0);



		if (lifeTime > 0.f)
		{
			PxU32 newIndex = offset + warpScanExclusive(res, threadIndexInWarp);

			diffusePositionsNew[newIndex] = make_float4(newPosCorr.x, newPosCorr.y, newPosCorr.z, lifeTime);
			diffuseVelocitiesNew[newIndex] = make_float4(newVelCorr.x, newVelCorr.y, newVelCorr.z, 0.0f);
		}
	}
}
|
||||
|
||||
// Spawns new diffuse particles from fluid particles whose spawn "intensity"
// (weighted pressure - divergence + kinetic energy) exceeds the configured
// threshold, modulated by a value from the random table. Slots are reserved
// with an atomic on the tentative counter (numDiffuseParticles[1]); reservations
// beyond the buffer capacity are simply not written (the counter is clamped
// when counts are finalized).
// Launch layout: blockIdx.z = active particle system, blockIdx.y = *regular*
// particle buffer, (blockIdx.x, threadIdx.x) = particle within that buffer.
extern "C" __global__ void ps_diffuseParticleCreate(
	PxgParticleSystem * PX_RESTRICT particleSystems,
	const PxU32* const PX_RESTRICT activeParticleSystems,
	const PxReal* const PX_RESTRICT randomTable,
	const PxU32 randomTableSize,
	const PxReal dt)
{
	// Cache the whole PxgParticleSystem struct in shared memory.
	__shared__ __align__(16) PxU8 particleSystemMemory[sizeof(PxgParticleSystem)];
	PxgParticleSystem& shParticleSystem = *(reinterpret_cast<PxgParticleSystem*>(particleSystemMemory));

	const PxU32 id = activeParticleSystems[blockIdx.z];
	const PxgParticleSystem& particleSystem = particleSystems[id];

	const uint2* sParticleSystem = reinterpret_cast<const uint2*>(&particleSystem);
	uint2* dParticleSystem = reinterpret_cast<uint2*>(&shParticleSystem);

	blockCopy<uint2>(dParticleSystem, sParticleSystem, sizeof(PxgParticleSystem));
	__syncthreads();

	const PxU32 bufferIndex = blockIdx.y;
	if (bufferIndex < shParticleSystem.mCommonData.mNumParticleBuffers)
	{

		const PxgParticleSimBuffer& buffer = shParticleSystem.mParticleSimBuffers[bufferIndex];

		// 0xffffffff marks a particle buffer with no associated diffuse output.
		const PxU32 diffuseParticleBufferIndex = buffer.mDiffuseParticleBufferIndex;

		if (diffuseParticleBufferIndex == 0xffffffff)
			return;

		const PxgParticleSystemData& data = shParticleSystem.mData;

		const PxU32 pi = threadIdx.x + blockIdx.x * blockDim.x;

		const PxU32 numParticles = buffer.mNumActiveParticles;

		if (pi >= numParticles)
			return;


		PxgParticleDiffuseSimBuffer& diffuseBuffer = shParticleSystem.mDiffuseSimBuffers[diffuseParticleBufferIndex];

		if (diffuseBuffer.mMaxNumParticles == 0)
			return;

		// get arrays
		const float4* const PX_RESTRICT sortedPose = reinterpret_cast<float4*>(shParticleSystem.mSortedPositions_InvMass);
		const float4* const PX_RESTRICT sortedVel = reinterpret_cast<float4*>(shParticleSystem.mSortedVelocities);
		const PxU32* PX_RESTRICT phases = shParticleSystem.mSortedPhaseArray;
		const float2* const PX_RESTRICT potentials = reinterpret_cast<float2*>(shParticleSystem.mDiffusePotentials);

		float4* PX_RESTRICT diffusePositionsNew = diffuseBuffer.mDiffusePositions_LifeTime;
		float4* PX_RESTRICT diffuseVelocitiesNew = diffuseBuffer.mDiffuseVelocities;

		// [1] is the tentative spawn counter, reserved via atomicAdd below.
		int* numDiffuseParticles = diffuseBuffer.mNumDiffuseParticles;

		// Map the unsorted buffer-local particle index to the sorted arrays.
		const PxU32* reverseLookup = shParticleSystem.mUnsortedToSortedMapping;
		const PxU32 offset = particleSystem.mParticleBufferRunsum[bufferIndex];

		const PxU32 sortedInd = reverseLookup[pi + offset];
		// get elements
		// potentials: x = divergence potential, y = pressure potential (as used below).
		const float2 ptnts = potentials[sortedInd];
		const PxReal threshold = diffuseBuffer.mParams.threshold;
		const PxU32 phase = phases[sortedInd];

		// Only fluid-phase particles emit diffuse particles.
		if (!PxGetFluid(phase))
			return;

		const float4 vi4 = sortedVel[sortedInd];

		//Kinetic energy + pressure
		const PxReal kineticEnergy = dot3(vi4, vi4) * diffuseBuffer.mParams.kineticEnergyWeight;
		const PxReal divergence = diffuseBuffer.mParams.divergenceWeight * ptnts.x;
		const PxReal pressure = diffuseBuffer.mParams.pressureWeight * ptnts.y;
		PxReal intensity = pressure - divergence + kineticEnergy;

		//if (pi == 0)
		//	printf("numParticles %i diffuseParticleBufferIndex %i numDiffuseParticles[1] %i threshold %f\n", numParticles, diffuseParticleBufferIndex, numDiffuseParticles[1], threshold);

		// Random modulation so spawning is probabilistic near the threshold.
		const PxReal r0 = randomTable[(sortedInd + 0) % randomTableSize];

		if(r0 * intensity > threshold)
		{
			const float4 xi4 = sortedPose[sortedInd];


			//for (int i=0; i < 5; ++i)
			{
				// try and allocate new diffuse particles
				const int newIndex = atomicAdd(&numDiffuseParticles[1], 1);

				// Drop the spawn silently if the buffer is full; the over-counted
				// counter is clamped when counts are finalized.
				if (newIndex < diffuseBuffer.mMaxNumParticles)
				{

					const PxVec3 xi = PxLoad3(xi4);
					const PxVec3 vi = PxLoad3(vi4);

					const PxReal r1 = randomTable[(sortedInd + 1) % randomTableSize];
					const PxReal r2 = randomTable[(sortedInd + 2) % randomTableSize];
					const PxReal r3 = randomTable[(sortedInd + 3) % randomTableSize];

					// Lifetime scales with how far intensity exceeds the threshold,
					// jittered by r1, within [lifeMin, mParams.lifetime].
					const PxReal lifeMin = 1.0f;
					const PxReal lifeMax = diffuseBuffer.mParams.lifetime;
					const PxReal lifeScale = fminf(intensity / threshold, 1.f) * r1;
					const PxReal lifetime = lifeMin + lifeScale * (lifeMax - lifeMin);

					// Spawn slightly behind the source particle, jittered by the
					// random triple scaled by a quarter of the rest offset.
					const PxVec3 q = xi - r2 * vi * dt + PxVec3(r1, r2, r3) * data.mRestOffset * 0.25f;

					diffusePositionsNew[newIndex] = make_float4(q.x, q.y, q.z, lifetime);
					diffuseVelocitiesNew[newIndex] = make_float4(vi.x, vi.y, vi.z, 0.0f);
				}
			}
		}
	}
}
|
||||
|
||||
|
||||
/**
 * Publishes the per-buffer diffuse particle counts produced during this step
 * and resets the allocation counter for the next step.
 *
 * Launch layout: blockIdx.z selects the active particle system and blockIdx.y
 * selects the diffuse buffer within that system.
 */
extern "C" __global__ void ps_diffuseParticleCopy(
	PxgParticleSystem * PX_RESTRICT particleSystems,
	const PxU32* const PX_RESTRICT activeParticleSystems,
	const PxU32 count)
{
	const PxU32 systemId = activeParticleSystems[blockIdx.z];
	PxgParticleSystem& system = particleSystems[systemId];

	const PxU32 bufferIndex = blockIdx.y;
	if (bufferIndex >= system.mNumDiffuseBuffers)
		return;

	PxgParticleDiffuseSimBuffer& buffer = system.mDiffuseSimBuffers[bufferIndex];
	int* counts = buffer.mNumDiffuseParticles;

	// Clamp the number of particles allocated this step to the buffer capacity.
	const PxU32 activeCount = PxMin(PxI32(buffer.mMaxNumParticles), counts[1]);

	*buffer.mNumActiveDiffuseParticles = activeCount; //pinned memory
	counts[0] = activeCount;
	// Reset the atomic allocation slot for the next simulation step.
	counts[1] = 0;
}
|
||||
|
||||
|
||||
// Sums the per-buffer diffuse particle counts of one particle system into
// particleSystem.mNumDiffuseParticles.
//
// Launch layout: blockIdx.x selects the active particle system. The reduction
// passes FULL_MASK, so all 32 lanes of the warp must participate.
// NOTE(review): appears to assume blockDim.x == WARP_SIZE — confirm at the
// launch site.
extern "C" __global__ void ps_diffuseParticleSum(
	PxgParticleSystem * PX_RESTRICT particleSystems,
	const PxU32* const PX_RESTRICT activeParticleSystems,
	const PxU32 count)
{
	const PxU32 id = activeParticleSystems[blockIdx.x];
	PxgParticleSystem& particleSystem = particleSystems[id];

	const PxU32 numDiffuseBuffers = particleSystem.mNumDiffuseBuffers;

	// Each lane accumulates a WARP_SIZE-strided subset of the diffuse buffers.
	PxU32 totalDiffuse = 0;
	for (PxU32 i = threadIdx.x; i < numDiffuseBuffers; i += WARP_SIZE)
	{
		PxgParticleDiffuseSimBuffer& diffuseBuffer = particleSystem.mDiffuseSimBuffers[i];
		totalDiffuse += diffuseBuffer.mNumDiffuseParticles[0];
	}

	// Warp-wide sum of the per-lane partial counts.
	totalDiffuse = warpReduction<AddOpPxU32, PxU32>(FULL_MASK, totalDiffuse);

	// Lane 0 holds the reduced total; publish it.
	if(threadIdx.x == 0)
	{
		*particleSystem.mNumDiffuseParticles = totalDiffuse;
	}
}
|
||||
1164
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/isosurfaceExtraction.cu
vendored
Normal file
1164
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/isosurfaceExtraction.cu
vendored
Normal file
File diff suppressed because it is too large
Load Diff
184
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/marchingCubesTables.cuh
vendored
Normal file
184
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/marchingCubesTables.cuh
vendored
Normal file
@@ -0,0 +1,184 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
|
||||
|
||||
|
||||
// point numbering
|
||||
|
||||
// 7-----------6
|
||||
// /| /|
|
||||
// / | / |
|
||||
// / | / |
|
||||
// 4-----------5 |
|
||||
// | | | |
|
||||
// | 3-------|---2
|
||||
// | / | /
|
||||
// | / | /
|
||||
// |/ |/
|
||||
// 0-----------1
|
||||
|
||||
// edge numbering
|
||||
|
||||
// *-----6-----*
|
||||
// /| /|
|
||||
// 7 | 5 |
|
||||
// / 11 / 10
|
||||
// *-----4-----* |
|
||||
// | | | |
|
||||
// | *-----2-|---*
|
||||
// 8 / 9 /
|
||||
// | 3 | 1
|
||||
// |/ |/
|
||||
// *-----0-----*
|
||||
|
||||
|
||||
// z
|
||||
// | y
|
||||
// | /
|
||||
// |/
|
||||
// 0---- x
|
||||
|
||||
// Local integer offsets (x, y, z) of the 8 cell corners, matching the point
// numbering diagram above.
__constant__ int marchingCubeCorners[8][3] = { {0,0,0}, {1,0,0},{1,1,0},{0,1,0}, {0,0,1}, {1,0,1},{1,1,1},{0,1,1} };

// Prefix offsets into marchingCubesIds, one per 8-bit corner-occupancy
// configuration. Configuration c owns the range
// marchingCubesIds[firstMarchingCubesId[c] .. firstMarchingCubesId[c + 1]);
// 257 entries so that the c + 1 lookup is valid for c = 255.
__constant__ int firstMarchingCubesId[257] = {
0, 0, 3, 6, 12, 15, 21, 27, 36, 39, 45, 51, 60, 66, 75, 84, 90, 93, 99, 105, 114,
120, 129, 138, 150, 156, 165, 174, 186, 195, 207, 219, 228, 231, 237, 243, 252, 258, 267, 276, 288,
294, 303, 312, 324, 333, 345, 357, 366, 372, 381, 390, 396, 405, 417, 429, 438, 447, 459, 471, 480,
492, 507, 522, 528, 531, 537, 543, 552, 558, 567, 576, 588, 594, 603, 612, 624, 633, 645, 657, 666,
672, 681, 690, 702, 711, 723, 735, 750, 759, 771, 783, 798, 810, 825, 840, 852, 858, 867, 876, 888,
897, 909, 915, 924, 933, 945, 957, 972, 984, 999, 1008, 1014, 1023, 1035, 1047, 1056, 1068, 1083, 1092, 1098,
1110, 1125, 1140, 1152, 1167, 1173, 1185, 1188, 1191, 1197, 1203, 1212, 1218, 1227, 1236, 1248, 1254, 1263, 1272, 1284,
1293, 1305, 1317, 1326, 1332, 1341, 1350, 1362, 1371, 1383, 1395, 1410, 1419, 1425, 1437, 1446, 1458, 1467, 1482, 1488,
1494, 1503, 1512, 1524, 1533, 1545, 1557, 1572, 1581, 1593, 1605, 1620, 1632, 1647, 1662, 1674, 1683, 1695, 1707, 1716,
1728, 1743, 1758, 1770, 1782, 1791, 1806, 1812, 1827, 1839, 1845, 1848, 1854, 1863, 1872, 1884, 1893, 1905, 1917, 1932,
1941, 1953, 1965, 1980, 1986, 1995, 2004, 2010, 2019, 2031, 2043, 2058, 2070, 2085, 2100, 2106, 2118, 2127, 2142, 2154,
2163, 2169, 2181, 2184, 2193, 2205, 2217, 2232, 2244, 2259, 2268, 2280, 2292, 2307, 2322, 2328, 2337, 2349, 2355, 2358,
2364, 2373, 2382, 2388, 2397, 2409, 2415, 2418, 2427, 2433, 2445, 2448, 2454, 2457, 2460, 2460 };

// Flattened triangle lists for all 256 configurations. Each value is a cell
// edge index (0..11, see edge numbering diagram above) and consecutive
// triples form one triangle; ranges delimited by firstMarchingCubesId.
__constant__ int marchingCubesIds[2460] = {
0, 8, 3, 0, 1, 9, 1, 8, 3, 9, 8, 1, 1, 2, 10, 0, 8, 3, 1, 2, 10, 9, 2, 10, 0, 2, 9, 2, 8, 3, 2,
10, 8, 10, 9, 8, 3, 11, 2, 0, 11, 2, 8, 11, 0, 1, 9, 0, 2, 3, 11, 1, 11, 2, 1, 9, 11, 9, 8, 11, 3,
10, 1, 11, 10, 3, 0, 10, 1, 0, 8, 10, 8, 11, 10, 3, 9, 0, 3, 11, 9, 11, 10, 9, 9, 8, 10, 10, 8, 11, 4,
7, 8, 4, 3, 0, 7, 3, 4, 0, 1, 9, 8, 4, 7, 4, 1, 9, 4, 7, 1, 7, 3, 1, 1, 2, 10, 8, 4, 7, 3,
4, 7, 3, 0, 4, 1, 2, 10, 9, 2, 10, 9, 0, 2, 8, 4, 7, 2, 10, 9, 2, 9, 7, 2, 7, 3, 7, 9, 4, 8,
4, 7, 3, 11, 2, 11, 4, 7, 11, 2, 4, 2, 0, 4, 9, 0, 1, 8, 4, 7, 2, 3, 11, 4, 7, 11, 9, 4, 11, 9,
11, 2, 9, 2, 1, 3, 10, 1, 3, 11, 10, 7, 8, 4, 1, 11, 10, 1, 4, 11, 1, 0, 4, 7, 11, 4, 4, 7, 8, 9,
0, 11, 9, 11, 10, 11, 0, 3, 4, 7, 11, 4, 11, 9, 9, 11, 10, 9, 5, 4, 9, 5, 4, 0, 8, 3, 0, 5, 4, 1,
5, 0, 8, 5, 4, 8, 3, 5, 3, 1, 5, 1, 2, 10, 9, 5, 4, 3, 0, 8, 1, 2, 10, 4, 9, 5, 5, 2, 10, 5,
4, 2, 4, 0, 2, 2, 10, 5, 3, 2, 5, 3, 5, 4, 3, 4, 8, 9, 5, 4, 2, 3, 11, 0, 11, 2, 0, 8, 11, 4,
9, 5, 0, 5, 4, 0, 1, 5, 2, 3, 11, 2, 1, 5, 2, 5, 8, 2, 8, 11, 4, 8, 5, 10, 3, 11, 10, 1, 3, 9,
5, 4, 4, 9, 5, 0, 8, 1, 8, 10, 1, 8, 11, 10, 5, 4, 0, 5, 0, 11, 5, 11, 10, 11, 0, 3, 5, 4, 8, 5,
8, 10, 10, 8, 11, 9, 7, 8, 5, 7, 9, 9, 3, 0, 9, 5, 3, 5, 7, 3, 0, 7, 8, 0, 1, 7, 1, 5, 7, 1,
5, 3, 3, 5, 7, 9, 7, 8, 9, 5, 7, 10, 1, 2, 10, 1, 2, 9, 5, 0, 5, 3, 0, 5, 7, 3, 8, 0, 2, 8,
2, 5, 8, 5, 7, 10, 5, 2, 2, 10, 5, 2, 5, 3, 3, 5, 7, 7, 9, 5, 7, 8, 9, 3, 11, 2, 9, 5, 7, 9,
7, 2, 9, 2, 0, 2, 7, 11, 2, 3, 11, 0, 1, 8, 1, 7, 8, 1, 5, 7, 11, 2, 1, 11, 1, 7, 7, 1, 5, 9,
5, 8, 8, 5, 7, 10, 1, 3, 10, 3, 11, 5, 7, 0, 5, 0, 9, 7, 11, 0, 1, 0, 10, 11, 10, 0, 11, 10, 0, 11,
0, 3, 10, 5, 0, 8, 0, 7, 5, 7, 0, 11, 10, 5, 7, 11, 5, 10, 6, 5, 0, 8, 3, 5, 10, 6, 9, 0, 1, 5,
10, 6, 1, 8, 3, 1, 9, 8, 5, 10, 6, 1, 6, 5, 2, 6, 1, 1, 6, 5, 1, 2, 6, 3, 0, 8, 9, 6, 5, 9,
0, 6, 0, 2, 6, 5, 9, 8, 5, 8, 2, 5, 2, 6, 3, 2, 8, 2, 3, 11, 10, 6, 5, 11, 0, 8, 11, 2, 0, 10,
6, 5, 0, 1, 9, 2, 3, 11, 5, 10, 6, 5, 10, 6, 1, 9, 2, 9, 11, 2, 9, 8, 11, 6, 3, 11, 6, 5, 3, 5,
1, 3, 0, 8, 11, 0, 11, 5, 0, 5, 1, 5, 11, 6, 3, 11, 6, 0, 3, 6, 0, 6, 5, 0, 5, 9, 6, 5, 9, 6,
9, 11, 11, 9, 8, 5, 10, 6, 4, 7, 8, 4, 3, 0, 4, 7, 3, 6, 5, 10, 1, 9, 0, 5, 10, 6, 8, 4, 7, 10,
6, 5, 1, 9, 7, 1, 7, 3, 7, 9, 4, 6, 1, 2, 6, 5, 1, 4, 7, 8, 1, 2, 5, 5, 2, 6, 3, 0, 4, 3,
4, 7, 8, 4, 7, 9, 0, 5, 0, 6, 5, 0, 2, 6, 7, 3, 9, 7, 9, 4, 3, 2, 9, 5, 9, 6, 2, 6, 9, 3,
11, 2, 7, 8, 4, 10, 6, 5, 5, 10, 6, 4, 7, 2, 4, 2, 0, 2, 7, 11, 0, 1, 9, 4, 7, 8, 2, 3, 11, 5,
10, 6, 9, 2, 1, 9, 11, 2, 9, 4, 11, 7, 11, 4, 5, 10, 6, 8, 4, 7, 3, 11, 5, 3, 5, 1, 5, 11, 6, 5,
1, 11, 5, 11, 6, 1, 0, 11, 7, 11, 4, 0, 4, 11, 0, 5, 9, 0, 6, 5, 0, 3, 6, 11, 6, 3, 8, 4, 7, 6,
5, 9, 6, 9, 11, 4, 7, 9, 7, 11, 9, 10, 4, 9, 6, 4, 10, 4, 10, 6, 4, 9, 10, 0, 8, 3, 10, 0, 1, 10,
6, 0, 6, 4, 0, 8, 3, 1, 8, 1, 6, 8, 6, 4, 6, 1, 10, 1, 4, 9, 1, 2, 4, 2, 6, 4, 3, 0, 8, 1,
2, 9, 2, 4, 9, 2, 6, 4, 0, 2, 4, 4, 2, 6, 8, 3, 2, 8, 2, 4, 4, 2, 6, 10, 4, 9, 10, 6, 4, 11,
2, 3, 0, 8, 2, 2, 8, 11, 4, 9, 10, 4, 10, 6, 3, 11, 2, 0, 1, 6, 0, 6, 4, 6, 1, 10, 6, 4, 1, 6,
1, 10, 4, 8, 1, 2, 1, 11, 8, 11, 1, 9, 6, 4, 9, 3, 6, 9, 1, 3, 11, 6, 3, 8, 11, 1, 8, 1, 0, 11,
6, 1, 9, 1, 4, 6, 4, 1, 3, 11, 6, 3, 6, 0, 0, 6, 4, 6, 4, 8, 11, 6, 8, 7, 10, 6, 7, 8, 10, 8,
9, 10, 0, 7, 3, 0, 10, 7, 0, 9, 10, 6, 7, 10, 10, 6, 7, 1, 10, 7, 1, 7, 8, 1, 8, 0, 10, 6, 7, 10,
7, 1, 1, 7, 3, 1, 2, 6, 1, 6, 8, 1, 8, 9, 8, 6, 7, 2, 6, 9, 2, 9, 1, 6, 7, 9, 0, 9, 3, 7,
3, 9, 7, 8, 0, 7, 0, 6, 6, 0, 2, 7, 3, 2, 6, 7, 2, 2, 3, 11, 10, 6, 8, 10, 8, 9, 8, 6, 7, 2,
0, 7, 2, 7, 11, 0, 9, 7, 6, 7, 10, 9, 10, 7, 1, 8, 0, 1, 7, 8, 1, 10, 7, 6, 7, 10, 2, 3, 11, 11,
2, 1, 11, 1, 7, 10, 6, 1, 6, 7, 1, 8, 9, 6, 8, 6, 7, 9, 1, 6, 11, 6, 3, 1, 3, 6, 0, 9, 1, 11,
6, 7, 7, 8, 0, 7, 0, 6, 3, 11, 0, 11, 6, 0, 7, 11, 6, 7, 6, 11, 3, 0, 8, 11, 7, 6, 0, 1, 9, 11,
7, 6, 8, 1, 9, 8, 3, 1, 11, 7, 6, 10, 1, 2, 6, 11, 7, 1, 2, 10, 3, 0, 8, 6, 11, 7, 2, 9, 0, 2,
10, 9, 6, 11, 7, 6, 11, 7, 2, 10, 3, 10, 8, 3, 10, 9, 8, 7, 2, 3, 6, 2, 7, 7, 0, 8, 7, 6, 0, 6,
2, 0, 2, 7, 6, 2, 3, 7, 0, 1, 9, 1, 6, 2, 1, 8, 6, 1, 9, 8, 8, 7, 6, 10, 7, 6, 10, 1, 7, 1,
3, 7, 10, 7, 6, 1, 7, 10, 1, 8, 7, 1, 0, 8, 0, 3, 7, 0, 7, 10, 0, 10, 9, 6, 10, 7, 7, 6, 10, 7,
10, 8, 8, 10, 9, 6, 8, 4, 11, 8, 6, 3, 6, 11, 3, 0, 6, 0, 4, 6, 8, 6, 11, 8, 4, 6, 9, 0, 1, 9,
4, 6, 9, 6, 3, 9, 3, 1, 11, 3, 6, 6, 8, 4, 6, 11, 8, 2, 10, 1, 1, 2, 10, 3, 0, 11, 0, 6, 11, 0,
4, 6, 4, 11, 8, 4, 6, 11, 0, 2, 9, 2, 10, 9, 10, 9, 3, 10, 3, 2, 9, 4, 3, 11, 3, 6, 4, 6, 3, 8,
2, 3, 8, 4, 2, 4, 6, 2, 0, 4, 2, 4, 6, 2, 1, 9, 0, 2, 3, 4, 2, 4, 6, 4, 3, 8, 1, 9, 4, 1,
4, 2, 2, 4, 6, 8, 1, 3, 8, 6, 1, 8, 4, 6, 6, 10, 1, 10, 1, 0, 10, 0, 6, 6, 0, 4, 4, 6, 3, 4,
3, 8, 6, 10, 3, 0, 3, 9, 10, 9, 3, 10, 9, 4, 6, 10, 4, 4, 9, 5, 7, 6, 11, 0, 8, 3, 4, 9, 5, 11,
7, 6, 5, 0, 1, 5, 4, 0, 7, 6, 11, 11, 7, 6, 8, 3, 4, 3, 5, 4, 3, 1, 5, 9, 5, 4, 10, 1, 2, 7,
6, 11, 6, 11, 7, 1, 2, 10, 0, 8, 3, 4, 9, 5, 7, 6, 11, 5, 4, 10, 4, 2, 10, 4, 0, 2, 3, 4, 8, 3,
5, 4, 3, 2, 5, 10, 5, 2, 11, 7, 6, 7, 2, 3, 7, 6, 2, 5, 4, 9, 9, 5, 4, 0, 8, 6, 0, 6, 2, 6,
8, 7, 3, 6, 2, 3, 7, 6, 1, 5, 0, 5, 4, 0, 6, 2, 8, 6, 8, 7, 2, 1, 8, 4, 8, 5, 1, 5, 8, 9,
5, 4, 10, 1, 6, 1, 7, 6, 1, 3, 7, 1, 6, 10, 1, 7, 6, 1, 0, 7, 8, 7, 0, 9, 5, 4, 4, 0, 10, 4,
10, 5, 0, 3, 10, 6, 10, 7, 3, 7, 10, 7, 6, 10, 7, 10, 8, 5, 4, 10, 4, 8, 10, 6, 9, 5, 6, 11, 9, 11,
8, 9, 3, 6, 11, 0, 6, 3, 0, 5, 6, 0, 9, 5, 0, 11, 8, 0, 5, 11, 0, 1, 5, 5, 6, 11, 6, 11, 3, 6,
3, 5, 5, 3, 1, 1, 2, 10, 9, 5, 11, 9, 11, 8, 11, 5, 6, 0, 11, 3, 0, 6, 11, 0, 9, 6, 5, 6, 9, 1,
2, 10, 11, 8, 5, 11, 5, 6, 8, 0, 5, 10, 5, 2, 0, 2, 5, 6, 11, 3, 6, 3, 5, 2, 10, 3, 10, 5, 3, 5,
8, 9, 5, 2, 8, 5, 6, 2, 3, 8, 2, 9, 5, 6, 9, 6, 0, 0, 6, 2, 1, 5, 8, 1, 8, 0, 5, 6, 8, 3,
8, 2, 6, 2, 8, 1, 5, 6, 2, 1, 6, 1, 3, 6, 1, 6, 10, 3, 8, 6, 5, 6, 9, 8, 9, 6, 10, 1, 0, 10,
0, 6, 9, 5, 0, 5, 6, 0, 0, 3, 8, 5, 6, 10, 10, 5, 6, 11, 5, 10, 7, 5, 11, 11, 5, 10, 11, 7, 5, 8,
3, 0, 5, 11, 7, 5, 10, 11, 1, 9, 0, 10, 7, 5, 10, 11, 7, 9, 8, 1, 8, 3, 1, 11, 1, 2, 11, 7, 1, 7,
5, 1, 0, 8, 3, 1, 2, 7, 1, 7, 5, 7, 2, 11, 9, 7, 5, 9, 2, 7, 9, 0, 2, 2, 11, 7, 7, 5, 2, 7,
2, 11, 5, 9, 2, 3, 2, 8, 9, 8, 2, 2, 5, 10, 2, 3, 5, 3, 7, 5, 8, 2, 0, 8, 5, 2, 8, 7, 5, 10,
2, 5, 9, 0, 1, 5, 10, 3, 5, 3, 7, 3, 10, 2, 9, 8, 2, 9, 2, 1, 8, 7, 2, 10, 2, 5, 7, 5, 2, 1,
3, 5, 3, 7, 5, 0, 8, 7, 0, 7, 1, 1, 7, 5, 9, 0, 3, 9, 3, 5, 5, 3, 7, 9, 8, 7, 5, 9, 7, 5,
8, 4, 5, 10, 8, 10, 11, 8, 5, 0, 4, 5, 11, 0, 5, 10, 11, 11, 3, 0, 0, 1, 9, 8, 4, 10, 8, 10, 11, 10,
4, 5, 10, 11, 4, 10, 4, 5, 11, 3, 4, 9, 4, 1, 3, 1, 4, 2, 5, 1, 2, 8, 5, 2, 11, 8, 4, 5, 8, 0,
4, 11, 0, 11, 3, 4, 5, 11, 2, 11, 1, 5, 1, 11, 0, 2, 5, 0, 5, 9, 2, 11, 5, 4, 5, 8, 11, 8, 5, 9,
4, 5, 2, 11, 3, 2, 5, 10, 3, 5, 2, 3, 4, 5, 3, 8, 4, 5, 10, 2, 5, 2, 4, 4, 2, 0, 3, 10, 2, 3,
5, 10, 3, 8, 5, 4, 5, 8, 0, 1, 9, 5, 10, 2, 5, 2, 4, 1, 9, 2, 9, 4, 2, 8, 4, 5, 8, 5, 3, 3,
5, 1, 0, 4, 5, 1, 0, 5, 8, 4, 5, 8, 5, 3, 9, 0, 5, 0, 3, 5, 9, 4, 5, 4, 11, 7, 4, 9, 11, 9,
10, 11, 0, 8, 3, 4, 9, 7, 9, 11, 7, 9, 10, 11, 1, 10, 11, 1, 11, 4, 1, 4, 0, 7, 4, 11, 3, 1, 4, 3,
4, 8, 1, 10, 4, 7, 4, 11, 10, 11, 4, 4, 11, 7, 9, 11, 4, 9, 2, 11, 9, 1, 2, 9, 7, 4, 9, 11, 7, 9,
1, 11, 2, 11, 1, 0, 8, 3, 11, 7, 4, 11, 4, 2, 2, 4, 0, 11, 7, 4, 11, 4, 2, 8, 3, 4, 3, 2, 4, 2,
9, 10, 2, 7, 9, 2, 3, 7, 7, 4, 9, 9, 10, 7, 9, 7, 4, 10, 2, 7, 8, 7, 0, 2, 0, 7, 3, 7, 10, 3,
10, 2, 7, 4, 10, 1, 10, 0, 4, 0, 10, 1, 10, 2, 8, 7, 4, 4, 9, 1, 4, 1, 7, 7, 1, 3, 4, 9, 1, 4,
1, 7, 0, 8, 1, 8, 7, 1, 4, 0, 3, 7, 4, 3, 4, 8, 7, 9, 10, 8, 10, 11, 8, 3, 0, 9, 3, 9, 11, 11,
9, 10, 0, 1, 10, 0, 10, 8, 8, 10, 11, 3, 1, 10, 11, 3, 10, 1, 2, 11, 1, 11, 9, 9, 11, 8, 3, 0, 9, 3,
9, 11, 1, 2, 9, 2, 11, 9, 0, 2, 11, 8, 0, 11, 3, 2, 11, 2, 3, 8, 2, 8, 10, 10, 8, 9, 9, 10, 2, 0,
9, 2, 2, 3, 8, 2, 8, 10, 0, 1, 8, 1, 10, 8, 1, 10, 2, 1, 3, 8, 9, 1, 8, 0, 9, 1, 0, 3, 8 };

// Maps each of the 12 cell edges to its canonical storage location.
// Each row is {dx, dy, dz, e}: the edge belongs to the cell offset by
// (dx, dy, dz) as that cell's owned edge e, where e = 0/1/2 is the edge
// leaving the cell origin along x/y/z (see edge numbering diagram above).
__constant__ int marchingCubesEdgeLocations[12][4] = {
// relative cell coords, edge within cell
{0, 0, 0, 0},
{1, 0, 0, 1},
{0, 1, 0, 0},
{0, 0, 0, 1},

{0, 0, 1, 0},
{1, 0, 1, 1},
{0, 1, 1, 0},
{0, 0, 1, 1},

{0, 0, 0, 2},
{1, 0, 0, 2},
{1, 1, 0, 2},
{0, 1, 0, 2}
};
|
||||
177
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/matrixDecomposition.cuh
vendored
Normal file
177
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/matrixDecomposition.cuh
vendored
Normal file
@@ -0,0 +1,177 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef __CU_MATRIXDECOMPOSITION_CUH__
|
||||
#define __CU_MATRIXDECOMPOSITION_CUH__
|
||||
|
||||
#include "foundation/PxMat33.h"
|
||||
|
||||
namespace physx {
|
||||
|
||||
// Eigen decomposition code thanks to Matthias Mueller-Fischer!
|
||||
// One compile-time-specialized Jacobi rotation step on a symmetric 3x3 matrix
// stored as a flat float[9] with element (row, col) at index 3 * col + row
// (matching the flat pointers eigenDecomposition builds from PxMat33 storage).
// Template parameters: p, q select the off-diagonal element A(p, q) to zero;
// k is the remaining index (k != p, k != q), so all offsets fold to constants.
template<int p, int q, int k> __device__
inline void jacobiRotateT(float* A, float* R)
{
	// Flat offsets of A(p,q), A(q,p), A(p,p), A(q,q).
	const int pq_index = 3 * q + p;
	const int qp_index = 3 * p + q;
	const int pp_index = 3 * p + p;
	const int qq_index = 3 * q + q;

	// rotates A through phi in pq-plane to set A(p,q) = 0
	// rotation stored in R whose columns are eigenvectors of A
	if (A[pq_index] == 0.0f)
		return;

	// Givens rotation parameters: t = tan(phi), c = cos(phi), s = sin(phi).
	// Built with fast-math device intrinsics (__fdividef / rsqrtf trade a few
	// ULPs for speed).
	float d = __fdividef(A[pp_index] - A[qq_index], 2.0f*A[pq_index]);
	float dSqPlus1 = d * d + 1.0f;
	float t = __fdividef(1.0f, fabs(d) + sqrtf(dSqPlus1));
	t = copysign(t, d);
	float c = 1.0f * rsqrtf(t*t + 1.0f);
	float s = t * c;

	// Update the two diagonal entries and zero the annihilated pair.
	A[pp_index] += t * A[pq_index];
	A[qq_index] -= t * A[pq_index];
	A[pq_index] = A[qp_index] = 0.0f;

	// transform A
	// Only row/column k of A remains to be rotated; flat offsets of
	// A(k,p), A(k,q), A(p,k), A(q,k).
	const int kp = p * 3 + k;
	const int kq = q * 3 + k;
	const int pk = k * 3 + p;
	const int qk = k * 3 + q;

	float Akp = c * A[kp] + s * A[kq];
	float Akq = -s * A[kp] + c * A[kq];
	A[kp] = A[pk] = Akp;
	A[kq] = A[qk] = Akq;

	// store rotation in R (loop unrolled for k = 0,1,2)
	// Unlike A, all three rows of R are rotated (R is not symmetric).
	// k = 0
	const int kp0 = p * 3 + 0;
	const int kq0 = q * 3 + 0;

	float Rkp0 = c * R[kp0] + s * R[kq0];
	float Rkq0 = -s * R[kp0] + c * R[kq0];
	R[kp0] = Rkp0;
	R[kq0] = Rkq0;

	// k = 1
	const int kp1 = p * 3 + 1;
	const int kq1 = q * 3 + 1;

	float Rkp1 = c * R[kp1] + s * R[kq1];
	float Rkq1 = -s * R[kp1] + c * R[kq1];
	R[kp1] = Rkp1;
	R[kq1] = Rkq1;

	// k = 2
	const int kp2 = p * 3 + 2;
	const int kq2 = q * 3 + 2;

	float Rkp2 = c * R[kp2] + s * R[kq2];
	float Rkq2 = -s * R[kp2] + c * R[kq2];
	R[kp2] = Rkp2;
	R[kq2] = Rkq2;
}
|
||||
|
||||
// Reference (runtime-indexed) Jacobi rotation step: annihilates A(p, q) via a
// Givens rotation in the pq-plane and accumulates the rotation into R, whose
// columns converge to the eigenvectors of the original matrix.
__device__
inline void jacobiRotate(PxMat33 &A, PxMat33 &R, int p, int q)
{
	// Already zero: nothing to rotate.
	if (A(p, q) == 0.0f)
		return;

	// Rotation coefficients: tanPhi picks the smaller rotation angle.
	const float delta = (A(p, p) - A(q, q)) / (2.0f*A(p, q));
	float tanPhi = 1.0f / (fabs(delta) + sqrtf(delta*delta + 1.0f));
	if (delta < 0.0f) tanPhi = -tanPhi;
	const float cosPhi = 1.0f / sqrtf(tanPhi*tanPhi + 1.0f);
	const float sinPhi = tanPhi * cosPhi;

	// Update the diagonal and zero the target off-diagonal pair.
	A(p, p) += tanPhi * A(p, q);
	A(q, q) -= tanPhi * A(p, q);
	A(p, q) = A(q, p) = 0.0f;

	// Rotate the remaining row/column of A (skipping p and q, which were
	// handled above) and accumulate the rotation into every row of R.
	for (int k = 0; k < 3; k++)
	{
		if (k != p && k != q)
		{
			const float newAkp = cosPhi * A(k, p) + sinPhi * A(k, q);
			const float newAkq = -sinPhi * A(k, p) + cosPhi * A(k, q);
			A(k, p) = A(p, k) = newAkp;
			A(k, q) = A(q, k) = newAkq;
		}
		const float newRkp = cosPhi * R(k, p) + sinPhi * R(k, q);
		const float newRkq = -sinPhi * R(k, p) + cosPhi * R(k, q);
		R(k, p) = newRkp;
		R(k, q) = newRkq;
	}
}
|
||||
|
||||
// Iterative Jacobi eigen decomposition of a symmetric 3x3 matrix.
//
// On return the diagonal of A holds the eigenvalue approximations and the
// columns of R hold the corresponding eigenvectors. A must be symmetric and
// is modified in place. numJacobiIterations bounds the number of rotations;
// the loop exits early once all off-diagonal elements fall below epsilon.
__device__
inline void eigenDecomposition(PxMat33 &A, PxMat33 &R, int numJacobiIterations = 4)
{
	const float epsilon = 1e-15f;

	// only for symmetric matrices!
	R = PxMat33(PxVec3(1.f, 0.f, 0.f), PxVec3(0.f, 1.f, 0.f), PxVec3(0.f, 0.f, 1.f));

	// Flat views of the matrix storage for the templated fast path.
	float* fA = static_cast<float*>(&A(0, 0));
	float* fR = static_cast<float*>(&R(0, 0));

#define USE_FAST_JACOBI 1

	for (int i = 0; i < numJacobiIterations; i++)
	{// 3 off diagonal elements
		// find off diagonal element with maximum modulus
		int j = 0;
		float max = fabs(A(0, 1));
		float a = fabs(A(0, 2));
		if (a > max) { j = 1; max = a; }
		a = fabs(A(1, 2));
		if (a > max) { j = 2; max = a; }

		// all small enough -> done
		if (max < epsilon) break;

#if USE_FAST_JACOBI
		// rotate matrix with respect to that element
		if (j == 0) jacobiRotateT<0, 1, 2>(fA, fR);
		else if (j == 1) jacobiRotateT<0, 2, 1>(fA, fR);
		else jacobiRotateT<1, 2, 0>(fA, fR);
#else
		// Bugfix: this fallback previously called jacobiRotate(A, R, p, q)
		// with undeclared p/q and did not compile. Map the selected element
		// index j to its (p, q) pair:
		// j == 0 -> A(0, 1), j == 1 -> A(0, 2), j == 2 -> A(1, 2).
		const int p = (j == 2) ? 1 : 0;
		const int q = (j == 0) ? 1 : 2;
		jacobiRotate(A, R, p, q);
#endif
	}

// Don't leak the configuration macro into every file including this header.
#undef USE_FAST_JACOBI
}
|
||||
|
||||
}
|
||||
|
||||
#endif // __CU_MATRIXDECOMPOSITION_CUH__
|
||||
121
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/particleSystem.cuh
vendored
Normal file
121
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/particleSystem.cuh
vendored
Normal file
@@ -0,0 +1,121 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef __PARTICLE_SYSTEM_CUH__
|
||||
#define __PARTICLE_SYSTEM_CUH__
|
||||
|
||||
#include "foundation/PxVec3.h"
|
||||
#include "foundation/PxVec4.h"
|
||||
#include "utils.cuh"
|
||||
#include "reduction.cuh"
|
||||
#include "PxParticleGpu.h"
|
||||
#include "PxSparseGridParams.h"
|
||||
#include "PxgParticleSystem.h"
|
||||
|
||||
namespace physx
|
||||
{
|
||||
// World-space extent of the interior (non-halo) region of one subgrid:
// spacing times (cells per axis minus the halo cells on both sides).
__device__ inline PxVec3 getSubgridDomainSize(const PxSparseGridParams& params)
{
	const PxReal spacing = params.gridSpacing;
	return PxVec3(
		spacing * (params.subgridSizeX - 2 * params.haloSize),
		spacing * (params.subgridSizeY - 2 * params.haloSize),
		spacing * (params.subgridSizeZ - 2 * params.haloSize));
}
|
||||
|
||||
// Binary-searches the sorted hash-key table for hashToFind. 'result' receives
// the index reported by binarySearch; the return value says whether that slot
// actually contains the key.
// NOTE(review): the sortedHashkey[result] read assumes binarySearch returns a
// valid index even when numSubgrids == 0 — confirm callers never pass an
// empty table.
__device__ inline bool tryFindSubgridHashkey(
	const PxU32* const PX_RESTRICT sortedHashkey,
	const PxU32 numSubgrids,
	const PxU32 hashToFind,
	PxU32& result)
{
	result = binarySearch(sortedHashkey, numSubgrids, hashToFind);
	return sortedHashkey[result] == hashToFind;
}
|
||||
|
||||
// Squares its argument.
__device__ inline float sqr(PxReal x)
{
	return x * x;
}
|
||||
|
||||
// Smoothing-kernel value at distance x: kSpiky1 * (1 - x / radius)^2,
// with kInvRadius = 1 / radius and kSpiky1 the precomputed normalization.
__device__ inline float W(const PxReal x, const PxReal kSpiky1, const PxReal kInvRadius)
{
	const PxReal falloff = 1.0f - x * kInvRadius;
	return kSpiky1 * (falloff * falloff);
}
|
||||
|
||||
// Kernel gradient magnitude at distance x: -kSpiky2 * (1 - x / radius),
// with kInvRadius = 1 / radius and kSpiky2 the precomputed normalization.
__device__ inline float dWdx(const PxReal x, const PxReal kSpiky2, const PxReal kInvRadius)
{
	const PxReal falloff = 1.0f - x * kInvRadius;
	return -kSpiky2 * falloff;
}
|
||||
|
||||
// aerodynamics model in Frozen
//
// Computes the per-vertex velocity delta from wind acting on triangle
// (x0, x1, x2) whose vertices move with (vel0, vel1, vel2).
// n = x01 x x02 is the unnormalized triangle normal (its magnitude equals
// twice the triangle area), flipped to face the relative wind. Returns a
// drag term along the relative wind plus a lift term along the normal,
// scaled by dt * inverseMass.
PX_FORCE_INLINE __device__ PxVec3 disneyWindModelEffect(const PxVec3& x0, const PxVec3& x1, const PxVec3& x2, const PxVec3& vel0, const PxVec3& vel1, const PxVec3& vel2,
	const PxVec3& wind, PxReal inverseMass, PxReal drag, PxReal lift, PxReal dt, PxReal airDensity)
{
	const PxVec3 x01 = x1 - x0;
	const PxVec3 x02 = x2 - x0;
	PxVec3 n = x01.cross(x02);

	// airDensity: 1.225 kg / m3, reference: https://en.wikipedia.org/wiki/Density_of_air
	// Relative wind = wind minus the triangle's average velocity.
	const PxVec3 v = (vel0 + vel1 + vel2) * 0.3333f;
	const PxVec3 vrel = wind - v;

	// Orient the normal towards the relative wind.
	if(vrel.dot(n) < 0.f)
	{
		n *= -1.f;
	}

	// option 1. using current (deformed) triangle area
	// (the factor-of-2 area in |n| is absorbed by using 0.25 instead of 0.5)
	const PxReal coef = 0.25f * airDensity * dt * inverseMass;

	//// option 2. using rest (undeformed) triangle area
	// const PxReal area = femCloth.mTriangleAreas[triIndex];
	// n.normalize();
	// const PxReal coef = 0.5 * shFEMCloth.mAirDensity * area * dt * inverseMass;

	return coef * ((drag - lift) * vrel.dot(n) * vrel + lift * vrel.magnitudeSquared() * n);
}
|
||||
|
||||
// finds the bufferIndex for a given UniqueID.
|
||||
// Upper-bound binary search over the sorted unique-id table; returns the
// original (pre-sort) index of the last entry with id <= uniqueBufferId.
// NOTE(review): if uniqueBufferId is smaller than every table entry, r stays
// 0 and the final read indexes with r - 1 (wraps as PxU32). Callers
// presumably only pass ids that exist in the table — confirm.
static __device__ PxU32 findBufferIndexFromUniqueId(const PxgParticleSystem& particleSystem, PxU32 uniqueBufferId)
{
	const PxU32 length = particleSystem.mCommonData.mNumParticleBuffers;
	// No buffers registered: fall back to index 0.
	if (length == 0)
		return 0;

	const PxU32* values = particleSystem.mParticleBufferSortedUniqueIds;

	// Find the first index r with values[r] > uniqueBufferId.
	PxU32 l = 0, r = length;
	while (l < r)
	{
		PxU32 m = (l + r) / 2;
		if (values[m] > uniqueBufferId)
			r = m;
		else
			l = m + 1;
	}
	return particleSystem.mParticleBufferSortedUniqueIdsOriginalIndex[r - 1];
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
5866
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/particlesystem.cu
vendored
Normal file
5866
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/particlesystem.cu
vendored
Normal file
File diff suppressed because it is too large
Load Diff
875
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/rigidDeltaAccum.cu
vendored
Normal file
875
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/rigidDeltaAccum.cu
vendored
Normal file
@@ -0,0 +1,875 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#include "vector_types.h"
|
||||
#include "foundation/PxVec3.h"
|
||||
#include "cutil_math.h"
|
||||
#include "PxgParticleSystemCoreKernelIndices.h"
|
||||
#include "reduction.cuh"
|
||||
#include "shuffle.cuh"
|
||||
#include "PxgSolverCoreDesc.h"
|
||||
#include "PxNodeIndex.h"
|
||||
#include "assert.h"
|
||||
#include "PxgSimulationCoreDesc.h"
|
||||
#include "PxgArticulationCoreDesc.h"
|
||||
|
||||
using namespace physx;
|
||||
|
||||
// Empty host entry point. NOTE(review): presumably referenced from host code
// so the linker retains this translation unit's kernels — confirm.
extern "C" __host__ void initSimulationControllerKernels2() {}
|
||||
|
||||
/*
 * This kernel takes a *sorted* list of PxNodeIndex (of size *numContacts),
 * and a corresponding deltaV list.
 *
 * deltaV layout: linear parts in [0, numContacts), angular parts in [numContacts, 2*numContacts).
 *
 * deltaV is updated with the cumulative delta values, such that for each last entry of a rigid body deltaV is the total sum
 * for that rigid body - *however* only for the rigid body entries which are processed within one block.
 * blockRigidId, blockDeltaV are updated, such that each entry represents the deltaV sum and rigid body ID of the last
 * occurring rigid body in the corresponding block. This is used in the subsequent kernel to complete the sum
 * for rigid bodies whose entries overlap a block boundary.
 *
 * 0x8fffffffffffffff is used throughout as the "no rigid body" sentinel id.
 */
extern "C" __global__ void accumulateDeltaVRigidFirstLaunch(
	const PxU64* sortedRigidIds,	//input
	const PxU32* numContacts,		//input
	float4* deltaV,					//input/output
	float4* blockDeltaV,			//output: per-block totals (lin, then ang at +ACCUMULATE_DELTA)
	PxU64* blockRigidId				//output: id of the last rigid body seen by each block
)
{
	//Mirror of each thread's rigid id so neighboring lanes can compare during the warp scan.
	__shared__ PxU64 sRigidId[PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + 1];

	//numWarpsPerBlock can't be larger than 32
	const PxU32 numWarpsPerBlock = PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA / WARP_SIZE;

	//Per-warp partial sums (written by the last lane of each warp) and their rigid ids.
	__shared__ float4 sLinWarpAccumulator[WARP_SIZE];
	__shared__ float4 sAngWarpAccumulator[WARP_SIZE];
	__shared__ PxU64 sWarpRigidId[WARP_SIZE];
	//Running per-block accumulator, carried across loop iterations.
	__shared__ float4 sLinBlockAccumulator;
	__shared__ float4 sAngBlockAccumulator;
	__shared__ PxU64 sBlockRigidId;

	const PxU32 tNumContacts = *numContacts;
	const PxU32 nbBlocksRequired = (tNumContacts + blockDim.x - 1) / blockDim.x;
	const PxU32 nbIterationsPerBlock = (nbBlocksRequired + gridDim.x - 1) / gridDim.x;
	const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE - 1);
	const PxU32 warpIndex = threadIdx.x / WARP_SIZE;

	//Zero the block accumulators (threads 0..3 clear one float component each).
	if (threadIdx.x < 4)
	{
		float* tLinBlockAccumulator = reinterpret_cast<float*>(&sLinBlockAccumulator.x);
		tLinBlockAccumulator[threadIdx.x] = 0.f;

		float* tAngBlockAccumulator = reinterpret_cast<float*>(&sAngBlockAccumulator.x);
		tAngBlockAccumulator[threadIdx.x] = 0.f;

		if (threadIdx.x == 0)
			sBlockRigidId = 0x8fffffffffffffff;
	}
	__syncthreads();

	for (PxU32 i = 0; i < nbIterationsPerBlock; ++i)
	{
		const PxU32 workIndex = blockDim.x*(blockIdx.x*nbIterationsPerBlock + i) + threadIdx.x;

		PxU64 rigidId = 0x8fffffffffffffff;
		float4 linDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
		float4 angDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
		if (workIndex < tNumContacts)
		{
			rigidId = sortedRigidIds[workIndex];
			sRigidId[threadIdx.x] = rigidId;
			linDeltaV = deltaV[workIndex];
			angDeltaV = deltaV[workIndex + tNumContacts];
		}
		__syncthreads();

		//Warp-level segmented inclusive scan: only lanes with the same rigid id contribute.
		for (PxU32 reductionRadius = 1; reductionRadius < WARP_SIZE; reductionRadius <<= 1)
		{
			const PxU32 lane = threadIndexInWarp - reductionRadius;

			float4 linVal = shuffle(FULL_MASK, linDeltaV, lane);
			float4 angVal = shuffle(FULL_MASK, angDeltaV, lane);

			//workIndex < tNumContacts guarantees that sRigidId[WARP_SIZE * warpIndex + lane]
			//always points to initialized memory, since lane is always smaller than threadIndexInWarp.
			if (threadIndexInWarp >= reductionRadius && workIndex < tNumContacts &&
				rigidId == sRigidId[WARP_SIZE * warpIndex + lane])
			{
				linDeltaV += linVal;
				angDeltaV += angVal;
			}
		}

		//Last lane of each warp publishes the warp's trailing partial sum.
		if (threadIndexInWarp == (WARP_SIZE - 1))
		{
			sLinWarpAccumulator[warpIndex] = linDeltaV;
			sAngWarpAccumulator[warpIndex] = angDeltaV;
			sWarpRigidId[warpIndex] = rigidId;
		}

		//Snapshot the previous iteration's block accumulator before warp 0 overwrites it.
		const float4 prevLinBlockAccumulator = sLinBlockAccumulator;
		const float4 prevAngBlockAccumulator = sAngBlockAccumulator;
		const PxU64 prevBlockRigidId = sBlockRigidId;

		//Don't allow write until we've finished all reading...
		__syncthreads();

		//Warp 0 performs a segmented scan over the per-warp partials.
		if (warpIndex == 0)
		{
			float4 linDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
			float4 angDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);

			PxU64 warpRigidId = 0x8fffffffffffffff;
			if (threadIndexInWarp < numWarpsPerBlock)
			{
				linDeltaV = sLinWarpAccumulator[threadIndexInWarp];
				angDeltaV = sAngWarpAccumulator[threadIndexInWarp];
				warpRigidId = sWarpRigidId[threadIndexInWarp];
			}

			float4 tLinDeltaV = linDeltaV;
			float4 tAngDeltaV = angDeltaV;

			for (PxU32 reductionRadius = 1; reductionRadius < numWarpsPerBlock; reductionRadius <<= 1)
			{
				const PxU32 lane = threadIndexInWarp - reductionRadius;
				float4 linVal = shuffle(FULL_MASK, tLinDeltaV, lane);
				float4 angVal = shuffle(FULL_MASK, tAngDeltaV, lane);

				if (threadIndexInWarp >= reductionRadius && warpRigidId == sWarpRigidId[lane])
				{
					tLinDeltaV += linVal;
					tAngDeltaV += angVal;
				}
			}

			//Fold this iteration's trailing sum into the running block accumulator.
			if (threadIndexInWarp == (numWarpsPerBlock - 1))
			{
				if (sBlockRigidId != warpRigidId)
				{
					//need to clear block accumulators in case the previous iteration
					//stored a different sBlockRigidId
					sLinBlockAccumulator = make_float4(0.f, 0.f, 0.f, 0.f);
					sAngBlockAccumulator = make_float4(0.f, 0.f, 0.f, 0.f);
				}
				sLinBlockAccumulator += tLinDeltaV;
				sAngBlockAccumulator += tAngDeltaV;
				sBlockRigidId = warpRigidId;
			}

			sLinWarpAccumulator[threadIndexInWarp] = tLinDeltaV;
			sAngWarpAccumulator[threadIndexInWarp] = tAngDeltaV;
		}
		__syncthreads();

		if (workIndex < tNumContacts)
		{
			float4 accumLin = make_float4(0.f, 0.f, 0.f, 0.f);
			float4 accumAng = make_float4(0.f, 0.f, 0.f, 0.f);

			//if this rigid id matches the previous warp's trailing rigid id, carry that warp's
			//accumulated delta into the current entry
			if (warpIndex > 0 && rigidId == sWarpRigidId[warpIndex - 1])
			{
				accumLin = sLinWarpAccumulator[warpIndex - 1];
				accumAng = sAngWarpAccumulator[warpIndex - 1];
			}

			//Carry across loop iterations of the same block as well.
			if (i != 0 && rigidId == prevBlockRigidId)
			{
				accumLin += prevLinBlockAccumulator;
				accumAng += prevAngBlockAccumulator;
			}

			//Now output both offsets...
			deltaV[workIndex] = linDeltaV + accumLin;
			deltaV[workIndex + tNumContacts] = angDeltaV + accumAng;
		}
	}

	//Publish this block's trailing totals for the second-launch kernel.
	if (threadIdx.x == 0)
	{
		blockDeltaV[blockIdx.x] = sLinBlockAccumulator;
		blockDeltaV[blockIdx.x + PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA] = sAngBlockAccumulator;
		blockRigidId[blockIdx.x] = sBlockRigidId;
	}
}
|
||||
|
||||
|
||||
//32 blocks. Each block computes the exclusive running sum for the blockOffset.
//
//Second pass of the segmented sum started by accumulateDeltaVRigidFirstLaunch:
//completes per-rigid totals across block boundaries, then, for the *last* entry of each
//rigid body, applies the accumulated deltaV to the solver body (or stores an impulse
//for articulation links).
extern "C" __global__ void accumulateDeltaVRigidSecondLaunch(
	const PxU64* sortedRigidIds,	//input
	const PxU32* numContacts,		//input
	const float4* deltaV,			//input
	const float4* blockDeltaV,		//input
	const PxU64* blockRigidId,		//input
	PxgPrePrepDesc* prePrepDesc,
	PxgSolverCoreDesc* solverCoreDesc,
	PxgArticulationCoreDesc* artiCoreDesc,
	PxgSolverSharedDesc<IterativeSolveData>* sharedDesc,
	const bool isTGS
)
{
	//Per-block totals from the first launch, turned into a prefix sum below.
	__shared__ float4 sBlockLinDeltaV[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
	__shared__ float4 sBlockAngDeltaV[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
	__shared__ PxU64 sBlockRigidId[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
	//sRigidId[t] holds the rigid id of the *next* sorted entry after the one thread t processes.
	__shared__ PxU64 sRigidId[PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + 1];

	const PxU32 tNumContacts = *numContacts;
	const PxU32 nbBlocksRequired = (tNumContacts + blockDim.x - 1) / blockDim.x;
	const PxU32 nbIterationsPerBlock = (nbBlocksRequired + gridDim.x - 1) / gridDim.x;
	const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE - 1);

	float4 linBlockDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
	float4 angBlockDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);

	PxU64 tBlockRigidId = 0x8fffffffffffffff;
	if (threadIdx.x < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA)
	{
		linBlockDeltaV = blockDeltaV[threadIdx.x];
		angBlockDeltaV = blockDeltaV[threadIdx.x + PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
		tBlockRigidId = blockRigidId[threadIdx.x];

		sBlockLinDeltaV[threadIdx.x] = linBlockDeltaV;
		sBlockAngDeltaV[threadIdx.x] = angBlockDeltaV;
		sBlockRigidId[threadIdx.x] = tBlockRigidId;
	}

	__syncthreads(); //sBlockRigidId is written above and read below

	float4 tLinDeltaV = linBlockDeltaV;
	float4 tAngDeltaV = angBlockDeltaV;
	//add on block deltaV if blockRigid id matches (segmented scan over the block totals)
	for (PxU32 reductionRadius = 1; reductionRadius < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA; reductionRadius <<= 1)
	{
		const PxU32 lane = threadIndexInWarp - reductionRadius;
		float4 linVal = shuffle(FULL_MASK, tLinDeltaV, lane);
		float4 angVal = shuffle(FULL_MASK, tAngDeltaV, lane);

		if (threadIndexInWarp >= reductionRadius && tBlockRigidId == sBlockRigidId[lane])
		{
			tLinDeltaV += linVal;
			tAngDeltaV += angVal;
		}
	}

	__syncthreads(); //sBlockRigidId is read above and written below

	if (threadIdx.x < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA)
	{
		sBlockLinDeltaV[threadIdx.x] = tLinDeltaV;
		sBlockAngDeltaV[threadIdx.x] = tAngDeltaV;
		sBlockRigidId[threadIdx.x] = blockRigidId[threadIdx.x];
	}

	__syncthreads();

	float4* solverBodyDeltaVel = sharedDesc->iterativeData.solverBodyVelPool + solverCoreDesc->accumulatedBodyDeltaVOffset;
	const PxU32 numSolverBodies = solverCoreDesc->numSolverBodies;

	PxgArticulationBlockData* artiData = artiCoreDesc->mArticulationBlocks;
	PxgArticulationBlockLinkData* artiLinkData = artiCoreDesc->mArticulationLinkBlocks;

	const PxU32 maxLinks = artiCoreDesc->mMaxLinksPerArticulation;

	for (PxU32 i = 0; i < nbIterationsPerBlock; ++i)
	{
		__syncthreads(); //sRigidId is read and written in the same loop - read and write must be separated by syncs

		const PxU32 workIndex = blockDim.x * (blockIdx.x * nbIterationsPerBlock + i) + threadIdx.x;

		PxU64 rigidId = 0x8fffffffffffffff;
		if (workIndex < tNumContacts)
		{
			rigidId = sortedRigidIds[workIndex];
			if (threadIdx.x > 0)
				sRigidId[threadIdx.x - 1] = rigidId;

			if (workIndex == tNumContacts - 1)
			{
				sRigidId[threadIdx.x] = 0x8fffffffffffffff;
			}
			else if (threadIdx.x == PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA - 1)
			{
				// last thread in the block must load its successor from global memory
				sRigidId[threadIdx.x] = sortedRigidIds[workIndex + 1];
			}
		}

		__syncthreads();

		if (workIndex < tNumContacts)
		{
			//NOTE(review): accumLin/accumAng are never used in this kernel.
			float4 accumLin = make_float4(0.f, 0.f, 0.f, 0.f);
			float4 accumAng = make_float4(0.f, 0.f, 0.f, 0.f);

			//Only the last entry of each rigid body (next entry has a different id) applies the sum.
			if (rigidId != sRigidId[threadIdx.x])
			{
				float4 linVel = deltaV[workIndex];
				float4 angVel = deltaV[workIndex + tNumContacts];

				PxU64 preBlockRigidId = blockIdx.x > 0 ? sBlockRigidId[blockIdx.x - 1] : 0x8fffffffffffffff;

				//Complete the total with the carry from all preceding blocks.
				if (rigidId == preBlockRigidId)
				{
					linVel += sBlockLinDeltaV[blockIdx.x - 1];
					angVel += sBlockAngDeltaV[blockIdx.x - 1];
				}

				//nodeIndex
				const PxNodeIndex nodeId = reinterpret_cast<PxNodeIndex&>(rigidId);

				PxU32 solverBodyIndex = 0;

				if (!nodeId.isStaticBody())
				{
					PxU32 nodeIndex = nodeId.index();
					solverBodyIndex = prePrepDesc->solverBodyIndices[nodeIndex];

					if (nodeId.isArticulation())
					{
						//solverBodyIndex is the globalThreadIndex for the active articulation in the block format
						const PxU32 blockIndex = solverBodyIndex / WARP_SIZE;

						PxgArticulationBlockData& articulation = artiData[blockIndex];
						PxgArticulationBlockLinkData* artiLinks = &artiLinkData[blockIndex * maxLinks];

						const PxU32 artiIndexInBlock = solverBodyIndex % WARP_SIZE;

						articulation.mStateDirty[artiIndexInBlock] = PxgArtiStateDirtyFlag::eHAS_IMPULSES;

						const PxU32 linkID = nodeId.articulationLinkId();

						//linVel.w carries a contact count/weight used to average the accumulated value.
						const PxReal denom = PxMax(1.0f, linVel.w);
						PxReal ratio = 1.f / denom;
						linVel.w = 0.f;

						//for articulation, linVel and angVel accumulate impulse
						Cm::UnAlignedSpatialVector impulse;
						impulse.top = PxVec3(linVel.x, linVel.y, linVel.z );
						impulse.bottom = PxVec3(angVel.x, angVel.y, angVel.z);

						impulse.top *= ratio;
						impulse.bottom *= ratio;

						storeSpatialVector(artiLinks[linkID].mScratchImpulse, -impulse, artiIndexInBlock);
					}
					else
					{
						float4 linearVelocity = solverBodyDeltaVel[solverBodyIndex];
						float4 angularVelocity = solverBodyDeltaVel[solverBodyIndex + numSolverBodies];

						const PxReal denom = PxMax(1.0f, linVel.w);
						PxReal ratio = 1.f / denom;
						linVel.w = 0.f;

						if (isTGS)
						{
							//TGS packs (lin.xyz, ang.x) in the first float4 and (ang.yz) in the second;
							//the remaining components belong to the delta position buffer.
							linearVelocity.x += linVel.x * ratio;
							linearVelocity.y += linVel.y * ratio;
							linearVelocity.z += linVel.z * ratio;
							linearVelocity.w += angVel.x * ratio;
							angularVelocity.x += angVel.y * ratio;
							angularVelocity.y += angVel.z * ratio;
						}
						else
						{
							linearVelocity += linVel * ratio;
							angularVelocity += angVel * ratio;
						}

						solverBodyDeltaVel[solverBodyIndex] = linearVelocity;
						solverBodyDeltaVel[solverBodyIndex + numSolverBodies] = angularVelocity;
					}
				}
			}
		}
	}
}
|
||||
|
||||
|
||||
//32 blocks. Each block computes the exclusive running sum for the blockOffset.
//
//For the *last* sorted entry of each rigid body, clears the accumulated relaxation
//denominator (articulation link mDeltaScale, or tempDenom for rigid bodies) so the
//subsequent multi-stage accumulation kernels start from a clean state.
extern "C" __global__ void clearDeltaVRigidSecondLaunchMulti(
	PxU64* sortedRigidIds,	//input
	PxU32* numContacts,		//input
	PxgPrePrepDesc* prePrepDesc,
	PxgSolverCoreDesc* solverCoreDesc,
	PxgArticulationCoreDesc* artiCoreDesc,
	PxReal* tempDenom
)
{
	//sRigidId[t] holds the rigid id of the *next* sorted entry after the one thread t processes.
	__shared__ PxU64 sRigidId[PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + 1];

	const PxU32 tNumContacts = *numContacts;
	const PxU32 idx = threadIdx.x;

	const PxU32 totalBlockRequired = (tNumContacts + (PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA - 1)) / PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA;
	const PxU32 numIterationPerBlock = (totalBlockRequired + (PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA - 1)) / PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA;

	PxgArticulationBlockLinkData* artiLinkData = artiCoreDesc->mArticulationLinkBlocks;

	const PxU32 maxLinks = artiCoreDesc->mMaxLinksPerArticulation;

	for (PxU32 i = 0; i < numIterationPerBlock; ++i)
	{
		//sRigidId is read at the end of the previous iteration and written below - read and
		//write must be separated by a barrier (same pattern as accumulateDeltaVRigidSecondLaunch),
		//otherwise a neighboring thread's write can race the read from the prior iteration.
		__syncthreads();

		const PxU32 workIndex = i * PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + idx + numIterationPerBlock * blockIdx.x * blockDim.x;

		PxU64 rigidId = 0x8fffffffffffffff; //sentinel: no rigid body
		if (workIndex < tNumContacts)
		{
			rigidId = sortedRigidIds[workIndex];
			if (idx > 0)
				sRigidId[idx - 1] = rigidId;

			if (workIndex == tNumContacts - 1)
			{
				sRigidId[idx] = 0x8fffffffffffffff;
			}
			else if (threadIdx.x == PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA - 1)
			{
				// last thread in the block must load its successor from global memory
				sRigidId[idx] = sortedRigidIds[workIndex + 1];
			}
		}

		__syncthreads();

		if (workIndex < tNumContacts)
		{
			//Only the last entry of each rigid body (next entry has a different id) clears.
			if (rigidId != sRigidId[idx])
			{
				//nodeIndex
				const PxNodeIndex nodeId = reinterpret_cast<PxNodeIndex&>(rigidId);

				PxU32 solverBodyIndex = 0;

				if (!nodeId.isStaticBody())
				{
					PxU32 nodeIndex = nodeId.index();
					solverBodyIndex = prePrepDesc->solverBodyIndices[nodeIndex];

					if (nodeId.isArticulation())
					{
						//solverBodyIndex is the globalThreadIndex for the active articulation in the block format
						const PxU32 blockIndex = solverBodyIndex / WARP_SIZE;

						PxgArticulationBlockLinkData* artiLinks = &artiLinkData[blockIndex * maxLinks];

						const PxU32 artiIndexInBlock = solverBodyIndex % WARP_SIZE;

						const PxU32 linkID = nodeId.articulationLinkId();

						artiLinks[linkID].mDeltaScale[artiIndexInBlock] = 0.f;
					}
					else
					{
						tempDenom[solverBodyIndex] = 0.f;
					}
				}
			}
		}
	}
}
|
||||
|
||||
|
||||
|
||||
//32 blocks. Each block computes the exclusive running sum for the blockOffset.
//
//Stage 1 of the multi-body accumulation: for the *last* sorted entry of each rigid body,
//atomically accumulates a relaxation denominator (globalRelaxationCoefficient, optionally
//max'd with the per-body weight carried in deltaV[...].w when useLocalRelax is set) into
//the articulation link's mDeltaScale or into tempDenom for rigid bodies. Stage 2 then
//divides the accumulated impulses by these denominators.
extern "C" __global__ void accumulateDeltaVRigidSecondLaunchMultiStage1(
	PxU64* sortedRigidIds,	//input
	PxU32* numContacts,		//input
	float4* deltaV,			//input
	float4* blockDeltaV,	//input
	PxU64* blockRigidId,	//input
	PxgPrePrepDesc* prePrepDesc,
	PxgSolverCoreDesc* solverCoreDesc,
	PxgArticulationCoreDesc* artiCoreDesc,
	PxgSolverSharedDesc<IterativeSolveData>* sharedDesc,
	PxReal* tempDenom,
	const bool useLocalRelax,
	const float globalRelaxationCoefficient,
	bool isTGS
)
{
	//Only the .w component of the per-block totals is needed in this stage.
	__shared__ PxReal sBlockLinDeltaVW[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
	__shared__ PxU64 sBlockRigidId[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
	//sRigidId[t] holds the rigid id of the *next* sorted entry after the one thread t processes.
	__shared__ PxU64 sRigidId[PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + 1];

	const PxU32 tNumContacts = *numContacts;
	const PxU32 idx = threadIdx.x;

	const PxU32 threadIndexInWarp = idx & (WARP_SIZE - 1);

	const PxU32 totalBlockRequired = (tNumContacts + (PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA - 1)) / PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA;

	float4 linBlockDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);

	PxU64 tBlockRigidId = 0x8fffffffffffffff; //sentinel: no rigid body
	if (idx < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA)
	{
		linBlockDeltaV = blockDeltaV[idx];
		tBlockRigidId = blockRigidId[idx];

		sBlockLinDeltaVW[idx] = linBlockDeltaV.w;
		sBlockRigidId[idx] = tBlockRigidId;
	}

	__syncthreads(); //sBlockRigidId is written above and read below

	PxReal tLinDeltaVW = linBlockDeltaV.w;
	//add on block deltaV.w if the blockRigid id matches (segmented scan over block totals)
	for (PxU32 reductionRadius = 1; reductionRadius < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA; reductionRadius <<= 1)
	{
		const PxU32 lane = threadIndexInWarp - reductionRadius;
		PxReal w = __shfl_sync(FULL_MASK, tLinDeltaVW, lane);

		if (threadIndexInWarp >= reductionRadius && tBlockRigidId == sBlockRigidId[lane])
		{
			tLinDeltaVW += w;
		}
	}

	__syncthreads(); //sBlockRigidId is read above and written below

	if (idx < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA)
	{
		sBlockLinDeltaVW[idx] = tLinDeltaVW;
		sBlockRigidId[idx] = blockRigidId[idx];
	}

	const PxU32 numIterationPerBlock = (totalBlockRequired + (PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA - 1)) / PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA;

	__syncthreads();

	PxgArticulationBlockLinkData* artiLinkData = artiCoreDesc->mArticulationLinkBlocks;

	const PxU32 maxLinks = artiCoreDesc->mMaxLinksPerArticulation;

	for (PxU32 i = 0; i < numIterationPerBlock; ++i)
	{
		//sRigidId is read at the end of the previous iteration and written below - read and
		//write must be separated by a barrier (same pattern as accumulateDeltaVRigidSecondLaunch),
		//otherwise a neighboring thread's write can race the read from the prior iteration.
		__syncthreads();

		const PxU32 workIndex = i * PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + idx + numIterationPerBlock * blockIdx.x * blockDim.x;

		PxU64 rigidId = 0x8fffffffffffffff;
		if (workIndex < tNumContacts)
		{
			rigidId = sortedRigidIds[workIndex];
			if (idx > 0)
				sRigidId[idx - 1] = rigidId;

			if (workIndex == tNumContacts - 1)
			{
				sRigidId[idx] = 0x8fffffffffffffff;
			}
			else if (threadIdx.x == PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA - 1)
			{
				// last thread in the block must load its successor from global memory
				sRigidId[idx] = sortedRigidIds[workIndex + 1];
			}
		}

		__syncthreads();

		if (workIndex < tNumContacts)
		{
			//Only the last entry of each rigid body (next entry has a different id) contributes.
			if (rigidId != sRigidId[idx])
			{
				PxReal linVelW = deltaV[workIndex].w;

				PxU64 preBlockRigidId = blockIdx.x > 0 ? sBlockRigidId[blockIdx.x - 1] : 0x8fffffffffffffff;

				//Complete the total with the carry from all preceding blocks.
				if (rigidId == preBlockRigidId)
				{
					linVelW += sBlockLinDeltaVW[blockIdx.x - 1];
				}

				//nodeIndex
				const PxNodeIndex nodeId = reinterpret_cast<PxNodeIndex&>(rigidId);

				PxU32 solverBodyIndex = 0;

				if (!nodeId.isStaticBody())
				{
					PxU32 nodeIndex = nodeId.index();
					solverBodyIndex = prePrepDesc->solverBodyIndices[nodeIndex];

					PxReal denom = globalRelaxationCoefficient;

					if (useLocalRelax)
						denom = PxMax(denom, linVelW);

					if (nodeId.isArticulation())
					{
						//solverBodyIndex is the globalThreadIndex for the active articulation in the block format
						const PxU32 blockIndex = solverBodyIndex / WARP_SIZE;

						PxgArticulationBlockLinkData* artiLinks = &artiLinkData[blockIndex * maxLinks];

						const PxU32 artiIndexInBlock = solverBodyIndex % WARP_SIZE;

						const PxU32 linkID = nodeId.articulationLinkId();

						atomicAdd(&artiLinks[linkID].mDeltaScale[artiIndexInBlock], denom);
					}
					else
					{
						atomicAdd(&tempDenom[solverBodyIndex], denom);
					}
				}
			}
		}
	}
}
|
||||
|
||||
//32 blocks. Each block computes the exclusive running sum for the blockOffset.
//
//Stage 2 of the multi-body accumulation: completes the per-rigid deltaV totals across
//block boundaries and, for the *last* entry of each rigid body, atomically applies the
//total - scaled by 1/denom computed in stage 1 (mDeltaScale / tempDenom) - to the solver
//body delta velocities, or as a negated impulse to the articulation link scratch buffer.
extern "C" __global__ void accumulateDeltaVRigidSecondLaunchMultiStage2(
	PxU64* sortedRigidIds,	//input
	PxU32* numContacts,		//input
	float4* deltaV,			//input
	float4* blockDeltaV,	//input
	PxU64* blockRigidId,	//input
	PxgPrePrepDesc* prePrepDesc,
	PxgSolverCoreDesc* solverCoreDesc,
	PxgArticulationCoreDesc* artiCoreDesc,
	PxgSolverSharedDesc<IterativeSolveData>* sharedDesc,
	PxReal* tempDenom,
	const bool useLocalRelax,
	const float globalRelaxationCoefficient,
	bool isTGS
)
{
	//Per-block totals from the first launch, turned into a prefix sum below.
	__shared__ float4 sBlockLinDeltaV[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
	__shared__ float4 sBlockAngDeltaV[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
	__shared__ PxU64 sBlockRigidId[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
	//sRigidId[t] holds the rigid id of the *next* sorted entry after the one thread t processes.
	__shared__ PxU64 sRigidId[PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + 1];

	const PxU32 tNumContacts = *numContacts;
	const PxU32 idx = threadIdx.x;

	const PxU32 threadIndexInWarp = idx & (WARP_SIZE - 1);

	const PxU32 totalBlockRequired = (tNumContacts + (PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA - 1)) / PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA;

	float4 linBlockDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
	float4 angBlockDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);

	PxU64 tBlockRigidId = 0x8fffffffffffffff; //sentinel: no rigid body
	if (idx < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA)
	{
		linBlockDeltaV = blockDeltaV[idx];
		angBlockDeltaV = blockDeltaV[idx + PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
		tBlockRigidId = blockRigidId[idx];

		sBlockLinDeltaV[idx] = linBlockDeltaV;
		sBlockAngDeltaV[idx] = angBlockDeltaV;
		sBlockRigidId[idx] = tBlockRigidId;
	}

	__syncthreads(); //sBlockRigidId is written above and read below

	float4 tLinDeltaV = linBlockDeltaV;
	float4 tAngDeltaV = angBlockDeltaV;
	//add on block deltaV if the blockRigid id matches (segmented scan over block totals)
	for (PxU32 reductionRadius = 1; reductionRadius < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA; reductionRadius <<= 1)
	{
		const PxU32 lane = threadIndexInWarp - reductionRadius;
		float4 linVal = shuffle(FULL_MASK, tLinDeltaV, lane);
		float4 angVal = shuffle(FULL_MASK, tAngDeltaV, lane);

		if (threadIndexInWarp >= reductionRadius && tBlockRigidId == sBlockRigidId[lane])
		{
			tLinDeltaV += linVal;
			tAngDeltaV += angVal;
		}
	}

	__syncthreads(); //sBlockRigidId is read above and written below

	if (idx < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA)
	{
		sBlockLinDeltaV[idx] = tLinDeltaV;
		sBlockAngDeltaV[idx] = tAngDeltaV;
		sBlockRigidId[idx] = blockRigidId[idx];
	}

	const PxU32 numIterationPerBlock = (totalBlockRequired + (PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA - 1)) / PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA;

	__syncthreads();

	float4* solverBodyDeltaVel = sharedDesc->iterativeData.solverBodyVelPool + solverCoreDesc->accumulatedBodyDeltaVOffset;
	const PxU32 numSolverBodies = solverCoreDesc->numSolverBodies;

	PxgArticulationBlockData* artiData = artiCoreDesc->mArticulationBlocks;
	PxgArticulationBlockLinkData* artiLinkData = artiCoreDesc->mArticulationLinkBlocks;

	const PxU32 maxLinks = artiCoreDesc->mMaxLinksPerArticulation;

	for (PxU32 i = 0; i < numIterationPerBlock; ++i)
	{
		//sRigidId is read at the end of the previous iteration and written below - read and
		//write must be separated by a barrier (same pattern as accumulateDeltaVRigidSecondLaunch),
		//otherwise a neighboring thread's write can race the read from the prior iteration.
		__syncthreads();

		const PxU32 workIndex = i * PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + idx + numIterationPerBlock * blockIdx.x * blockDim.x;

		PxU64 rigidId = 0x8fffffffffffffff;
		if (workIndex < tNumContacts)
		{
			rigidId = sortedRigidIds[workIndex];
			if (idx > 0)
				sRigidId[idx - 1] = rigidId;

			if (workIndex == tNumContacts - 1)
			{
				sRigidId[idx] = 0x8fffffffffffffff;
			}
			else if (threadIdx.x == PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA - 1)
			{
				// last thread in the block must load its successor from global memory
				sRigidId[idx] = sortedRigidIds[workIndex + 1];
			}
		}

		__syncthreads();

		if (workIndex < tNumContacts)
		{
			//Only the last entry of each rigid body (next entry has a different id) applies the sum.
			if (rigidId != sRigidId[idx])
			{
				float4 linVel = deltaV[workIndex];
				float4 angVel = deltaV[workIndex + tNumContacts];

				PxU64 preBlockRigidId = blockIdx.x > 0 ? sBlockRigidId[blockIdx.x - 1] : 0x8fffffffffffffff;

				//Complete the total with the carry from all preceding blocks.
				if (rigidId == preBlockRigidId)
				{
					linVel += sBlockLinDeltaV[blockIdx.x - 1];
					angVel += sBlockAngDeltaV[blockIdx.x - 1];
				}

				//nodeIndex
				const PxNodeIndex nodeId = reinterpret_cast<PxNodeIndex&>(rigidId);

				PxU32 solverBodyIndex = 0;

				if (!nodeId.isStaticBody())
				{
					PxU32 nodeIndex = nodeId.index();
					solverBodyIndex = prePrepDesc->solverBodyIndices[nodeIndex];

					if (nodeId.isArticulation())
					{
						//solverBodyIndex is the globalThreadIndex for the active articulation in the block format
						const PxU32 blockIndex = solverBodyIndex / WARP_SIZE;

						PxgArticulationBlockData& articulation = artiData[blockIndex];
						PxgArticulationBlockLinkData* artiLinks = &artiLinkData[blockIndex * maxLinks];

						const PxU32 artiIndexInBlock = solverBodyIndex % WARP_SIZE;

						articulation.mStateDirty[artiIndexInBlock] = PxgArtiStateDirtyFlag::eHAS_IMPULSES;

						const PxU32 linkID = nodeId.articulationLinkId();

						//Denominator accumulated by stage 1.
						PxReal denom = artiLinks[linkID].mDeltaScale[artiIndexInBlock];
						PxReal ratio = 1.f / denom;

						//for articulation, linVel and angVel accumulate impulse
						Cm::UnAlignedSpatialVector impulse;
						impulse.top = PxVec3(linVel.x, linVel.y, linVel.z);
						impulse.bottom = PxVec3(angVel.x, angVel.y, angVel.z);

						impulse.top *= ratio;
						impulse.bottom *= ratio;

						atomicAddSpatialVector(artiLinks[linkID].mScratchImpulse, -impulse, artiIndexInBlock);
					}
					else
					{
						//Denominator accumulated by stage 1.
						PxReal denom = tempDenom[solverBodyIndex];

						PxReal ratio = 1.f / denom;

						if (isTGS)
						{
							//TGS packs (lin.xyz, ang.x) in the first float4 and (ang.yz) in the second;
							//the remaining components belong to the delta position buffer.
							atomicAdd(&solverBodyDeltaVel[solverBodyIndex].x, linVel.x*ratio);
							atomicAdd(&solverBodyDeltaVel[solverBodyIndex].y, linVel.y*ratio);
							atomicAdd(&solverBodyDeltaVel[solverBodyIndex].z, linVel.z*ratio);
							atomicAdd(&solverBodyDeltaVel[solverBodyIndex].w, angVel.x*ratio);
							atomicAdd(&solverBodyDeltaVel[solverBodyIndex + numSolverBodies].x, angVel.y*ratio);
							atomicAdd(&solverBodyDeltaVel[solverBodyIndex + numSolverBodies].y, angVel.z*ratio);
						}
						else
						{
							atomicAdd(&solverBodyDeltaVel[solverBodyIndex].x, linVel.x*ratio);
							atomicAdd(&solverBodyDeltaVel[solverBodyIndex].y, linVel.y*ratio);
							atomicAdd(&solverBodyDeltaVel[solverBodyIndex].z, linVel.z*ratio);
							atomicAdd(&solverBodyDeltaVel[solverBodyIndex + numSolverBodies].x, angVel.x*ratio);
							atomicAdd(&solverBodyDeltaVel[solverBodyIndex + numSolverBodies].y, angVel.y*ratio);
							atomicAdd(&solverBodyDeltaVel[solverBodyIndex + numSolverBodies].z, angVel.z*ratio);
						}
					}
				}
			}
		}
	}
}
|
||||
|
||||
2128
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/softBody.cu
vendored
Normal file
2128
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/softBody.cu
vendored
Normal file
File diff suppressed because it is too large
Load Diff
150
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/softBody.cuh
vendored
Normal file
150
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/softBody.cuh
vendored
Normal file
@@ -0,0 +1,150 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#ifndef __SOFT_BODY_CUH__
|
||||
#define __SOFT_BODY_CUH__
|
||||
|
||||
#include "foundation/PxVecMath.h"
|
||||
#include "atomic.cuh"
|
||||
|
||||
/**
TODO, remove. Already has been removed from softbody/softbody, softbody/femcloth and softbody/particle attachments.
*/
__device__ inline PxReal getSoftBodyInvMass(const PxReal baryMass, const float4& bary)
{
	// Scale the barycentrically interpolated mass by the Euclidean length of the
	// barycentric weight vector.
	const PxReal weightLength = PxSqrt(bary.x * bary.x + bary.y * bary.y + bary.z * bary.z + bary.w * bary.w);
	return baryMass * weightLength;
}
|
||||
|
||||
// Computes the deformation gradient F = RTranspose * (P * Qinv), where P is the
// matrix of the three displacement vectors u1..u3 and Qinv the inverse rest-shape
// matrix. Pre-multiplying by RTranspose removes the rotational part so the result
// is expressed in element space.
static __device__ inline PxMat33 calculateDeformationGradient(
	const PxVec3& u1,
	const PxVec3& u2,
	const PxVec3& u3,
	const PxMat33& Qinv,
	const PxMat33& RTranspose)
{
	const PxMat33 displacement(u1, u2, u3);
	// Same association as before: (P * Qinv) first, then rotate into element space.
	return RTranspose * (displacement * Qinv);
}
|
||||
|
||||
// Barycentrically interpolates a velocity inside a tetrahedron. The w components
// of the four vertex entries (their inverse masses, per the v.w convention used
// here) are packed into 'invMass' for the caller.
static __device__ float4 computeTetraContact(const float4* const vels, const uint4& tetrahedronId,
	const float4& barycentric, float4& invMass)
{
	const float4 vA = vels[tetrahedronId.x];
	const float4 vB = vels[tetrahedronId.y];
	const float4 vC = vels[tetrahedronId.z];
	const float4 vD = vels[tetrahedronId.w];

	// Hand the per-vertex inverse masses back to the caller.
	invMass = make_float4(vA.w, vB.w, vC.w, vD.w);

	// Weighted sum of the four vertex velocities.
	return vA * barycentric.x + vB * barycentric.y + vC * barycentric.z + vD * barycentric.w;
}
|
||||
|
||||
// Scatters a position correction to the four vertices of a tetrahedron, scaling
// per vertex by (inverse mass * barycentric weight). Vertices with zero inverse
// mass (kinematic) or negligible barycentric weight are skipped.
// NOTE(review): 'addition' is accumulated into the fourth component by AtomicAdd —
// presumably a constraint weight/count used for later averaging; confirm against
// atomic.cuh.
static __device__ void updateTetraPosDelta(const float4& invMasses, const float4& barycentric, const uint4& tetrahedronId,
	const PxVec3& deltaPos, float4* outputDeltaPoses, const PxReal addition = 1.f)
{
	if (invMasses.x > 0.f && PxAbs(barycentric.x) > 1e-6f)
		AtomicAdd(outputDeltaPoses[tetrahedronId.x], deltaPos * (invMasses.x * barycentric.x), addition);

	if (invMasses.y > 0.f && PxAbs(barycentric.y) > 1e-6f)
		AtomicAdd(outputDeltaPoses[tetrahedronId.y], deltaPos * (invMasses.y * barycentric.y), addition);

	if (invMasses.z > 0.f && PxAbs(barycentric.z) > 1e-6f)
		AtomicAdd(outputDeltaPoses[tetrahedronId.z], deltaPos * (invMasses.z * barycentric.z), addition);

	if (invMasses.w > 0.f && PxAbs(barycentric.w) > 1e-6f)
		AtomicAdd(outputDeltaPoses[tetrahedronId.w], deltaPos * (invMasses.w * barycentric.w), addition);
}
|
||||
|
||||
// Scatters a position correction to the four vertices of a tetrahedron.
// invMassBary holds the per-vertex (inverse mass * barycentric) products; the
// barycentric coordinates are assumed to have been clamped on construction, so a
// single > 0 test rejects both kinematic vertices and zero weights.
static __device__ void updateTetPositionDelta(float4* outputDeltaPositions, const uint4& tetVertIndices,
	const PxVec3& deltaPosition, const float4& invMassBary, const PxReal constraintWeight)
{
	if (invMassBary.x > 0.0f)
		AtomicAdd(outputDeltaPositions[tetVertIndices.x], deltaPosition * invMassBary.x, constraintWeight);

	if (invMassBary.y > 0.0f)
		AtomicAdd(outputDeltaPositions[tetVertIndices.y], deltaPosition * invMassBary.y, constraintWeight);

	if (invMassBary.z > 0.0f)
		AtomicAdd(outputDeltaPositions[tetVertIndices.z], deltaPosition * invMassBary.z, constraintWeight);

	if (invMassBary.w > 0.0f)
		AtomicAdd(outputDeltaPositions[tetVertIndices.w], deltaPosition * invMassBary.w, constraintWeight);
}
|
||||
|
||||
// Triangle counterpart of updateTetPositionDelta: scatters a position correction
// to the three vertices referenced by triVertIndices (the w lane of the uint4 is
// unused). invMassBary holds per-vertex (inverse mass * barycentric) products,
// assumed clamped on construction so > 0 suffices as the participation test.
static __device__ void updateTriPositionDelta(float4* outputDeltaPositions, const uint4& triVertIndices,
	const PxVec3& deltaPosition, const float4& invMassBary, const PxReal constraintWeight)
{
	if (invMassBary.x > 0.0f)
		AtomicAdd(outputDeltaPositions[triVertIndices.x], deltaPosition * invMassBary.x, constraintWeight);

	if (invMassBary.y > 0.0f)
		AtomicAdd(outputDeltaPositions[triVertIndices.y], deltaPosition * invMassBary.y, constraintWeight);

	if (invMassBary.z > 0.0f)
		AtomicAdd(outputDeltaPositions[triVertIndices.z], deltaPosition * invMassBary.z, constraintWeight);
}
|
||||
|
||||
#endif
|
||||
3272
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/softBodyGM.cu
vendored
Normal file
3272
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/softBodyGM.cu
vendored
Normal file
File diff suppressed because it is too large
Load Diff
373
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/sparseGridStandalone.cu
vendored
Normal file
373
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/sparseGridStandalone.cu
vendored
Normal file
@@ -0,0 +1,373 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#include "vector_types.h"
|
||||
#include "foundation/PxVec3.h"
|
||||
#include "foundation/PxVec4.h"
|
||||
#include "stdio.h"
|
||||
#include "assert.h"
|
||||
#include "cuda.h"
|
||||
|
||||
#include "sparseGridStandalone.cuh"
|
||||
|
||||
#define ENABLE_KERNEL_LAUNCH_ERROR_CHECK 0
|
||||
|
||||
#define NEW_SUBGRID 0xfffffffe
|
||||
#define REUSED_SUBGRID 0xfffffffd
|
||||
|
||||
// Empty host entry point — presumably referenced from host code to force this
// translation unit's kernels to be compiled/linked (TODO(review): confirm against
// the module loader).
extern "C" __host__ void initSparseGridStandaloneKernels0() {}
|
||||
|
||||
// One thread per particle: computes the hash of the subgrid each particle falls
// into and initializes the particle index array for the subsequent sort.
// Particles whose phase does not match validPhaseMask receive EMPTY_SUBGRID so
// they sort to the end.
extern "C" __global__ void sg_SparseGridCalcSubgridHashes(
	PxSparseGridParams sparseGridParams,
	PxU32* PX_RESTRICT indices,
	PxU32* PX_RESTRICT hashkeyPerParticle,
	const PxVec4* const PX_RESTRICT positions,
	const PxU32 numParticles,
	const PxU32* PX_RESTRICT phases,
	const PxU32 validPhaseMask,
	const PxU32* PX_RESTRICT activeIndices)
{
	PxU32 particleIndex = threadIdx.x + blockIdx.x * blockDim.x;
	if (particleIndex >= numParticles)
		return;

	// Optional indirection when only a subset of particles is simulated.
	if (activeIndices)
		particleIndex = activeIndices[particleIndex];

	const PxVec3 subgridDomainSize = getSubgridDomainSize(sparseGridParams, 0/*sparseGridParams.haloSize*/);
	const int3 subgridId = calcSubgridId(positions[particleIndex].getXYZ(), subgridDomainSize);

	const bool phaseMatches = phases == NULL || (phases[particleIndex] & validPhaseMask);

	indices[particleIndex] = particleIndex;
	hashkeyPerParticle[particleIndex] = phaseMatches ? calcSubgridHash(subgridId) : EMPTY_SUBGRID;
}
|
||||
|
||||
// Sets to 1 every entry of 'mask' whose hash in uniqueSortedHashkey equals
// 'hashkey'. Duplicate hashes are adjacent in the sorted array, so after the
// binary search the run is swept in both directions from the hit.
__device__ void applyMask(PxU32* mask, const PxU32* PX_RESTRICT uniqueSortedHashkey, PxU32 hashkey, PxU32 maxNumSubgrids)
{
	if (hashkey == EMPTY_SUBGRID)
		return;

	const PxU32 tableSize = 27 * maxNumSubgrids;
	PxU32 sortedIdx = 0;
	if (!tryFindHashkey(uniqueSortedHashkey, tableSize, hashkey, sortedIdx))
		return;

	if (mask[sortedIdx] == 1)
		return; // Another thread already marked this run.

	mask[sortedIdx] = 1;

	// Sweep duplicates to the left of the hit...
	for (int left = int(sortedIdx) - 1; left >= 0 && uniqueSortedHashkey[left] == hashkey; --left)
		mask[left] = 1;

	// ...and to the right.
	for (int right = int(sortedIdx) + 1; right < int(tableSize) && uniqueSortedHashkey[right] == hashkey; ++right)
		mask[right] = 1;
}
|
||||
|
||||
// One thread per particle. Determines which subgrids adjacent to the particle's
// own subgrid must be allocated so that the particle can transfer values across
// subgrid borders, and marks them in requiredNeighborMask (one flag per entry of
// uniqueSortedHashkey, duplicates included — see applyMask).
// Launch: 1D grid covering numParticles threads.
extern "C" __global__ void sg_SparseGridMarkRequiredNeighbors(
	PxU32* requiredNeighborMask,
	const PxU32* PX_RESTRICT uniqueSortedHashkey,
	const PxSparseGridParams sparseGridParams,
	PxU32 neighborhoodSize,
	const PxVec4* particlePositions,
	const PxU32 numParticles,
	const PxU32* PX_RESTRICT phases,
	const PxU32 validPhaseMask,
	const PxU32* PX_RESTRICT activeIndices)
{
	PxU32 i = threadIdx.x + blockIdx.x * blockDim.x;
	if (i >= numParticles)
		return;

	// Optional indirection when only a subset of particles is active.
	if (activeIndices)
		i = activeIndices[i];

	if (phases && !(phases[i] & validPhaseMask))
		return; //Avoid to allocate sparse grids in regions of non-fluid particles

	const PxVec3 xp = particlePositions[i].getXYZ();

	const PxU32 haloSize = 0; // sparseGridParams.haloSize;
	const PxVec3 subgridDomainSize = getSubgridDomainSize(sparseGridParams, haloSize);
	const int3 subgridId = calcSubgridId(xp, subgridDomainSize); //subgridIdsPerParticle[i]; // flipSubgridHashToId(hashkey);
	const PxReal dx = sparseGridParams.gridSpacing;
	const PxReal invDx = 1.0f / dx;

	// World-space origin of the particle's subgrid.
	const PxVec3 subgridOrigin = PxVec3(
		subgridId.x * dx * (sparseGridParams.subgridSizeX - 2 * haloSize),
		subgridId.y * dx * (sparseGridParams.subgridSizeY - 2 * haloSize),
		subgridId.z * dx * (sparseGridParams.subgridSizeZ - 2 * haloSize));
	const PxVec3 localXp = xp - subgridOrigin;

	// Cell coordinate of the particle inside its subgrid, clamped to the valid range.
	int3 gridBaseCoord;
	gridBaseCoord.x = PxClamp(int(floor(localXp.x * invDx)), 0, int(int(sparseGridParams.subgridSizeX) - 2 * haloSize - 1));
	gridBaseCoord.y = PxClamp(int(floor(localXp.y * invDx)), 0, int(int(sparseGridParams.subgridSizeY) - 2 * haloSize - 1));
	gridBaseCoord.z = PxClamp(int(floor(localXp.z * invDx)), 0, int(int(sparseGridParams.subgridSizeZ) - 2 * haloSize - 1));

	//Find the neighboring subgrids (step has values -1/0/1 for x/y/z) that need to exist
	int3 step;
	step.x = gridBaseCoord.x < neighborhoodSize ? -1 : (gridBaseCoord.x >= sparseGridParams.subgridSizeX - 2 * haloSize - neighborhoodSize ? 1 : 0);
	step.y = gridBaseCoord.y < neighborhoodSize ? -1 : (gridBaseCoord.y >= sparseGridParams.subgridSizeY - 2 * haloSize - neighborhoodSize ? 1 : 0);
	step.z = gridBaseCoord.z < neighborhoodSize ? -1 : (gridBaseCoord.z >= sparseGridParams.subgridSizeZ - 2 * haloSize - neighborhoodSize ? 1 : 0);

	//Mark the neighbor subgrids that need to exist such that particles with a radius >0 near the subgrid boundary can transfer their density to the grid
	// At most 8 subgrids are affected: the particle's own, plus the face, edge and
	// corner neighbors in the direction(s) the particle is close to.
	PxU32 buffer[8];
	int indexer = 0;

	buffer[indexer++] = calcSubgridHash(subgridId);

	if (step.x != 0 && step.y != 0 && step.z != 0) buffer[indexer++] = subgridHashOffset(subgridId, step.x, step.y, step.z);

	if (step.x != 0 && step.y != 0) buffer[indexer++] = subgridHashOffset(subgridId, step.x, step.y, 0);
	if (step.x != 0 && step.z != 0) buffer[indexer++] = subgridHashOffset(subgridId, step.x, 0, step.z);
	if (step.y != 0 && step.z != 0) buffer[indexer++] = subgridHashOffset(subgridId, 0, step.y, step.z);

	if (step.x != 0) buffer[indexer++] = subgridHashOffset(subgridId, step.x, 0, 0);
	if (step.y != 0) buffer[indexer++] = subgridHashOffset(subgridId, 0, step.y, 0);
	if (step.z != 0) buffer[indexer++] = subgridHashOffset(subgridId, 0, 0, step.z);

	for (int j = 0; j < indexer; ++j)
		applyMask(requiredNeighborMask, uniqueSortedHashkey, buffer[j], sparseGridParams.maxNumSubgrids);
}
|
||||
|
||||
// Converts a sorted array into a delta stream: the last element of every run of
// equal values emits 1 (or the corresponding mask value), every other element
// emits 0. A prefix sum over 'out' then yields compact unique indices.
extern "C" __global__ void sg_SparseGridSortedArrayToDelta(
	const PxU32* in,
	const PxU32* mask,
	PxU32* out,
	PxU32 n)
{
	const PxU32 i = threadIdx.x + blockIdx.x * blockDim.x;
	if (i >= n)
		return;

	// The final element always closes a run; otherwise compare with the successor.
	const bool lastOfRun = (i == n - 1) || (in[i] != in[i + 1]);
	out[i] = lastOfRun ? (mask ? mask[i] : 1) : 0;
}
|
||||
|
||||
// For each run of equal 'indices' (the prefix-sum result identifying unique
// subgrids), the thread handling the last element of the run writes the subgrid
// hash into uniqueValues[indices[i]]. When subgridNeighborCollector is provided,
// it additionally emits the hashes of the subgrid's full 3x3x3 neighborhood
// (27 entries per unique subgrid).
// Fix: the original nested neighborhood loops reused 'i' as loop variable,
// shadowing the thread index 'i' declared above — harmless here but a latent-bug
// trap; the offsets are now dx/dy/dz.
extern "C" __global__ void sg_SparseGridGetUniqueValues(
	const PxU32* sortedData,
	const PxU32* indices,
	PxU32* uniqueValues,
	const PxU32 n,
	PxU32* subgridNeighborCollector,
	const PxU32 uniqueValuesSize)
{
	const PxU32 i = threadIdx.x + blockIdx.x * blockDim.x;
	if (i < n)
	{
		// Only the last thread of each run writes, so every unique value is
		// stored exactly once.
		if (i == n - 1 || indices[i] != indices[i + 1])
		{
			if (indices[i] < uniqueValuesSize)
			{
				uniqueValues[indices[i]] = sortedData[i];

				if (subgridNeighborCollector)
				{
					const int4 id = subgridHashToId(sortedData[i]);
					int indexer = 27 * indices[i];
					for (int dx = -1; dx <= 1; ++dx) for (int dy = -1; dy <= 1; ++dy) for (int dz = -1; dz <= 1; ++dz)
						subgridNeighborCollector[indexer++] = calcSubgridHash(make_int3(id.x + dx, id.y + dy, id.z + dz));
				}
			}
		}
	}
}
|
||||
|
||||
// Fills the density values of all cells belonging to currently active subgrids
// with clearValue. One thread per cell; numActiveSubgrids is read from device
// memory so the launch may over-provision threads.
extern "C" __global__ void sg_SparseGridClearDensity(
	PxReal* PX_RESTRICT density,
	const PxReal clearValue,
	const PxU32* numActiveSubgrids,
	const PxU32 subgridSize
)
{
	const PxU32 cellIndex = blockIdx.x * blockDim.x + threadIdx.x;
	const PxU32 numActiveCells = numActiveSubgrids[0] * subgridSize;
	if (cellIndex < numActiveCells)
		density[cellIndex] = clearValue;
}
|
||||
|
||||
// One thread per subgrid slot: fills the 27-entry neighbor table of subgrid si
// with the compact (sorted) indices of its 3x3x3 neighborhood, or EMPTY_SUBGRID
// where the neighbor is not allocated or its id leaves the representable range.
// NOTE(review): threads with si >= numActiveSubgrids[0] read uniqueSortedHashkey[si]
// beyond the active count — presumably the buffer is sized and initialized for
// maxNumSubgrids entries; confirm.
extern "C" __global__ void sg_SparseGridBuildSubgridNeighbors(
	const PxU32* PX_RESTRICT uniqueSortedHashkey,
	const PxU32* PX_RESTRICT numActiveSubgrids,
	const PxU32 maxNumSubgrids,
	PxU32* PX_RESTRICT subgridNeighbors
)
{
	const PxU32 si = blockIdx.x * blockDim.x + threadIdx.x;

	if (si >= maxNumSubgrids)
		return;

	const PxU32 hash = uniqueSortedHashkey[si];

	int4 sID = subgridHashToId(hash);

	// Self reference; the (0,0,0) iteration below writes the same slot again with
	// the looked-up index.
	subgridNeighbors[27 * si + SUBGRID_CENTER_IDX] = si;

	for (int z = -1; z <= 1; ++z) for (int y = -1; y <= 1; ++y) for (int x = -1; x <= 1; ++x)
	{
		const int3 nID = make_int3(sID.x + x, sID.y + y, sID.z + z);
		const PxU32 nHash = calcSubgridHash(nID);

		PxU32 n = EMPTY_SUBGRID;
		if (isSubgridInsideRange(nID))
		{
			PxU32 nSortedIdx = 0;
			if (tryFindHashkey(uniqueSortedHashkey, numActiveSubgrids[0]/* + 1*/, nHash, nSortedIdx))
				n = nSortedIdx;
		}
		subgridNeighbors[27 * si + subgridNeighborIndex(x, y, z)] = n;
	}
}
|
||||
|
||||
// Writes, for every subgrid referenced by sortedParticleToSubgrid, the exclusive
// end index of its particle range: the thread owning the last particle of each
// run publishes the boundary (threadIndex + 1).
extern "C" __global__ void sg_MarkSubgridEndIndices(const PxU32* sortedParticleToSubgrid, PxU32 numParticles, PxU32* subgridEndIndices)
{
	const PxU32 threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
	if (threadIndex >= numParticles)
		return;

	// The last particle always closes its run; earlier particles close a run only
	// when the successor maps to a different subgrid. (Short-circuit keeps the
	// successor read in bounds.)
	const bool lastParticle = (threadIndex == numParticles - 1);
	if (lastParticle || sortedParticleToSubgrid[threadIndex] != sortedParticleToSubgrid[threadIndex + 1])
		subgridEndIndices[sortedParticleToSubgrid[threadIndex]] = threadIndex + 1;
}
|
||||
|
||||
// One thread per subgrid slot of the current update. Builds subgridOrderMap:
//  - inactive slots get EMPTY_SUBGRID;
//  - subgrids also present in the previous update inherit their previous memory
//    slot, and the previous map entry is tagged REUSED_SUBGRID so
//    sg_AddReleasedSubgridsToUnusedStack does not recycle it;
//  - subgrids not present before are tagged NEW_SUBGRID for allocation in
//    sg_AllocateNewSubgrids.
// Hashes are unique within an update, so each previous-map entry is written by at
// most one thread; no atomics are required here.
extern "C" __global__ void sg_ReuseSubgrids(
	const PxSparseGridParams sparseGridParams,
	const PxU32* uniqueHashkeysPerSubgridPreviousUpdate,
	const PxU32* numActiveSubgridsPreviousUpdate,
	PxU32* subgridOrderMapPreviousUpdate,

	const PxU32* uniqueHashkeysPerSubgrid,
	const PxU32* numActiveSubgrids,
	PxU32* subgridOrderMap)
{
	PxI32 threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
	if (threadIndex >= sparseGridParams.maxNumSubgrids)
		return;

	// Slots past the active count are unused this update.
	if (threadIndex >= numActiveSubgrids[0])
	{
		subgridOrderMap[threadIndex] = EMPTY_SUBGRID;
		return;
	}

	const PxU32 hashkey = uniqueHashkeysPerSubgrid[threadIndex];
	PxU32 sortedIdx = 0;
	const bool hashFound = tryFindHashkey(uniqueHashkeysPerSubgridPreviousUpdate, numActiveSubgridsPreviousUpdate[0], hashkey, sortedIdx);
	if (!hashFound)
	{
		subgridOrderMap[threadIndex] = NEW_SUBGRID;
		return;
	}

	// Inherit the previous memory slot and mark it as consumed.
	subgridOrderMap[threadIndex] = subgridOrderMapPreviousUpdate[sortedIdx];
	subgridOrderMapPreviousUpdate[sortedIdx] = REUSED_SUBGRID;
}
|
||||
|
||||
// Pushes a subgrid id onto the unused-subgrid stack. The size is grown atomically,
// so concurrent pushes are safe (push order is nondeterministic).
PX_FORCE_INLINE __device__ void addIdToUnusedSubgridStack(PxU32 idToAddToStack, PxU32* unusedSubgridStackSize, PxU32* unusedSubgridStack)
{
	const PxU32 slot = atomicAdd(unusedSubgridStackSize, 1);
	unusedSubgridStack[slot] = idToAddToStack;
}
|
||||
|
||||
// Pops a subgrid id from the unused-subgrid stack by atomically decrementing the
// size. The caller must guarantee the stack is non-empty.
PX_FORCE_INLINE __device__ PxU32 getSubgridIdFromUnusedStack(PxU32* unusedSubgridStackSize, PxU32* unusedSubgridStack)
{
	// atomicAdd returns the size before the decrement; the top element lives one
	// below it.
	const PxU32 sizeBeforePop = PxU32(atomicAdd(reinterpret_cast<PxI32*>(unusedSubgridStackSize), -1));
	return unusedSubgridStack[sizeBeforePop - 1];
}
|
||||
|
||||
//TODO: This method uses atomics. For better debuging, it might be worth to offer a slower variant that generates 100% reproducible results
// Returns every subgrid that was active in the previous update but was not tagged
// REUSED_SUBGRID by sg_ReuseSubgrids back to the unused-subgrid stack.
extern "C" __global__ void sg_AddReleasedSubgridsToUnusedStack(
	const PxU32* numActiveSubgridsPreviousUpdate,
	const PxU32* subgridOrderMapPreviousUpdate,

	PxU32* unusedSubgridStackSize,
	PxU32* unusedSubgridStack)
{
	const PxU32 threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
	if (threadIndex >= numActiveSubgridsPreviousUpdate[0])
		return;

	const PxU32 previousSlot = subgridOrderMapPreviousUpdate[threadIndex];
	if (previousSlot != REUSED_SUBGRID)
		addIdToUnusedSubgridStack(previousSlot, unusedSubgridStackSize, unusedSubgridStack);
}
|
||||
|
||||
//TODO: This method uses atomics. For better debuging, it might be worth to offer a slower variant that generates 100% reproducible results
// One thread per active subgrid of the current update. Assigns a memory slot to
// every entry of subgridOrderMap still tagged NEW_SUBGRID by popping from the
// unused-subgrid stack. A deterministic fast path handles the first frame (no
// previously active subgrids): slots are taken from the top of the stack in
// thread order and thread 0 adjusts the stack size once.
extern "C" __global__ void sg_AllocateNewSubgrids(
	const PxU32* numActiveSubgrids,
	PxU32* subgridOrderMap,

	PxU32* unusedSubgridStackSize,
	PxU32* unusedSubgridStack,

	const PxU32* numActiveSubgridsPreviousUpdate,
	const PxU32 maxNumSubgrids)
{
	PxI32 threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
	if (threadIndex >= numActiveSubgrids[0])
		return;

	if (numActiveSubgridsPreviousUpdate[0] == 0)
	{
		PxU32 numActiveSubgridsClamped = PxMin(maxNumSubgrids, numActiveSubgrids[0]);

		//Special case to simplify debugging: If no subgrids were active in the previous frame, then all subgrids present now must be new
		//Make sure that the subgrid indices in the first frame are always identical. But the order might change in subsequent frames due to the use of atomics
		//subgridOrderMap[threadIndex] = unusedSubgridStack[maxNumSubgrids - threadIndex - 1];
		subgridOrderMap[threadIndex] = unusedSubgridStack[maxNumSubgrids - numActiveSubgridsClamped + threadIndex]; //Use this line to test with non-default subgrid order to ensure that the code does not only work with the default order
		// Single writer updates the stack size; no atomic needed on this path.
		if (threadIndex == 0)
			unusedSubgridStackSize[0] -= numActiveSubgridsClamped;
		//If launched with 1024 threads per block, one could do per block scan and support 100% reproducible subgrid allocations using a block scan if maxNumSubgrids<=1024
	}
	else
	{
		if (subgridOrderMap[threadIndex] == NEW_SUBGRID)
		{
			// Atomic pop: allocation order across threads is nondeterministic.
			subgridOrderMap[threadIndex] = getSubgridIdFromUnusedStack(unusedSubgridStackSize, unusedSubgridStack);
		}
	}
}
|
||||
|
||||
|
||||
318
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/sparseGridStandalone.cuh
vendored
Normal file
318
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/sparseGridStandalone.cuh
vendored
Normal file
@@ -0,0 +1,318 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#include "vector_types.h"
|
||||
#include "foundation/PxSimpleTypes.h"
|
||||
#include "PxSparseGridParams.h"
|
||||
#include "PxgSparseGridDataStandalone.h"
|
||||
#include <stdio.h>
|
||||
|
||||
#define MAX_SPARSEGRID_DIM 1024
|
||||
#define MIN_SPARSEGRID_ID -512
|
||||
#define MAX_SPARSEGRID_ID 511
|
||||
#define EMPTY_SUBGRID 0xffffffff
|
||||
#define NEW_SUBGRID 0xfffffffe
|
||||
#define REUSED_SUBGRID 0xfffffffd
|
||||
#define OUT_OF_BOUNDS -1
|
||||
#define SUBGRID_CENTER_IDX 13
|
||||
|
||||
using namespace physx;
|
||||
|
||||
// Clamps f into the range [a, b] (lower bound a wins if the range is malformed).
PX_FORCE_INLINE __device__ __host__ int clampValue(int f, int a, int b)
{
	const int upperClamped = min(f, b);
	return max(a, upperClamped);
}
|
||||
|
||||
// World-space extent of one subgrid, excluding haloSize halo cells on each side
// per axis.
__device__ inline PxVec3 getSubgridDomainSize(const PxSparseGridParams& params, const PxU32 haloSize)
{
	const PxReal dx = params.gridSpacing;
	const PxU32 halo2 = 2 * haloSize;
	return PxVec3(dx * (params.subgridSizeX - halo2), dx * (params.subgridSizeY - halo2), dx * (params.subgridSizeZ - halo2));
}
|
||||
|
||||
// Integer coordinates of the subgrid containing 'pos' (floor division per axis,
// so negative positions map to negative ids).
PX_FORCE_INLINE __host__ __device__ int3 calcSubgridId(const PxVec3 pos, const PxVec3 domainSize)
{
	const int ix = (int)PxFloor(pos.x / domainSize.x);
	const int iy = (int)PxFloor(pos.y / domainSize.y);
	const int iz = (int)PxFloor(pos.z / domainSize.z);
	return make_int3(ix, iy, iz);
}
|
||||
|
||||
// Maps a (possibly negative) subgrid id into a dense non-negative hash: each axis
// is shifted by -MIN_SPARSEGRID_ID and the result linearized with base
// MAX_SPARSEGRID_DIM. Inverse: subgridHashToId.
PX_FORCE_INLINE __host__ __device__ PxU32 calcSubgridHash(const int3 subgridId)
{
	const int sx = subgridId.x - int(MIN_SPARSEGRID_ID);
	const int sy = subgridId.y - int(MIN_SPARSEGRID_ID);
	const int sz = subgridId.z - int(MIN_SPARSEGRID_ID);
	return MAX_SPARSEGRID_DIM * MAX_SPARSEGRID_DIM * sz + MAX_SPARSEGRID_DIM * sy + sx;
}
|
||||
|
||||
// Hash of the subgrid displaced from 'subgridId' by the given per-axis offsets.
PX_FORCE_INLINE __device__ PxU32 subgridHashOffset(int3 subgridId, int offsetX, int offsetY, int offsetZ)
{
	const int3 neighborId = make_int3(subgridId.x + offsetX, subgridId.y + offsetY, subgridId.z + offsetZ);
	return calcSubgridHash(neighborId);
}
|
||||
|
||||
// Linear index (0..26) of a neighbor offset in {-1,0,1}^3; (0,0,0) maps to 13
// (== SUBGRID_CENTER_IDX).
PX_FORCE_INLINE __host__ __device__ PxI32 subgridNeighborIndex(const PxI32 x, const PxI32 y, const PxI32 z)
{
	return (x + 1) + 3 * (y + 1) + 9 * (z + 1);
}
|
||||
|
||||
// Binary search on sorted 'data': returns the index of the last element <= value,
// or 0 when value is smaller than every element. With numElements == 0 it returns
// 0 without touching data.
template<class T>
static __device__ PxU32 searchSorted(const T* PX_RESTRICT data, const PxU32 numElements, const T& value)
{
	PxU32 lo = 0;
	PxU32 hi = numElements;

	while (hi - lo > 1)
	{
		const PxU32 mid = (lo + hi) >> 1;
		if (data[mid] <= value)
			lo = mid;
		else
			hi = mid;
	}

	return lo;
}
|
||||
|
||||
// Binary-searches 'sortedHashkey' for hashToFind. On success returns true and
// writes the index of a matching entry into 'result'; on failure returns false
// ('result' then holds the nearest lower slot, or 0, and must not be used).
PX_FORCE_INLINE __device__ bool tryFindHashkey(const PxU32* const PX_RESTRICT sortedHashkey, const PxU32 numSubgrids, const PxU32 hashToFind, PxU32& result)
{
	// Fix: guard against an empty table. The original unconditionally read
	// sortedHashkey[result] with result == 0 when numSubgrids == 0 — an
	// out-of-bounds read, and callers pass numActiveSubgrids[0], which can
	// legitimately be zero.
	if (numSubgrids == 0)
	{
		result = 0;
		return false;
	}

	result = searchSorted(sortedHashkey, numSubgrids, hashToFind);
	return sortedHashkey[result] == hashToFind;
}
|
||||
|
||||
// True if every component of the subgrid id lies within the representable
// [MIN_SPARSEGRID_ID, MAX_SPARSEGRID_ID] range of the hash encoding.
PX_FORCE_INLINE __host__ __device__ bool isSubgridInsideRange(const int3 val)
{
	const bool xInRange = val.x >= MIN_SPARSEGRID_ID && val.x <= MAX_SPARSEGRID_ID;
	const bool yInRange = val.y >= MIN_SPARSEGRID_ID && val.y <= MAX_SPARSEGRID_ID;
	const bool zInRange = val.z >= MIN_SPARSEGRID_ID && val.z <= MAX_SPARSEGRID_ID;
	return xInRange && yInRange && zInRange;
}
|
||||
|
||||
// Inverse of calcSubgridHash: decodes a hash back into signed subgrid coordinates.
// The w component is always 0.
PX_FORCE_INLINE __host__ __device__ int4 subgridHashToId(const PxU32 hashKey)
{
	const int h = static_cast<int>(hashKey);
	const int idX = h % MAX_SPARSEGRID_DIM + MIN_SPARSEGRID_ID;
	const int idY = (h / MAX_SPARSEGRID_DIM) % MAX_SPARSEGRID_DIM + MIN_SPARSEGRID_ID;
	const int idZ = h / MAX_SPARSEGRID_DIM / MAX_SPARSEGRID_DIM + MIN_SPARSEGRID_ID;
	return make_int4(idX, idY, idZ, 0);
}
|
||||
|
||||
// Looks up, in the precomputed 27-entry neighbor table of subgrid si, the compact
// index of the neighbor at the given per-axis offset.
PX_FORCE_INLINE __host__ __device__ PxI32 subgridNeighborOffset(const PxU32* const PX_RESTRICT subgridNeighbors, PxI32 si, PxI32 offsetX, PxI32 offsetY, PxI32 offsetZ)
{
	const PxI32 entry = 27 * si + subgridNeighborIndex(offsetX, offsetY, offsetZ);
	return subgridNeighbors[entry];
}
|
||||
|
||||
// Remainder that maps inputs in [-b, 2b) into [0, b) by shifting before the '%'
// (plain C '%' would return negative values for negative a).
PX_FORCE_INLINE __device__ PxI32 mod(PxI32 a, PxI32 b)
{
	return (a + b) % b;
	//return (a + b) & (b - 1); //Assumes b is a power of two
}
|
||||
|
||||
// Linear cell index of (i, j, k) inside subgrid si, optionally remapping si
// through subgridOrdering (the persistent memory slot assigned to the subgrid).
PX_FORCE_INLINE __host__ __device__ PxU32 sparseGridAccess(const PxSparseGridParams& sparseGridParams, PxI32 i, PxI32 j, PxI32 k, PxI32 si, const PxU32* subgridOrdering)
{
	const PxI32 slot = subgridOrdering ? PxI32(subgridOrdering[si]) : si;
	return i + sparseGridParams.subgridSizeX * (j + sparseGridParams.subgridSizeY * (k + sparseGridParams.subgridSizeZ * slot));
}
|
||||
|
||||
// Resolves a cell coordinate relative to subgrid si — possibly outside
// [0, subgridSize) on any axis — to a global cell index, crossing into the
// adjacent subgrid where the coordinate spills over. Returns EMPTY_SUBGRID when
// the required neighbor subgrid is not allocated.
PX_FORCE_INLINE __device__ PxU32 getIndex(const PxU32* const PX_RESTRICT subgridNeighbors, const PxSparseGridParams& sparseGridParams, PxI32 coordX, PxI32 coordY, PxI32 coordZ, PxU32 si, const PxU32* subgridOrdering)
{
	int haloSize = 0; // sparseGridParams.haloSize;
	coordX += haloSize;
	coordY += haloSize;
	coordZ += haloSize;

	// -1/0/+1 per axis: which neighbor subgrid the coordinate falls into.
	const PxI32 stepX = coordX < 0 ? -1 : (coordX >= sparseGridParams.subgridSizeX ? 1 : 0);
	const PxI32 stepY = coordY < 0 ? -1 : (coordY >= sparseGridParams.subgridSizeY ? 1 : 0);
	const PxI32 stepZ = coordZ < 0 ? -1 : (coordZ >= sparseGridParams.subgridSizeZ ? 1 : 0);

	//if (stepX != 0 || stepY != 0 || stepZ != 0)
	//	printf("neighbor access\n");

	const PxU32 neighbor = subgridNeighborOffset(subgridNeighbors, si, stepX, stepY, stepZ);
	if (neighbor == EMPTY_SUBGRID)
		return EMPTY_SUBGRID;

	// Wrap the coordinate into the neighbor subgrid's local range.
	return sparseGridAccess(sparseGridParams,
		mod(coordX, sparseGridParams.subgridSizeX),
		mod(coordY, sparseGridParams.subgridSizeY),
		mod(coordZ, sparseGridParams.subgridSizeZ),
		neighbor, subgridOrdering);
}
|
||||
|
||||
//Assumes that 0.0 is a valid value for access outside of the grid
// Reads the grid value at the given (possibly subgrid-crossing) cell coordinate,
// returning 0.0 where the cell's subgrid is not allocated.
PX_FORCE_INLINE __device__ PxReal getGridValue(const PxU32* const PX_RESTRICT subgridNeighbors, const PxReal* data, const PxSparseGridParams& sparseGridParams, PxI32 coordX, PxI32 coordY, PxI32 coordZ, PxU32 si, const PxU32* subgridOrdering)
{
	const PxU32 cellIndex = getIndex(subgridNeighbors, sparseGridParams, coordX, coordY, coordZ, si, subgridOrdering);
	return (cellIndex == EMPTY_SUBGRID) ? 0.0f : data[cellIndex];
}
|
||||
|
||||
//This will transform p to cell local coordinates
// Maps a world-space position to (cellX, cellY, cellZ, compactSubgridIndex).
// On success, 'p' is rewritten in place to coordinates local to the subgrid
// origin. Returns w == OUT_OF_BOUNDS (and leaves 'p' unchanged) when the
// position's subgrid is not allocated.
PX_FORCE_INLINE __device__ int4 getCellIndexFromPosition(PxVec3& p, const PxSparseGridParams& sparseGridParams, const PxU32* uniqueHashkeyPerSubgrid, const PxU32* numSubgridsInUse)
{
	int haloSize = 0; // sparseGridParams.haloSize;
	const PxVec3 subgridDomainSize = getSubgridDomainSize(sparseGridParams, haloSize);
	int3 subgridId = calcSubgridId(p, subgridDomainSize);
	PxU32 subgridHash = calcSubgridHash(subgridId);

	// Look up the compact index of the subgrid; bail out if it does not exist.
	PxU32 sortedIdx = 0;
	const bool hashFound = tryFindHashkey(uniqueHashkeyPerSubgrid, numSubgridsInUse[0], subgridHash, sortedIdx);
	if (!hashFound)
	{
		//printf("Hash not found %i\n", subgridHash);
		return make_int4(-1, -1, -1, OUT_OF_BOUNDS);
	}

	const PxReal dx = sparseGridParams.gridSpacing;
	const PxReal invDx = 1.0f / dx;

	// World-space origin of the subgrid; 'p' becomes local to it.
	const PxVec3 subgridOrigin = PxVec3(
		subgridId.x * dx * (sparseGridParams.subgridSizeX - 2 * haloSize),
		subgridId.y * dx * (sparseGridParams.subgridSizeY - 2 * haloSize),
		subgridId.z * dx * (sparseGridParams.subgridSizeZ - 2 * haloSize));
	p = p - subgridOrigin;

	// Cell coordinate within the subgrid, clamped to the valid cell range.
	int4 result = make_int4(PxI32(PxFloor(p.x * invDx)), PxI32(PxFloor(p.y * invDx)), PxI32(PxFloor(p.z * invDx)), sortedIdx);
	result.x = PxClamp(result.x, 0, sparseGridParams.subgridSizeX - 2 * haloSize-1);
	result.y = PxClamp(result.y, 0, sparseGridParams.subgridSizeY - 2 * haloSize-1);
	result.z = PxClamp(result.z, 0, sparseGridParams.subgridSizeZ - 2 * haloSize-1);
	return result;
}
|
||||
|
||||
// World-space position of cell 'index' inside the subgrid identified by 'hash'
// (inverse of getCellIndexFromPosition for the cell corner).
PX_FORCE_INLINE __device__ PxVec3 getLocationFromHashkey(const PxU32 hash, const PxSparseGridParams& sparseGridParams, const int4& index)
{
	int haloSize = 0; // sparseGridParams.haloSize;
	const int4 subgridId = subgridHashToId(hash);
	const PxVec3 subgridOrigin = PxVec3(
		subgridId.x * sparseGridParams.gridSpacing * (sparseGridParams.subgridSizeX - 2 * haloSize),
		subgridId.y * sparseGridParams.gridSpacing * (sparseGridParams.subgridSizeY - 2 * haloSize),
		subgridId.z * sparseGridParams.gridSpacing * (sparseGridParams.subgridSizeZ - 2 * haloSize));
	const PxVec3 localOffset = PxVec3(index.x, index.y, index.z) * sparseGridParams.gridSpacing;
	return subgridOrigin + localOffset;
}
|
||||
|
||||
// Converts a flat thread index into sparse-grid coordinates:
// returns int4(xi, yi, zi, subgridIndex) where subgridIndex selects the subgrid
// and (xi, yi, zi) are the cell coordinates within it, shifted by the halo size
// (currently forced to 0, so no shift is applied).
PX_FORCE_INLINE __device__ int4 getGridCoordinates(const PxSparseGridParams& sparseGridParams, int threadIndex)
{
	const PxU32 numSubgridCells = sparseGridParams.subgridSizeX * sparseGridParams.subgridSizeY * sparseGridParams.subgridSizeZ;
	const PxU32 si = threadIndex / numSubgridCells; // subgrid index

	// Decompose the remainder into x/y/z cell coordinates (x fastest-varying).
	PxI32 localThreadIndex = threadIndex - si * numSubgridCells;
	const PxI32 xi = localThreadIndex % sparseGridParams.subgridSizeX;
	const PxI32 yi = (localThreadIndex / sparseGridParams.subgridSizeX) % sparseGridParams.subgridSizeY;
	const PxI32 zi = localThreadIndex / (sparseGridParams.subgridSizeX * sparseGridParams.subgridSizeY);

	//Following code assumes that subgridSizeX and subgridSizeY are a power of two
	/*const PxI32 xi = localThreadIndex & (sparseGridParams.subgridSizeX - 1);
	const PxI32 yi = (localThreadIndex / sparseGridParams.subgridSizeX) & (sparseGridParams.subgridSizeY - 1);
	const PxI32 zi = localThreadIndex / (sparseGridParams.subgridSizeX * sparseGridParams.subgridSizeY);*/

	/*if(sparseGridParams.haloSize>0 &&
		(xi < sparseGridParams.haloSize || yi < sparseGridParams.haloSize || zi < sparseGridParams.haloSize ||
		xi >= sparseGridParams.subgridSizeX - sparseGridParams.haloSize ||
		yi >= sparseGridParams.subgridSizeY - sparseGridParams.haloSize ||
		zi >= sparseGridParams.subgridSizeZ - sparseGridParams.haloSize))
		return make_int4(-1, -1, -1, OUT_OF_BOUNDS);*/

	const PxU32 haloSize = 0; // sparseGridParams.haloSize;
	return make_int4(xi - haloSize, yi - haloSize, zi - haloSize, si);
}
|
||||
|
||||
// Functions for the PxSparseGridData class - keep the same names and arguments
// as their dense-grid counterparts so templated code can use either grid type.

// Converts a flat thread index into sparse-grid coordinates using the grid
// parameters embedded in 'data'.
PX_FORCE_INLINE __device__ int4 getGridCoordinates(const PxSparseGridData& data, int threadIndex)
{
	const PxSparseGridParams& gridParams = data.mGridParams;
	return getGridCoordinates(gridParams, threadIndex);
}
|
||||
|
||||
// Resolves a grid coordinate (as produced by getGridCoordinates) into a linear
// cell index via sparseGridAccess, first adding back the halo offset (currently
// forced to 0). When applySubgridOrder is true, the subgrid-order remap table
// is consulted; otherwise the raw subgrid index is used.
PX_FORCE_INLINE __device__ PxU32 getCellIndex(PxSparseGridData& data, int4 index, bool applySubgridOrder = true)
{
	int haloSize = 0; // data.mGridParams.haloSize;
	index.x += haloSize;
	index.y += haloSize;
	index.z += haloSize;
	return sparseGridAccess(data.mGridParams, index.x, index.y, index.z, index.w, applySubgridOrder ? data.mSubgridOrderMap : NULL);
}
|
||||
|
||||
// Resolves the cell at 'index' displaced by (offsetX, offsetY, offsetZ) into a
// linear cell index, following subgrid-neighbor links when the offset crosses a
// subgrid boundary (handled inside getIndex). When applySubgridOrder is true,
// the subgrid-order remap table is consulted.
PX_FORCE_INLINE __device__ PxU32 getCellIndex(PxSparseGridData& data, const int4& index, PxI32 offsetX, PxI32 offsetY, PxI32 offsetZ, bool applySubgridOrder = true)
{
	// Debug validation of the subgrid-order map, kept for reference:
	/*if (applySubgridOrder && data.subgridOrderMap)
	{
		if (index.w < 0 || data.subgridOrderMap[index.w] < 0 || data.subgridOrderMap[index.w] >= data.mNumSubgridsInUse[0])
			printf("problem\n");
	}*/

	return getIndex(data.mSubgridNeighbors, data.mGridParams, index.x + offsetX, index.y + offsetY, index.z + offsetZ, index.w, applySubgridOrder ? data.mSubgridOrderMap : NULL);
}
|
||||
|
||||
// "Safe" variant of getCellIndex. For the sparse grid the offset lookup already
// follows neighbor links, so this simply forwards to getCellIndex; the alias
// exists so templated code can call the same name on dense and sparse grids.
PX_FORCE_INLINE __device__ PxU32 getCellIndexSafe(PxSparseGridData& data, const int4& index, PxI32 offsetX, PxI32 offsetY, PxI32 offsetZ, bool applySubgridOrder = true)
{
	const PxU32 cellIndex = getCellIndex(data, index, offsetX, offsetY, offsetZ, applySubgridOrder);
	return cellIndex;
}
|
||||
|
||||
// Reads the value stored at 'index' displaced by (offsetX, offsetY, offsetZ)
// from dataSource. Assumes that 0.0 is a valid value for access outside of the
// grid (the underlying lookup returns 0.0f for empty subgrids).
PX_FORCE_INLINE __device__ PxReal getGridValue(PxSparseGridData& data, const PxReal* dataSource, const int4& index, PxI32 offsetX, PxI32 offsetY, PxI32 offsetZ)
{
	const PxI32 cellX = index.x + offsetX;
	const PxI32 cellY = index.y + offsetY;
	const PxI32 cellZ = index.z + offsetZ;
	return getGridValue(data.mSubgridNeighbors, dataSource, data.mGridParams, cellX, cellY, cellZ, index.w, data.mSubgridOrderMap);
}
|
||||
|
||||
// "Safe" variant of getGridValue - a pure alias kept so dense- and sparse-grid
// code paths share the same call site name. Assumes that 0.0 is a valid value
// for access outside of the grid.
PX_FORCE_INLINE __device__ PxReal getGridValueSafe(PxSparseGridData& data, const PxReal* dataSource, const int4& index, PxI32 offsetX, PxI32 offsetY, PxI32 offsetZ)
{
	const PxReal value = getGridValue(data, dataSource, index, offsetX, offsetY, offsetZ);
	return value;
}
|
||||
|
||||
// True when threadIndex exceeds the grid's total cell capacity
// (maxNumSubgrids subgrids, each subgridSizeX*Y*Z cells).
PX_FORCE_INLINE __device__ bool outOfRange(PxSparseGridData& data, const int threadIndex)
{
	const PxSparseGridParams& params = data.mGridParams;
	const PxU32 cellsPerSubgrid = params.subgridSizeX * params.subgridSizeY * params.subgridSizeZ;
	return threadIndex >= params.maxNumSubgrids * cellsPerSubgrid;
}
|
||||
|
||||
// True when threadIndex exceeds the number of cells backed by subgrids that are
// currently in use (a tighter bound than outOfRange).
PX_FORCE_INLINE __device__ bool outOfActiveCells(PxSparseGridData& data, const int threadIndex)
{
	const PxSparseGridParams& params = data.mGridParams;
	const PxU32 cellsPerSubgrid = params.subgridSizeX * params.subgridSizeY * params.subgridSizeZ;
	return threadIndex >= data.mNumSubgridsInUse[0] * cellsPerSubgrid;
}
|
||||
|
||||
// True when the coordinate does not reference a valid, active subgrid:
// its subgrid slot (index.w) is past the in-use count, past the capacity,
// or carries the OUT_OF_BOUNDS marker.
PX_FORCE_INLINE __device__ bool outOfBounds(PxSparseGridData& data, const int4& index)
{
	if (index.w >= data.mNumSubgridsInUse[0])
		return true;
	if (index.w >= data.mGridParams.maxNumSubgrids)
		return true;
	return index.w == OUT_OF_BOUNDS;
}
|
||||
|
||||
// True only for the very last cell slot of the grid's full capacity
// (maxNumSubgrids * cells-per-subgrid - 1).
PX_FORCE_INLINE __device__ bool isLastCell(PxSparseGridData& data, const int threadIndex)
{
	const PxSparseGridParams& params = data.mGridParams;
	const PxU32 cellsPerSubgrid = params.subgridSizeX * params.subgridSizeY * params.subgridSizeZ;
	const PxU32 totalCells = params.maxNumSubgrids * cellsPerSubgrid;
	return threadIndex == totalCells - 1;
}
|
||||
|
||||
// World-space position of the cell at 'index': fetches the hash key of the
// cell's subgrid and delegates the reconstruction to getLocationFromHashkey.
PX_FORCE_INLINE __device__ PxVec3 getLocation(PxSparseGridData& data, const int4& index)
{
	const PxU32 subgridHash = data.mUniqueHashkeyPerSubgrid[index.w];
	return getLocationFromHashkey(subgridHash, data.mGridParams, index);
}
|
||||
|
||||
// Maps a particle position to its sparse-grid cell coordinate, transforming p
// to cell-local coordinates IN PLACE (see getCellIndexFromPosition). Returns
// (-1, -1, -1, OUT_OF_BOUNDS) when p falls in a subgrid that is not in use.
PX_FORCE_INLINE __device__ int4 getCellIndexFromParticleAndTransformToLocalCoordinates(PxSparseGridData& data, PxVec3& p)
{
	const int4 cell = getCellIndexFromPosition(p, data.mGridParams, data.mUniqueHashkeyPerSubgrid, data.mNumSubgridsInUse);
	return cell;
}
|
||||
|
||||
1384
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/updateBodiesAndShapes.cu
vendored
Normal file
1384
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/updateBodiesAndShapes.cu
vendored
Normal file
File diff suppressed because it is too large
Load Diff
428
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/updateTransformAndBoundArray.cu
vendored
Normal file
428
engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/updateTransformAndBoundArray.cu
vendored
Normal file
@@ -0,0 +1,428 @@
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
||||
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
||||
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
||||
|
||||
#include "foundation/PxPreprocessor.h"
|
||||
|
||||
#include "PxgCommonDefines.h"
|
||||
#include "PxgSimulationCoreKernelIndices.h"
|
||||
#include "PxgSimulationCoreDesc.h"
|
||||
#include "PxgSolverBody.h"
|
||||
#include "PxvDynamics.h"
|
||||
#include "PxgShapeSim.h"
|
||||
#include "PxgBodySim.h"
|
||||
#include "cutil_math.h"
|
||||
#include "reduction.cuh"
|
||||
#include "updateCacheAndBound.cuh"
|
||||
#include "PxsRigidBody.h"
|
||||
#include "PxgArticulation.h"
|
||||
#include "PxgAggregate.h"
|
||||
#include "PxgAABBManager.h"
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
|
||||
using namespace physx;
|
||||
|
||||
// Empty host-side stub. NOTE(review): presumably exists so the host linker pulls
// in this translation unit (and the kernels it contains) — confirm against the
// module-registration scheme used by the other initSimulationControllerKernels* stubs.
extern "C" __host__ void initSimulationControllerKernels1() {}
|
||||
|
||||
// Applies a list of bound/transform updates to the device-side arrays.
// One thread per change. Each change moves a bound+transform pair to slot
// 'indexTo'; the top bit of 'indexFrom' selects the source:
//  - bit set:   copy from the staged input arrays (boundsArray/transformsArray)
//  - bit clear: move within the device arrays themselves (deviceBounds/deviceTransforms)
// The remaining 31 bits of 'indexFrom' are the source slot.
extern "C" __global__ void mergeTransformCacheAndBoundArrayChanges(
	PxBounds3* PX_RESTRICT deviceBounds,
	PxsCachedTransform* PX_RESTRICT deviceTransforms,
	const PxBounds3* PX_RESTRICT boundsArray,
	const PxsCachedTransform* PX_RESTRICT transformsArray,
	const PxBoundTransformUpdate* PX_RESTRICT changes,
	const PxU32 numChanges
) {
	int idx = blockIdx.x * blockDim.x + threadIdx.x;
	if (idx < numChanges) {
		PxU32 indexTo = changes[idx].indexTo;
		PxU32 indexFrom = changes[idx].indexFrom & 0x7FFFFFFF; // strip the "new" flag bit
		bool isNew = (changes[idx].indexFrom & (1U << 31)) != 0;
		if (isNew) {
			// New entry: pull from the staged input arrays.
			deviceBounds[indexTo] = boundsArray[indexFrom];
			deviceTransforms[indexTo] = transformsArray[indexFrom];
		} else {
			// Existing entry: relocate within the device arrays.
			deviceBounds[indexTo] = deviceBounds[indexFrom];
			deviceTransforms[indexTo] = deviceTransforms[indexFrom];
		}
	}
}
|
||||
|
||||
// Refreshes the transform cache and AABB bound array for every shape after the
// solver has produced new body poses. Grid-stride loop, one iteration per shape.
// Per shape it also records, into per-shape/per-body marker arrays, whether the
// owning body froze, unfroze, activated or deactivated this frame; these arrays
// are consumed later by the frozen/unfrozen histogram kernels below.
extern "C" __global__ void updateTransformCacheAndBoundArrayLaunch(const PxgSimulationCoreDesc* scDesc)
{
	const PxgSolverBodySleepData* PX_RESTRICT gSleepData = scDesc->mSleepData;

	const PxU32* PX_RESTRICT gBodyDataIndices = scDesc->mBodyDataIndices;
	const PxgShape* PX_RESTRICT gShapes = scDesc->mShapes;

	const PxgBodySim* PX_RESTRICT gBodySimPool = scDesc->mBodySimBufferDeviceData;

	const PxU32 gNumShapes = scDesc->mNbTotalShapes;
	const PxgShapeSim* PX_RESTRICT gShapeSimPool = scDesc->mShapeSimsBufferDeviceData;

	const PxgArticulation* PX_RESTRICT gArticulations = scDesc->mArticulationPool;
	const PxgSolverBodySleepData* PX_RESTRICT gArticulationSleepData = scDesc->mArticulationSleepDataPool;

	PxsCachedTransform* PX_RESTRICT gTransformCache = scDesc->mTransformCache;
	PxBounds3* PX_RESTRICT gBounds = scDesc->mBounds;

	//Each shape has a corresponding frozen/unfrozen element
	PxU32* PX_RESTRICT frozen = scDesc->mFrozen;
	PxU32* PX_RESTRICT unfrozen = scDesc->mUnfrozen;
	//Each shape has an updated element corresponding to the elementIndex
	PxU32* PX_RESTRICT updated = scDesc->mUpdated;

	//Each body has a corresponding activate and deactivate element
	PxU32* PX_RESTRICT active = scDesc->mActivate;
	PxU32* PX_RESTRICT deactivate = scDesc->mDeactivate;

	const PxU32 idx = threadIdx.x + blockIdx.x * blockDim.x;

	// Grid-stride loop over all shapes.
	for(PxU32 i=idx; i<gNumShapes; i+=blockDim.x * gridDim.x)
	{
		const PxgShapeSim& shapeSim = gShapeSimPool[i];

		const PxNodeIndex bodySimNodeIndex = shapeSim.mBodySimIndex; // bodySimIndex is the same as nodeIndex in the IG

		//not static body or deleted shape
		if (!bodySimNodeIndex.isStaticBody())
		{
			const PxU32 elementIndex = i; // this is the transform cache and bound array index
			const PxU32 bodySimIndex = bodySimNodeIndex.index();
			//printf("i %i bodySimIndex %i\n", idx, bodySimIndex );

			const PxgBodySim& bodySim = gBodySimPool[bodySimIndex];

			const PxU32 shapeFlags = shapeSim.mShapeFlags;
			// Shape participates in broadphase (simulation or trigger shape)...
			bool isBP = (shapeFlags & PxU32(PxShapeFlag::eSIMULATION_SHAPE | PxShapeFlag::eTRIGGER_SHAPE));

			// ...or additionally in scene queries.
			bool isBPOrSq = (shapeFlags & PxU32(PxShapeFlag::eSIMULATION_SHAPE | PxShapeFlag::eTRIGGER_SHAPE | PxShapeFlag::eSCENE_QUERY_SHAPE));

			if (!bodySimNodeIndex.isArticulation())
			{
				const PxU32 activeNodeIndex = gBodyDataIndices[bodySimIndex];

				//if activeNodeIndex is valid, which means this node is active
				if (activeNodeIndex != 0xFFFFFFFF)
				{
					const PxU32 internalFlags = gSleepData[activeNodeIndex].internalFlags;
					const PxTransform body2World = bodySim.body2World.getTransform();

					// Body froze this frame: mark it and tag the cached transform as frozen.
					if ((internalFlags & PxsRigidBody::eFREEZE_THIS_FRAME) && (internalFlags & PxsRigidBody::eFROZEN))
					{
						frozen[i] = 1;
						gTransformCache[elementIndex].flags = PxsTransformFlag::eFROZEN;
					}
					else if (internalFlags & PxsRigidBody::eUNFREEZE_THIS_FRAME)
					{
						unfrozen[i] = 1;
					}

					// Refresh cache/bounds unless the body is (and stays) frozen;
					// a body freezing this very frame still gets one final update.
					if (!(internalFlags & PxsRigidBody::eFROZEN) || (internalFlags & PxsRigidBody::eFREEZE_THIS_FRAME))
					{
						if (isBP)
							updated[elementIndex] = 1;

						const PxTransform absPos = getAbsPose(body2World, shapeSim.mTransform, bodySim.body2Actor_maxImpulseW.getTransform());

						updateCacheAndBound(absPos, shapeSim, elementIndex, gTransformCache, gBounds, gShapes, isBPOrSq);
					}

					if (internalFlags & PxsRigidBody::eACTIVATE_THIS_FRAME)
						active[bodySimIndex] = 1;
					else if (internalFlags & PxsRigidBody::eDEACTIVATE_THIS_FRAME)
						deactivate[bodySimIndex] = 1;
				}
			}
			else
			{
				//This is an articulation link: poses come from the articulation's
				//link arrays rather than the rigid-body sim pool.
				const PxU32 articulationId = bodySim.articulationRemapId;
				const PxgArticulation& articulation = gArticulations[articulationId];
				const PxgSolverBodySleepData artiSleepData = gArticulationSleepData[articulationId];
				const PxU32 internalFlags = artiSleepData.internalFlags;

				const PxU32 linkId = bodySimNodeIndex.articulationLinkId();

				const PxTransform body2World = articulation.linkBody2Worlds[linkId];

				if (isBP)
					updated[elementIndex] = 1;

				const PxTransform body2Actor = articulation.linkBody2Actors[linkId];

				const PxTransform absPos = getAbsPose(body2World, shapeSim.mTransform, body2Actor);

				updateCacheAndBound(absPos, shapeSim, elementIndex, gTransformCache, gBounds, gShapes, isBPOrSq);

				if (internalFlags & PxsRigidBody::eACTIVATE_THIS_FRAME)
					active[bodySimIndex] = 1;
				else if (internalFlags & PxsRigidBody::eDEACTIVATE_THIS_FRAME)
					deactivate[bodySimIndex] = 1;
			}
		}
	}
}
|
||||
|
||||
// After updateTransformCacheAndBoundArrayLaunch, compacts the per-element
// 'updated' markers into the AABB manager's changed-handles bitmap: each warp
// ballots its 32 elements' flags into one 32-bit word and lane 0 writes it.
// Note this OVERWRITES the bitmap words (cf. mergeChangedAABBMgrHandlesLaunch,
// which ORs instead). gNumElements is a multiple of 32 by construction, so
// every warp iterates in lockstep and the full-mask ballot is safe.
extern "C" __global__ void updateChangedAABBMgrHandlesLaunch(const PxgSimulationCoreDesc* scDesc)
{
	const PxU32 gNumElements = scDesc->mBitMapWordCounts * 32;

	const PxU32* updated = scDesc->mUpdated;

	PxU32* gChangedAABBMgrHandles = scDesc->mChangedAABBMgrHandles;

	const PxU32 idx = threadIdx.x + blockIdx.x * blockDim.x;

	const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE -1);

	for (PxU32 i = idx; i<gNumElements; i += blockDim.x * gridDim.x)
	{
		const PxU32 updateBit = updated[i];
		// One bit per lane: the warp's 32 flags become one bitmap word.
		const PxU32 word = __ballot_sync(FULL_MASK, updateBit);

		if(threadIndexInWarp == 0)
		{
			gChangedAABBMgrHandles[i/WARP_SIZE] = word;
		}
	}
}
|
||||
|
||||
// Merges the direct-API updated handles into the CPU-API changed-handles
// bitmap: each warp ballots its 32 'updated' flags into a word and lane 0
// ORs it into the existing bitmap word (unlike updateChangedAABBMgrHandlesLaunch,
// which overwrites). gNumElements is a multiple of 32 by construction, so all
// lanes of a warp iterate together and the full-mask ballot is safe.
extern "C" __global__ void mergeChangedAABBMgrHandlesLaunch(const PxgUpdateActorDataDesc* updateActorDesc)
{
	//max number of shapes
	const PxU32 gNumElements = updateActorDesc->mBitMapWordCounts * 32;

	//This is Direct API changed handles
	const PxU32* updated = updateActorDesc->mUpdated;

	//This is CPU API changed handles
	PxU32* gChangedAABBMgrHandles = updateActorDesc->mChangedAABBMgrHandles;

	const PxU32 idx = threadIdx.x + blockIdx.x * blockDim.x;

	const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE - 1);

	for (PxU32 i = idx; i < gNumElements; i += blockDim.x * gridDim.x)
	{
		const PxU32 updateBit = updated[i];

		// One bit per lane: the warp's 32 flags become one bitmap word.
		const PxU32 word = __ballot_sync(FULL_MASK, updateBit);

		if (threadIndexInWarp == 0)
		{
			gChangedAABBMgrHandles[i / WARP_SIZE] |= word;
		}
	}
}
|
||||
|
||||
// First pass of a two-pass exclusive prefix sum over the per-shape frozen and
// unfrozen flags (each 0 or 1, written by updateTransformCacheAndBoundArrayLaunch).
// In place, gFrozen/gUnfrozen are turned into block-local exclusive prefix sums;
// each block's running total is written to gFrozenBlock/gUnfrozenBlock for the
// second pass (outputFrozenAndUnfrozenHistogram) to offset.
// Scan structure per iteration: warp-level ballot+popc for the intra-warp
// exclusive sum, a warp-wide scan of the per-warp totals in shared memory, and
// a per-block running accumulator carried across iterations.
extern "C" __global__ void computeFrozenAndUnfrozenHistogramLaunch(const PxgSimulationCoreDesc* scDesc)
{
	const PxU32 WARP_PERBLOCK_SIZE = PxgSimulationCoreKernelBlockDim::COMPUTE_FROZEN_UNFROZEN_HISTOGRAM/WARP_SIZE;
	const PxU32 LOG2_WARP_PERBLOCK_SIZE = 3;

	assert((1 << LOG2_WARP_PERBLOCK_SIZE) == WARP_PERBLOCK_SIZE);

	// Per-warp partial sums (later overwritten with their exclusive scan).
	__shared__ PxU32 sFrozenWarpAccumulator[WARP_PERBLOCK_SIZE];
	__shared__ PxU32 sUnFrozenWarpAccumulator[WARP_PERBLOCK_SIZE];

	// Running totals for this block across loop iterations.
	__shared__ PxU32 sFrozenBlockAccumulator;
	__shared__ PxU32 sUnfrozenBlockAccumulator;

	PxU32* gFrozen = scDesc->mFrozen;
	PxU32* gUnfrozen = scDesc->mUnfrozen;
	PxU32* gFrozenBlock = scDesc->mFrozenBlockAndRes;
	PxU32* gUnfrozenBlock = scDesc->mUnfrozenBlockAndRes;

	const PxU32 gNbTotalShapes = scDesc->mNbTotalShapes;

	const PxU32 nbBlocksRequired = (gNbTotalShapes + blockDim.x-1)/blockDim.x;

	const PxU32 nbIterationsPerBlock = (nbBlocksRequired + gridDim.x-1)/gridDim.x;

	const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE-1);
	const PxU32 warpIndex = threadIdx.x/(WARP_SIZE);
	const PxU32 idx = threadIdx.x;

	if(threadIdx.x == 0)
	{
		sFrozenBlockAccumulator = 0;
		sUnfrozenBlockAccumulator = 0;
	}

	__syncthreads();

	for(PxU32 i = 0; i < nbIterationsPerBlock; ++i)
	{
		// Each block processes a contiguous range of nbIterationsPerBlock*blockDim.x shapes.
		const PxU32 workIndex = i*WARP_SIZE*WARP_PERBLOCK_SIZE + idx + nbIterationsPerBlock * blockIdx.x * blockDim.x;

		//frozen/unfrozen is either 0 or 1
		PxU32 frozen = 0, unfrozen = 0;
		if(workIndex < gNbTotalShapes)
		{
			frozen = gFrozen[workIndex];
			unfrozen = gUnfrozen[workIndex];
		}

		// Mask of lanes below this one: popc of the ballot restricted to it
		// yields this lane's exclusive intra-warp prefix sum.
		const PxU32 threadMask = (1<<threadIndexInWarp)-1;

		const PxU32 frozenAccumVal = __popc(__ballot_sync(FULL_MASK, frozen)&threadMask);
		const PxU32 unfrozenAccumVal = __popc(__ballot_sync(FULL_MASK, unfrozen)&threadMask);

		// Last lane holds the warp's inclusive total; publish it.
		if(threadIndexInWarp == (WARP_SIZE-1))
		{
			sFrozenWarpAccumulator[warpIndex] = frozenAccumVal + frozen;
			sUnFrozenWarpAccumulator[warpIndex] = unfrozenAccumVal + unfrozen;
		}

		// Snapshot the block totals from previous iterations BEFORE the first
		// warp updates them below (ordering matters here).
		const PxU32 prevFrozenBlockAccumulator = sFrozenBlockAccumulator;
		const PxU32 prevUnfrozenBlockAccumulator = sUnfrozenBlockAccumulator;

		__syncthreads();

		// First WARP_PERBLOCK_SIZE threads scan the per-warp totals.
		unsigned mask_idx = __ballot_sync(FULL_MASK, idx < WARP_PERBLOCK_SIZE);
		if(idx < WARP_PERBLOCK_SIZE)
		{
			const PxU32 frozenValue = sFrozenWarpAccumulator[threadIndexInWarp];
			const PxU32 unfrozenValue = sUnFrozenWarpAccumulator[threadIndexInWarp];
			// Inclusive warp scan minus own value = exclusive prefix per warp.
			const PxU32 frozenOutput = warpScan<AddOpPxU32, PxU32, LOG2_WARP_PERBLOCK_SIZE>(mask_idx, frozenValue) - frozenValue;
			const PxU32 unfrozenOutput = warpScan<AddOpPxU32, PxU32, LOG2_WARP_PERBLOCK_SIZE>(mask_idx, unfrozenValue) - unfrozenValue;
			sFrozenWarpAccumulator[threadIndexInWarp] = frozenOutput;
			sUnFrozenWarpAccumulator[threadIndexInWarp] = unfrozenOutput;
			//const PxU32 output = warpScanAddWriteToSharedMem<WARP_PERBLOCK_SIZE>(idx, threadIndexInWarp, sWarpAccumulator, value, value);
			// Last scanning lane folds this iteration's block total into the
			// running accumulators.
			if(threadIndexInWarp == (WARP_PERBLOCK_SIZE-1))
			{
				sFrozenBlockAccumulator +=(frozenOutput + frozenValue);
				sUnfrozenBlockAccumulator +=(unfrozenOutput + unfrozenValue);
			}
		}

		__syncthreads();

		if(workIndex < gNbTotalShapes)
		{
			//Now output both histograms: lane prefix + carried block total + warp prefix.
			gFrozen[workIndex] = frozenAccumVal + prevFrozenBlockAccumulator + sFrozenWarpAccumulator[warpIndex];
			gUnfrozen[workIndex] = unfrozenAccumVal + prevUnfrozenBlockAccumulator + sUnFrozenWarpAccumulator[warpIndex];
		}
	}

	// Publish this block's grand totals for the second pass.
	if(threadIdx.x == 0)
	{
		gFrozenBlock[blockIdx.x] = sFrozenBlockAccumulator;
		gUnfrozenBlock[blockIdx.x] = sUnfrozenBlockAccumulator;
	}
}
|
||||
|
||||
// Second pass of the frozen/unfrozen prefix sum. Scans the per-block totals
// produced by computeFrozenAndUnfrozenHistogramLaunch (exactly 32 of them, so
// one warp suffices), adds each block's exclusive offset back onto that block's
// range of gFrozen/gUnfrozen, and writes the grand totals into the descriptor.
// Every block redundantly scans the same 32 totals; only global thread 31
// (which holds the last inclusive value) writes the totals.
extern "C" __global__ void outputFrozenAndUnfrozenHistogram(PxgSimulationCoreDesc* scDesc)
{
	const PxU32 nbBlocks = PxgSimulationCoreKernelGridDim::OUTPUT_FROZEN_UNFROZEN_HISTOGRAM;
	PX_COMPILE_TIME_ASSERT(nbBlocks == 32); // one warp must cover all block totals

	__shared__ PxU32 sFrozenBlockAccum[nbBlocks];
	__shared__ PxU32 sUnfrozenBlockAccum[nbBlocks];

	const PxU32 idx = threadIdx.x;

	PxU32* gFrozen = scDesc->mFrozen;
	PxU32* gUnfrozen = scDesc->mUnfrozen;
	PxU32* gFrozenBlock = scDesc->mFrozenBlockAndRes;
	PxU32* gUnfrozenBlock = scDesc->mUnfrozenBlockAndRes;

	const PxU32 gNbTotalShapes = scDesc->mNbTotalShapes;
	const PxU32 globalThreadIndex = threadIdx.x + blockDim.x*blockIdx.x;

	PxU32 frozen = 0;
	PxU32 frozenOutput = 0;
	PxU32 unfrozen = 0;
	PxU32 unfrozenOutput = 0;

	// Exclusive scan of the 32 per-block totals within the first warp.
	unsigned mask_idx = __ballot_sync(FULL_MASK, idx < nbBlocks);
	if(idx < nbBlocks)
	{
		frozen = gFrozenBlock[idx];
		frozenOutput = warpScan<AddOpPxU32, PxU32>(mask_idx, frozen) - frozen;
		sFrozenBlockAccum[idx] = frozenOutput;

		unfrozen = gUnfrozenBlock[idx];
		unfrozenOutput = warpScan<AddOpPxU32, PxU32>(mask_idx, unfrozen) - unfrozen;
		sUnfrozenBlockAccum[idx] = unfrozenOutput;
	}

	// Thread 31 of block 0 holds the last exclusive prefix + last total,
	// i.e. the grand totals.
	if(globalThreadIndex == (nbBlocks-1))
	{
		scDesc->mTotalFrozenShapes = frozenOutput + frozen;
		scDesc->mTotalUnfrozenShapes = unfrozenOutput + unfrozen;
	}

	const PxU32 totalBlockRequired = (gNbTotalShapes + (blockDim.x-1))/ blockDim.x;

	const PxU32 numIterationPerBlock = (totalBlockRequired + (nbBlocks-1))/ nbBlocks;

	__syncthreads();

	// Each block adds its own exclusive offset to its range of the histograms.
	// Must match the range partitioning used by the first pass.
	const PxU32 frozenBlockAccum = sFrozenBlockAccum[blockIdx.x];
	const PxU32 unfrozenBlockAccum = sUnfrozenBlockAccum[blockIdx.x];

	for(PxU32 i=0; i<numIterationPerBlock; ++i)
	{
		const PxU32 workIndex = i * blockDim.x + idx + numIterationPerBlock * blockIdx.x * blockDim.x;

		if(workIndex < gNbTotalShapes)
		{
			gFrozen[workIndex] = gFrozen[workIndex] + frozenBlockAccum;
			gUnfrozen[workIndex] = gUnfrozen[workIndex] + unfrozenBlockAccum;
		}
	}
}
|
||||
|
||||
// Final stage of the frozen/unfrozen pipeline: converts the exclusive prefix
// sums in gFrozen/gUnfrozen into compacted index lists. For each output rank i
// (0..totalFrozen-1 resp. 0..totalUnfrozen-1), a binary search over the prefix
// array recovers the shape index whose flag contributed that rank, and stores
// it in gFrozenRes/gUnfrozenRes. The result buffers reuse mFrozenBlockAndRes /
// mUnfrozenBlockAndRes, whose per-block totals are no longer needed at this point.
extern "C" __global__ void createFrozenAndUnfrozenArray(const PxgSimulationCoreDesc* scDesc)
{
	PxU32* gFrozen = scDesc->mFrozen;
	PxU32* gUnfrozen = scDesc->mUnfrozen;

	PxU32* gFrozenRes = scDesc->mFrozenBlockAndRes;
	PxU32* gUnfrozenRes = scDesc->mUnfrozenBlockAndRes;

	const PxU32 gNbTotalShapes = scDesc->mNbTotalShapes;
	const PxU32 gNbFrozenTotalShapes = scDesc->mTotalFrozenShapes;
	const PxU32 gNbUnfrozenTotalShapes = scDesc->mTotalUnfrozenShapes;

	const PxU32 idx = threadIdx.x + blockIdx.x * blockDim.x;

	// Grid-stride: one thread per frozen output slot.
	for(PxU32 i=idx; i < gNbFrozenTotalShapes; i+= blockDim.x * gridDim.x)
	{
		gFrozenRes[i] = binarySearch<PxU32>(gFrozen, gNbTotalShapes, i);
	}

	// Grid-stride: one thread per unfrozen output slot.
	for(PxU32 i=idx; i< gNbUnfrozenTotalShapes; i+= blockDim.x * gridDim.x)
	{
		gUnfrozenRes[i] = binarySearch<PxU32>(gUnfrozen, gNbTotalShapes, i);
	}
}
|
||||
Reference in New Issue
Block a user