feat(physics): wire physx sdk into build

2026-04-15 12:22:15 +08:00
parent 5bf258df6d
commit 31f40e2cbb
2044 changed files with 752623 additions and 1 deletions
--- a/engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/rigidDeltaAccum.cu
+++ b/engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/rigidDeltaAccum.cu
@@ -0,0 +1,875 @@
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+#include "vector_types.h"
+#include "foundation/PxVec3.h"
+#include "cutil_math.h"
+#include "PxgParticleSystemCoreKernelIndices.h"
+#include "reduction.cuh"
+#include "shuffle.cuh"
+#include "PxgSolverCoreDesc.h"
+#include "PxNodeIndex.h"
+#include "assert.h"
+#include "PxgSimulationCoreDesc.h"
+#include "PxgArticulationCoreDesc.h"
+
+using namespace physx;
+
+extern "C" __host__ void initSimulationControllerKernels2() {}
+
+/*
+ * This kernel takes a *sorted* list of PxNodeIndex (of size *numContacts), 
+ * and a corresponding deltaV list.
+ * 
+ * deltaV is updated with the cumulativ delta values, such that for each last entry of a rigid body deltaV is the total sum 
+ * for that rigid body - *however* only for the rigid body entries which are processed within one block.
+ * blockRigidId, blockDeltaV are updated, such that entry represents the deltaV sum and rigid body ID, of the last
+ * occuring rigid body in that corresponding block. This can then be used in the subsequent kernel to complete the sum 
+ * for rigid bodies, which entries are overlapping a block.
+ */
+extern "C" __global__ void accumulateDeltaVRigidFirstLaunch(
+	const PxU64*					sortedRigidIds,			//input
+	const PxU32*					numContacts,			//input
+	float4*							deltaV,					//input/output
+	float4*							blockDeltaV,			//output
+	PxU64*							blockRigidId			//output
+)
+{
+	__shared__ PxU64 sRigidId[PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + 1];
+
+	//numWarpsPerBlock can't be larger than 32
+	const PxU32 numWarpsPerBlock = PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA / WARP_SIZE;
+
+	__shared__ float4 sLinWarpAccumulator[WARP_SIZE];
+	__shared__ float4 sAngWarpAccumulator[WARP_SIZE];
+	__shared__ PxU64  sWarpRigidId[WARP_SIZE];
+	__shared__ float4 sLinBlockAccumulator;
+	__shared__ float4 sAngBlockAccumulator;
+	__shared__ PxU64 sBlockRigidId;
+
+	const PxU32 tNumContacts = *numContacts;
+	const PxU32 nbBlocksRequired = (tNumContacts + blockDim.x - 1) / blockDim.x;
+	const PxU32 nbIterationsPerBlock = (nbBlocksRequired + gridDim.x - 1) / gridDim.x;
+	const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE - 1);
+	const PxU32 warpIndex = threadIdx.x / WARP_SIZE;
+
+	if (threadIdx.x < 4)
+	{
+		float* tLinBlockAccumulator = reinterpret_cast<float*>(&sLinBlockAccumulator.x);
+		tLinBlockAccumulator[threadIdx.x] = 0.f;
+
+		float* tAngBlockAccumulator = reinterpret_cast<float*>(&sAngBlockAccumulator.x);
+		tAngBlockAccumulator[threadIdx.x] = 0.f;
+
+		if (threadIdx.x == 0)
+			sBlockRigidId = 0x8fffffffffffffff;
+	}
+	__syncthreads();
+
+	for (PxU32 i = 0; i < nbIterationsPerBlock; ++i)
+	{
+		const PxU32 workIndex = blockDim.x*(blockIdx.x*nbIterationsPerBlock + i) + threadIdx.x;
+
+		PxU64 rigidId = 0x8fffffffffffffff;
+		float4 linDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
+		float4 angDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
+		if (workIndex < tNumContacts)
+		{
+			rigidId = sortedRigidIds[workIndex];
+			sRigidId[threadIdx.x] = rigidId;
+			linDeltaV = deltaV[workIndex];
+			angDeltaV = deltaV[workIndex + tNumContacts];
+		}
+		__syncthreads();
+
+		for (PxU32 reductionRadius = 1; reductionRadius < WARP_SIZE; reductionRadius <<= 1)
+		{
+			const PxU32 lane = threadIndexInWarp - reductionRadius;
+
+			float4 linVal = shuffle(FULL_MASK, linDeltaV, lane);
+			float4 angVal = shuffle(FULL_MASK, angDeltaV, lane);
+
+			//workIndex < tNumContacts guarantees that sRigidId[WARP_SIZE * warpIndex + lane]
+			//always points to initialized memory, since lane is always smaller than threadIndexInWarp.
+			if (threadIndexInWarp >= reductionRadius && workIndex < tNumContacts && 
+				rigidId == sRigidId[WARP_SIZE * warpIndex + lane])
+			{
+				linDeltaV += linVal;
+				angDeltaV += angVal;
+			}
+		}
+
+		if (threadIndexInWarp == (WARP_SIZE - 1))
+		{
+			sLinWarpAccumulator[warpIndex] = linDeltaV;
+			sAngWarpAccumulator[warpIndex] = angDeltaV;
+			sWarpRigidId[warpIndex] = rigidId;
+		}
+
+		const float4 prevLinBlockAccumulator = sLinBlockAccumulator;
+		const float4 prevAngBlockAccumulator = sAngBlockAccumulator;
+		const PxU64 prevBlockRigidId = sBlockRigidId;
+
+		//Don't allow write until we've finished all reading...
+		__syncthreads();
+
+		if (warpIndex == 0)
+		{
+			float4 linDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
+			float4 angDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
+
+			PxU64 warpRigidId = 0x8fffffffffffffff;
+			if (threadIndexInWarp < numWarpsPerBlock)
+			{
+				linDeltaV = sLinWarpAccumulator[threadIndexInWarp];
+				angDeltaV = sAngWarpAccumulator[threadIndexInWarp];
+				warpRigidId = sWarpRigidId[threadIndexInWarp];
+			}
+
+			float4 tLinDeltaV = linDeltaV;
+			float4 tAngDeltaV = angDeltaV;
+
+			for (PxU32 reductionRadius = 1; reductionRadius < numWarpsPerBlock; reductionRadius <<= 1)
+			{
+				const PxU32 lane = threadIndexInWarp - reductionRadius;
+				float4 linVal = shuffle(FULL_MASK, tLinDeltaV, lane);
+				float4 angVal = shuffle(FULL_MASK, tAngDeltaV, lane);
+
+				if (threadIndexInWarp >= reductionRadius && warpRigidId == sWarpRigidId[lane])
+				{
+					tLinDeltaV += linVal;
+					tAngDeltaV += angVal;
+				}
+			}
+
+			if (threadIndexInWarp == (numWarpsPerBlock - 1))
+			{
+				if (sBlockRigidId != warpRigidId)
+				{
+					//need to clear block accumulators in case previous iteration
+					//stored other sBlockRigidId
+					sLinBlockAccumulator = make_float4(0.f, 0.f, 0.f, 0.f);
+					sAngBlockAccumulator = make_float4(0.f, 0.f, 0.f, 0.f);
+				}
+				sLinBlockAccumulator += tLinDeltaV;
+				sAngBlockAccumulator += tAngDeltaV;
+				sBlockRigidId = warpRigidId;
+			}
+
+			sLinWarpAccumulator[threadIndexInWarp] = tLinDeltaV;
+			sAngWarpAccumulator[threadIndexInWarp] = tAngDeltaV;
+		}
+		__syncthreads();
+
+		if (workIndex < tNumContacts)
+		{
+			float4 accumLin = make_float4(0.f, 0.f, 0.f, 0.f);
+			float4 accumAng = make_float4(0.f, 0.f, 0.f, 0.f);
+
+			//if rigidId and the previous element rigid Id is the same, we need to add the previous warp accumulate velocity to
+			//the current rigid body
+			if (warpIndex > 0 && rigidId == sWarpRigidId[warpIndex - 1])
+			{
+				accumLin = sLinWarpAccumulator[warpIndex - 1];
+				accumAng = sAngWarpAccumulator[warpIndex - 1];
+			}
+
+			if (i != 0 && rigidId == prevBlockRigidId)
+			{
+				accumLin += prevLinBlockAccumulator;
+				accumAng += prevAngBlockAccumulator;
+			}
+
+			//Now output both offsets...
+			deltaV[workIndex] = linDeltaV + accumLin;
+			deltaV[workIndex + tNumContacts] = angDeltaV + accumAng;
+		}
+	}
+
+	if (threadIdx.x == 0)
+	{
+		blockDeltaV[blockIdx.x] = sLinBlockAccumulator;
+		blockDeltaV[blockIdx.x + PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA] = sAngBlockAccumulator;
+		blockRigidId[blockIdx.x] = sBlockRigidId;
+	}
+}
+
+
+//32 blocks. Each block compute the exclusive ransum for the blockOffset
+extern "C" __global__ void accumulateDeltaVRigidSecondLaunch(
+	const PxU64*								sortedRigidIds,			//input
+	const PxU32*								numContacts,			//input
+	const float4*								deltaV,					//input
+	const float4*								blockDeltaV,			//input
+	const PxU64*								blockRigidId,			//input
+	PxgPrePrepDesc*								prePrepDesc,
+	PxgSolverCoreDesc*							solverCoreDesc,
+	PxgArticulationCoreDesc*					artiCoreDesc,
+	PxgSolverSharedDesc<IterativeSolveData>*	sharedDesc,
+	const bool isTGS
+)
+{
+	__shared__ float4 sBlockLinDeltaV[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
+	__shared__ float4 sBlockAngDeltaV[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
+	__shared__ PxU64 sBlockRigidId[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
+	__shared__ PxU64 sRigidId[PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + 1];
+
+	const PxU32 tNumContacts = *numContacts;
+	const PxU32 nbBlocksRequired = (tNumContacts + blockDim.x - 1) / blockDim.x;
+	const PxU32 nbIterationsPerBlock = (nbBlocksRequired + gridDim.x - 1) / gridDim.x;
+	const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE - 1);
+
+	float4 linBlockDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
+	float4 angBlockDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
+
+	PxU64 tBlockRigidId = 0x8fffffffffffffff;
+	if (threadIdx.x < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA)
+	{
+		linBlockDeltaV = blockDeltaV[threadIdx.x];
+		angBlockDeltaV = blockDeltaV[threadIdx.x + PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
+		tBlockRigidId = blockRigidId[threadIdx.x];
+
+		sBlockLinDeltaV[threadIdx.x] = linBlockDeltaV;
+		sBlockAngDeltaV[threadIdx.x] = angBlockDeltaV;
+		sBlockRigidId[threadIdx.x] = tBlockRigidId;
+	}
+
+	__syncthreads(); //sBlockRigidId is written above and read below
+
+	float4 tLinDeltaV = linBlockDeltaV;
+	float4 tAngDeltaV = angBlockDeltaV;
+	//add on block deltaV if blockRigid id match
+	for (PxU32 reductionRadius = 1; reductionRadius < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA; reductionRadius <<= 1)
+	{
+		const PxU32 lane = threadIndexInWarp - reductionRadius;
+		float4 linVal = shuffle(FULL_MASK, tLinDeltaV, lane);
+		float4 angVal = shuffle(FULL_MASK, tAngDeltaV, lane);
+
+		if (threadIndexInWarp >= reductionRadius && tBlockRigidId == sBlockRigidId[lane])
+		{
+			tLinDeltaV += linVal;
+			tAngDeltaV += angVal;
+		}
+	}
+
+	__syncthreads(); //sBlockRigidId is read above and written below
+
+	if (threadIdx.x < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA)
+	{
+		sBlockLinDeltaV[threadIdx.x] = tLinDeltaV;
+		sBlockAngDeltaV[threadIdx.x] = tAngDeltaV;
+		sBlockRigidId[threadIdx.x] = blockRigidId[threadIdx.x];
+	}
+
+	__syncthreads();
+
+	float4* solverBodyDeltaVel = sharedDesc->iterativeData.solverBodyVelPool + solverCoreDesc->accumulatedBodyDeltaVOffset;
+	//float4* initialVel = solverCoreDesc->outSolverVelocity;
+	const PxU32 numSolverBodies = solverCoreDesc->numSolverBodies;
+
+	PxgArticulationBlockData* artiData = artiCoreDesc->mArticulationBlocks;
+	PxgArticulationBlockLinkData* artiLinkData = artiCoreDesc->mArticulationLinkBlocks;
+
+	const PxU32 maxLinks = artiCoreDesc->mMaxLinksPerArticulation;
+	
+	for (PxU32 i = 0; i < nbIterationsPerBlock; ++i)
+	{
+		__syncthreads(); //sRigidId is read and written in the same loop - read and write must be separated by syncs
+
+		const PxU32 workIndex = blockDim.x * (blockIdx.x * nbIterationsPerBlock + i) + threadIdx.x;
+
+		PxU64 rigidId = 0x8fffffffffffffff;
+		if (workIndex < tNumContacts)
+		{
+			rigidId = sortedRigidIds[workIndex];
+			if (threadIdx.x > 0)
+				sRigidId[threadIdx.x - 1] = rigidId;
+
+			if (workIndex == tNumContacts - 1)
+			{
+				sRigidId[threadIdx.x] = 0x8fffffffffffffff;
+			}
+			else if (threadIdx.x == PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA - 1)
+			{
+				// first thread in block must load neighbor particle 
+				sRigidId[threadIdx.x] = sortedRigidIds[workIndex + 1];
+			}
+		}
+
+		__syncthreads();
+
+		if (workIndex < tNumContacts)
+		{
+			float4 accumLin = make_float4(0.f, 0.f, 0.f, 0.f);
+			float4 accumAng = make_float4(0.f, 0.f, 0.f, 0.f);
+
+			if (rigidId != sRigidId[threadIdx.x])
+			{
+				float4 linVel = deltaV[workIndex];
+				float4 angVel = deltaV[workIndex + tNumContacts];
+
+				PxU64 preBlockRigidId = blockIdx.x > 0 ? sBlockRigidId[blockIdx.x - 1] : 0x8fffffffffffffff;
+
+				if (rigidId == preBlockRigidId)
+				{
+					linVel += sBlockLinDeltaV[blockIdx.x - 1];
+					angVel += sBlockAngDeltaV[blockIdx.x - 1];
+				}
+
+				//nodeIndex
+				const PxNodeIndex nodeId = reinterpret_cast<PxNodeIndex&>(rigidId);
+
+				PxU32 solverBodyIndex = 0;
+
+				if (!nodeId.isStaticBody())
+				{
+					PxU32 nodeIndex = nodeId.index();
+					solverBodyIndex = prePrepDesc->solverBodyIndices[nodeIndex];
+
+					if (nodeId.isArticulation())
+					{
+						//solverBodyIndex is the globalThreadIndex for the active articulation in the block format
+						const PxU32 blockIndex = solverBodyIndex / WARP_SIZE;
+
+						PxgArticulationBlockData& articulation = artiData[blockIndex];
+						PxgArticulationBlockLinkData* artiLinks = &artiLinkData[blockIndex * maxLinks];
+						
+						
+						const PxU32 artiIndexInBlock = solverBodyIndex % WARP_SIZE;
+
+						articulation.mStateDirty[artiIndexInBlock] = PxgArtiStateDirtyFlag::eHAS_IMPULSES;
+
+						const PxU32 linkID = nodeId.articulationLinkId();
+
+						const PxReal denom = PxMax(1.0f, linVel.w);
+						PxReal ratio = 1.f / denom;
+						linVel.w = 0.f;
+
+						//for articulation, linVel and angVel accumulate impulse
+						Cm::UnAlignedSpatialVector impulse;
+						impulse.top = PxVec3(linVel.x, linVel.y, linVel.z );
+						impulse.bottom = PxVec3(angVel.x, angVel.y, angVel.z);
+
+						impulse.top *= ratio;
+						impulse.bottom *= ratio;
+						
+						/*printf("blockIndex %i artiIndexInBlock %i linkID %i ratio %f impulse linear(%f, %f, %f) angular(%f, %f, %f)\n", blockIndex, artiIndexInBlock, linkID, ratio,
+							impulse.top.x, impulse.top.y, impulse.top.z, impulse.bottom.x, impulse.bottom.y, impulse.bottom.z);*/
+					
+						storeSpatialVector(artiLinks[linkID].mScratchImpulse, -impulse, artiIndexInBlock);
+					}
+					else
+					{
+						float4 linearVelocity = solverBodyDeltaVel[solverBodyIndex];
+						float4 angularVelocity = solverBodyDeltaVel[solverBodyIndex + numSolverBodies];
+
+						const PxReal denom = PxMax(1.0f, linVel.w);
+						PxReal ratio = 1.f / denom;
+						linVel.w = 0.f;
+
+						if (isTGS)
+						{
+							linearVelocity.x += linVel.x * ratio;
+							linearVelocity.y += linVel.y * ratio;
+							linearVelocity.z += linVel.z * ratio;
+							linearVelocity.w += angVel.x * ratio;
+							angularVelocity.x += angVel.y * ratio;
+							angularVelocity.y += angVel.z * ratio;
+							//The rest is the delta position buffer
+						}
+						else
+						{
+							/*assert(PxIsFinite(linVel.x)); assert(PxIsFinite(linVel.y)); assert(PxIsFinite(linVel.z));
+							assert(PxIsFinite(angVel.x)); assert(PxIsFinite(angVel.y)); assert(PxIsFinite(angVel.z));
+							assert(PxIsFinite(ratio));*/
+
+							/*printf("Accum linVelDelta = (%f, %f, %f, %f), angVelDelta = (%f, %f, %f, %f), ratio = %f, denom = %f, globalRelax = %f\n",
+								linVel.x, linVel.y, linVel.z, linVel.w, angVel.x, angVel.y, angVel.z, angVel.w, ratio, denom, globalRelaxationCoefficient);*/
+
+							linearVelocity += linVel * ratio;
+							angularVelocity += angVel * ratio;
+						}
+
+						solverBodyDeltaVel[solverBodyIndex] = linearVelocity;
+						solverBodyDeltaVel[solverBodyIndex + numSolverBodies] = angularVelocity;
+
+					}
+					//printf("solverBodyIndex %i\n", solverBodyIndex);
+					//printf("linearVelocity(%f, %f, %f, %f)\n", linearVelocity.x, linearVelocity.y, linearVelocity.z, linearVelocity.w);
+					//printf("angularVelocity(%f, %f, %f, %f)\n", angularVelocity.x, angularVelocity.y, angularVelocity.z, angularVelocity.w);
+				}
+			}
+		}
+	}
+}
+
+
+//32 blocks. Each block compute the exclusive ransum for the blockOffset
+extern "C" __global__ void clearDeltaVRigidSecondLaunchMulti(
+	PxU64*							sortedRigidIds,			//input
+	PxU32*							numContacts,			//input
+	PxgPrePrepDesc*					prePrepDesc,
+	PxgSolverCoreDesc*				solverCoreDesc,
+	PxgArticulationCoreDesc*		artiCoreDesc,
+	PxReal*							tempDenom
+)
+{
+	__shared__ PxU64 sRigidId[PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + 1];
+
+	const PxU32 tNumContacts = *numContacts;
+	const PxU32 idx = threadIdx.x;
+
+	const PxU32 totalBlockRequired = (tNumContacts + (PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA - 1)) / PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA;
+	const PxU32 numIterationPerBlock = (totalBlockRequired + (PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA - 1)) / PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA;
+
+	PxgArticulationBlockLinkData* artiLinkData = artiCoreDesc->mArticulationLinkBlocks;
+
+	const PxU32 maxLinks = artiCoreDesc->mMaxLinksPerArticulation;
+
+	for (PxU32 i = 0; i < numIterationPerBlock; ++i)
+	{
+		const PxU32 workIndex = i * PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + idx + numIterationPerBlock * blockIdx.x * blockDim.x;
+
+		PxU64 rigidId = 0x8fffffffffffffff;
+		if (workIndex < tNumContacts)
+		{
+			rigidId = sortedRigidIds[workIndex];
+			if (idx > 0)
+				sRigidId[idx - 1] = rigidId;
+
+			if (workIndex == tNumContacts - 1)
+			{
+				sRigidId[idx] = 0x8fffffffffffffff;
+			}
+			else if (threadIdx.x == PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA - 1)
+			{
+				// first thread in block must load neighbor particle 
+				sRigidId[idx] = sortedRigidIds[workIndex + 1];
+			}
+
+
+		}
+
+		__syncthreads();
+
+		if (workIndex < tNumContacts)
+		{
+			if (rigidId != sRigidId[idx])
+			{
+
+				//nodeIndex
+				const PxNodeIndex nodeId = reinterpret_cast<PxNodeIndex&>(rigidId);
+
+				PxU32 solverBodyIndex = 0;
+
+				if (!nodeId.isStaticBody())
+				{
+					PxU32 nodeIndex = nodeId.index();
+					solverBodyIndex = prePrepDesc->solverBodyIndices[nodeIndex];
+
+					if (nodeId.isArticulation())
+					{
+						//solverBodyIndex is the globalThreadIndex for the active articulation in the block format
+						const PxU32 blockIndex = solverBodyIndex / WARP_SIZE;
+
+						PxgArticulationBlockLinkData* artiLinks = &artiLinkData[blockIndex * maxLinks];
+
+						const PxU32 artiIndexInBlock = solverBodyIndex % WARP_SIZE;
+
+						const PxU32 linkID = nodeId.articulationLinkId();
+
+						artiLinks[linkID].mDeltaScale[artiIndexInBlock] = 0.f;
+					}
+					else
+					{
+						tempDenom[solverBodyIndex] = 0.f;
+					}
+				}
+			}
+		}
+	}
+}
+
+
+
+//32 blocks. Each block compute the exclusive ransum for the blockOffset
+extern "C" __global__ void accumulateDeltaVRigidSecondLaunchMultiStage1(
+	PxU64*							sortedRigidIds,			//input
+	PxU32*							numContacts,			//input
+	float4*							deltaV,					//input
+	float4*							blockDeltaV,			//input
+	PxU64*							blockRigidId,			//input
+	PxgPrePrepDesc*								prePrepDesc,
+	PxgSolverCoreDesc*							solverCoreDesc,
+	PxgArticulationCoreDesc*					artiCoreDesc,
+	PxgSolverSharedDesc<IterativeSolveData>*	sharedDesc,
+	PxReal*							tempDenom,
+	const bool useLocalRelax,
+	const float globalRelaxationCoefficient,
+	bool isTGS
+)
+{
+	__shared__ PxReal sBlockLinDeltaVW[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
+	__shared__ PxU64 sBlockRigidId[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
+	__shared__ PxU64 sRigidId[PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + 1];
+
+	const PxU32 tNumContacts = *numContacts;
+	const PxU32 idx = threadIdx.x;
+
+	const PxU32 threadIndexInWarp = idx & (WARP_SIZE - 1);
+
+	const PxU32 totalBlockRequired = (tNumContacts + (PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA - 1)) / PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA;
+
+	float4 linBlockDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
+	float4 angBlockDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
+
+	PxU64 tBlockRigidId = 0x8fffffffffffffff;
+	if (idx < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA)
+	{
+		linBlockDeltaV = blockDeltaV[idx];
+		tBlockRigidId = blockRigidId[idx];
+
+		sBlockLinDeltaVW[idx] = linBlockDeltaV.w;
+		sBlockRigidId[idx] = tBlockRigidId;
+	}
+
+	__syncthreads(); //sBlockRigidId is written above and read below
+
+	PxReal tLinDeltaVW = linBlockDeltaV.w;
+	//add on block deltaV if blockRigid id match
+	for (PxU32 reductionRadius = 1; reductionRadius < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA; reductionRadius <<= 1)
+	{
+		const PxU32 lane = threadIndexInWarp - reductionRadius;
+		PxReal w = __shfl_sync(FULL_MASK, tLinDeltaVW, lane);
+
+		if (threadIndexInWarp >= reductionRadius && tBlockRigidId == sBlockRigidId[lane])
+		{
+			tLinDeltaVW += w;
+		}
+	}
+
+	__syncthreads(); //sBlockRigidId is read above and written below
+
+	if (idx < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA)
+	{
+		sBlockLinDeltaVW[idx] = tLinDeltaVW;
+		sBlockRigidId[idx] = blockRigidId[idx];
+	}
+
+	const PxU32 numIterationPerBlock = (totalBlockRequired + (PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA - 1)) / PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA;
+
+	__syncthreads();
+
+	PxgArticulationBlockLinkData* artiLinkData = artiCoreDesc->mArticulationLinkBlocks;
+
+	const PxU32 maxLinks = artiCoreDesc->mMaxLinksPerArticulation;
+
+	for (PxU32 i = 0; i < numIterationPerBlock; ++i)
+	{
+		const PxU32 workIndex = i * PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + idx + numIterationPerBlock * blockIdx.x * blockDim.x;
+
+		PxU64 rigidId = 0x8fffffffffffffff;
+		if (workIndex < tNumContacts)
+		{
+			rigidId = sortedRigidIds[workIndex];
+			if (idx > 0)
+				sRigidId[idx - 1] = rigidId;
+
+			if (workIndex == tNumContacts - 1)
+			{
+				sRigidId[idx] = 0x8fffffffffffffff;
+			}
+			else if (threadIdx.x == PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA - 1)
+			{
+				// first thread in block must load neighbor particle 
+				sRigidId[idx] = sortedRigidIds[workIndex + 1];
+			}
+
+
+		}
+
+		__syncthreads();
+
+		if (workIndex < tNumContacts)
+		{
+			if (rigidId != sRigidId[idx])
+			{
+				PxReal linVelW = deltaV[workIndex].w;
+
+				PxU64 preBlockRigidId = blockIdx.x > 0 ? sBlockRigidId[blockIdx.x - 1] : 0x8fffffffffffffff;
+
+				if (rigidId == preBlockRigidId)
+				{
+					linVelW += sBlockLinDeltaVW[blockIdx.x - 1];
+				}
+
+				//nodeIndex
+				const PxNodeIndex nodeId = reinterpret_cast<PxNodeIndex&>(rigidId);
+
+				PxU32 solverBodyIndex = 0;
+
+				if (!nodeId.isStaticBody())
+				{
+					PxU32 nodeIndex = nodeId.index();
+					solverBodyIndex = prePrepDesc->solverBodyIndices[nodeIndex];
+
+					PxReal denom = globalRelaxationCoefficient;
+
+					if (useLocalRelax)
+						denom = PxMax(denom, linVelW);
+
+					if (nodeId.isArticulation())
+					{
+						//solverBodyIndex is the globalThreadIndex for the active articulation in the block format
+						const PxU32 blockIndex = solverBodyIndex / WARP_SIZE;
+
+						PxgArticulationBlockLinkData* artiLinks = &artiLinkData[blockIndex * maxLinks];
+
+						const PxU32 artiIndexInBlock = solverBodyIndex % WARP_SIZE;
+
+						const PxU32 linkID = nodeId.articulationLinkId();
+
+						atomicAdd(&artiLinks[linkID].mDeltaScale[artiIndexInBlock], denom);
+					}
+					else
+					{
+						atomicAdd(&tempDenom[solverBodyIndex], denom);
+						
+					}
+				}
+			}
+		}
+	}
+}
+
+//32 blocks. Each block compute the exclusive ransum for the blockOffset
+extern "C" __global__ void accumulateDeltaVRigidSecondLaunchMultiStage2(
+	PxU64*							sortedRigidIds,			//input
+	PxU32*							numContacts,			//input
+	float4*							deltaV,					//input
+	float4*							blockDeltaV,			//input
+	PxU64*							blockRigidId,			//input
+	PxgPrePrepDesc*								prePrepDesc,
+	PxgSolverCoreDesc*							solverCoreDesc,
+	PxgArticulationCoreDesc*					artiCoreDesc,
+	PxgSolverSharedDesc<IterativeSolveData>*	sharedDesc,
+	PxReal*							tempDenom,
+	const bool useLocalRelax,
+	const float globalRelaxationCoefficient,
+	bool isTGS
+)
+{
+	__shared__ float4 sBlockLinDeltaV[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
+	__shared__ float4 sBlockAngDeltaV[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
+	__shared__ PxU64 sBlockRigidId[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
+	__shared__ PxU64 sRigidId[PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + 1];
+
+	const PxU32 tNumContacts = *numContacts;
+	const PxU32 idx = threadIdx.x;
+
+	const PxU32 threadIndexInWarp = idx & (WARP_SIZE - 1);
+
+	const PxU32 totalBlockRequired = (tNumContacts + (PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA - 1)) / PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA;
+
+	float4 linBlockDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
+	float4 angBlockDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
+
+	PxU64 tBlockRigidId = 0x8fffffffffffffff;
+	if (idx < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA)
+	{
+		linBlockDeltaV = blockDeltaV[idx];
+		angBlockDeltaV = blockDeltaV[idx + PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
+		tBlockRigidId = blockRigidId[idx];
+
+		sBlockLinDeltaV[idx] = linBlockDeltaV;
+		sBlockAngDeltaV[idx] = angBlockDeltaV;
+		sBlockRigidId[idx] = tBlockRigidId;
+	}
+
+	__syncthreads(); //sBlockRigidId is written above and read below
+
+	float4 tLinDeltaV = linBlockDeltaV;
+	float4 tAngDeltaV = angBlockDeltaV;
+	//add on block deltaV if blockRigid id match
+	for (PxU32 reductionRadius = 1; reductionRadius < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA; reductionRadius <<= 1)
+	{
+		const PxU32 lane = threadIndexInWarp - reductionRadius;
+		float4 linVal = shuffle(FULL_MASK, tLinDeltaV, lane);
+		float4 angVal = shuffle(FULL_MASK, tAngDeltaV, lane);
+
+		if (threadIndexInWarp >= reductionRadius && tBlockRigidId == sBlockRigidId[lane])
+		{
+			tLinDeltaV += linVal;
+			tAngDeltaV += angVal;
+		}
+	}
+
+	__syncthreads(); //sBlockRigidId is read above and written below
+
+	if (idx < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA)
+	{
+		sBlockLinDeltaV[idx] = tLinDeltaV;
+		sBlockAngDeltaV[idx] = tAngDeltaV;
+		sBlockRigidId[idx] = blockRigidId[idx];
+	}
+
+	const PxU32 numIterationPerBlock = (totalBlockRequired + (PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA - 1)) / PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA;
+
+	__syncthreads();
+
+	float4* solverBodyDeltaVel = sharedDesc->iterativeData.solverBodyVelPool + solverCoreDesc->accumulatedBodyDeltaVOffset;
+	//float4* initialVel = solverCoreDesc->outSolverVelocity;
+	const PxU32 numSolverBodies = solverCoreDesc->numSolverBodies;
+
+	PxgArticulationBlockData* artiData = artiCoreDesc->mArticulationBlocks;
+	PxgArticulationBlockLinkData* artiLinkData = artiCoreDesc->mArticulationLinkBlocks;
+
+	const PxU32 maxLinks = artiCoreDesc->mMaxLinksPerArticulation;
+
+	for (PxU32 i = 0; i < numIterationPerBlock; ++i)
+	{
+		const PxU32 workIndex = i * PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + idx + numIterationPerBlock * blockIdx.x * blockDim.x;
+
+		PxU64 rigidId = 0x8fffffffffffffff;
+		if (workIndex < tNumContacts)
+		{
+			rigidId = sortedRigidIds[workIndex];
+			if (idx > 0)
+				sRigidId[idx - 1] = rigidId;
+
+			if (workIndex == tNumContacts - 1)
+			{
+				sRigidId[idx] = 0x8fffffffffffffff;
+			}
+			else if (threadIdx.x == PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA - 1)
+			{
+				// first thread in block must load neighbor particle 
+				sRigidId[idx] = sortedRigidIds[workIndex + 1];
+			}
+
+
+		}
+
+		__syncthreads();
+
+		if (workIndex < tNumContacts)
+		{
+			float4 accumLin = make_float4(0.f, 0.f, 0.f, 0.f);
+			float4 accumAng = make_float4(0.f, 0.f, 0.f, 0.f);
+
+			if (rigidId != sRigidId[idx])
+			{
+				float4 linVel = deltaV[workIndex];
+				float4 angVel = deltaV[workIndex + tNumContacts];
+
+				PxU64 preBlockRigidId = blockIdx.x > 0 ? sBlockRigidId[blockIdx.x - 1] : 0x8fffffffffffffff;
+
+				if (rigidId == preBlockRigidId)
+				{
+					linVel += sBlockLinDeltaV[blockIdx.x - 1];
+					angVel += sBlockAngDeltaV[blockIdx.x - 1];
+				}
+
+				//nodeIndex
+				const PxNodeIndex nodeId = reinterpret_cast<PxNodeIndex&>(rigidId);
+
+				PxU32 solverBodyIndex = 0;
+
+				if (!nodeId.isStaticBody())
+				{
+					PxU32 nodeIndex = nodeId.index();
+					solverBodyIndex = prePrepDesc->solverBodyIndices[nodeIndex];
+
+					if (nodeId.isArticulation())
+					{
+						//solverBodyIndex is the globalThreadIndex for the active articulation in the block format
+						const PxU32 blockIndex = solverBodyIndex / WARP_SIZE;
+
+						PxgArticulationBlockData& articulation = artiData[blockIndex];
+						PxgArticulationBlockLinkData* artiLinks = &artiLinkData[blockIndex * maxLinks];
+
+
+						const PxU32 artiIndexInBlock = solverBodyIndex % WARP_SIZE;
+
+						articulation.mStateDirty[artiIndexInBlock] = PxgArtiStateDirtyFlag::eHAS_IMPULSES;
+
+						const PxU32 linkID = nodeId.articulationLinkId();
+
+						PxReal denom = artiLinks[linkID].mDeltaScale[artiIndexInBlock];
+						PxReal ratio = 1.f / denom;
+
+						//for articulation, linVel and angVel accumulate impulse
+						Cm::UnAlignedSpatialVector impulse;
+						impulse.top = PxVec3(linVel.x, linVel.y, linVel.z);
+						impulse.bottom = PxVec3(angVel.x, angVel.y, angVel.z);
+
+						impulse.top *= ratio;
+						impulse.bottom *= ratio;
+
+						/*printf("blockIndex %i artiIndexInBlock %i linkID %i ratio %f impulse linear(%f, %f, %f) angular(%f, %f, %f)\n", blockIndex, artiIndexInBlock, linkID, ratio,
+							impulse.top.x, impulse.top.y, impulse.top.z, impulse.bottom.x, impulse.bottom.y, impulse.bottom.z);*/
+
+						atomicAddSpatialVector(artiLinks[linkID].mScratchImpulse, -impulse, artiIndexInBlock);
+					}
+					else
+					{
+						PxReal denom = tempDenom[solverBodyIndex];// globalRelaxationCoefficient;
+
+						PxReal ratio = 1.f / denom;
+
+						if (isTGS)
+						{
+							atomicAdd(&solverBodyDeltaVel[solverBodyIndex].x, linVel.x*ratio);
+							atomicAdd(&solverBodyDeltaVel[solverBodyIndex].y, linVel.y*ratio);
+							atomicAdd(&solverBodyDeltaVel[solverBodyIndex].z, linVel.z*ratio);
+							atomicAdd(&solverBodyDeltaVel[solverBodyIndex].w, angVel.x*ratio);
+							atomicAdd(&solverBodyDeltaVel[solverBodyIndex + numSolverBodies].x, angVel.y*ratio);
+							atomicAdd(&solverBodyDeltaVel[solverBodyIndex + numSolverBodies].y, angVel.z*ratio);
+							//The rest is the delta position buffer
+						}
+						else
+						{
+
+							atomicAdd(&solverBodyDeltaVel[solverBodyIndex].x, linVel.x*ratio);
+							atomicAdd(&solverBodyDeltaVel[solverBodyIndex].y, linVel.y*ratio);
+							atomicAdd(&solverBodyDeltaVel[solverBodyIndex].z, linVel.z*ratio);
+							atomicAdd(&solverBodyDeltaVel[solverBodyIndex + numSolverBodies].x, angVel.x*ratio);
+							atomicAdd(&solverBodyDeltaVel[solverBodyIndex + numSolverBodies].y, angVel.y*ratio);
+							atomicAdd(&solverBodyDeltaVel[solverBodyIndex + numSolverBodies].z, angVel.z*ratio);
+
+						}
+					}
+					//printf("solverBodyIndex %i\n", solverBodyIndex);
+					//printf("linearVelocity(%f, %f, %f, %f)\n", linearVelocity.x, linearVelocity.y, linearVelocity.z, linearVelocity.w);
+					//printf("angularVelocity(%f, %f, %f, %f)\n", angularVelocity.x, angularVelocity.y, angularVelocity.z, angularVelocity.w);
+				}
+			}
+		}
+	}
+}
+