// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#include "vector_types.h"
#include "foundation/PxVec3.h"
#include "cutil_math.h"
#include "PxgParticleSystemCoreKernelIndices.h"
#include "reduction.cuh"
#include "shuffle.cuh"
#include "PxgSolverCoreDesc.h"
#include "PxNodeIndex.h"
#include "assert.h"
#include "PxgSimulationCoreDesc.h"
#include "PxgArticulationCoreDesc.h"

using namespace physx;

extern "C" __host__ void initSimulationControllerKernels2() {}

/*
 * This kernel takes a *sorted* list of PxNodeIndex (of size *numContacts)
 * and a corresponding deltaV list.
 *
 * deltaV is updated with the cumulative delta values, such that the last entry
 * of each rigid body holds the total sum for that rigid body - *however* only
 * for the rigid body entries that are processed within a single block.
 * blockRigidId and blockDeltaV are updated such that each entry holds the
 * deltaV sum and rigid body ID of the last rigid body occurring in the
 * corresponding block. The subsequent kernel uses this to complete the sum for
 * rigid bodies whose entries span block boundaries.
 */
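//Together, the two launches implement a segmented inclusive scan over the
//sorted (rigidId, deltaV) pairs. A minimal serial reference model of that scan
//is sketched below for illustration only (the function name and the #if 0
//guard are ours, not part of the PhysX API):
#if 0
static void segmentedInclusiveScanRef(const PxU64* sortedRigidIds, const float4* deltaVIn,
	float4* deltaVOut, PxU32 numContacts)
{
	float4 runningSum = make_float4(0.f, 0.f, 0.f, 0.f);
	for (PxU32 i = 0; i < numContacts; ++i)
	{
		//A new rigid body ID starts a new segment, so the sum restarts.
		if (i == 0 || sortedRigidIds[i] != sortedRigidIds[i - 1])
			runningSum = make_float4(0.f, 0.f, 0.f, 0.f);
		runningSum += deltaVIn[i];
		//The last entry of each segment ends up holding that body's total.
		deltaVOut[i] = runningSum;
	}
}
#endif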
extern "C" __global__ void accumulateDeltaVRigidFirstLaunch(
	const PxU64* sortedRigidIds,	//input
	const PxU32* numContacts,		//input
	float4* deltaV,					//input/output
	float4* blockDeltaV,			//output
	PxU64* blockRigidId				//output
)
{
	__shared__ PxU64 sRigidId[PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + 1];

	//numWarpsPerBlock can't be larger than 32
	const PxU32 numWarpsPerBlock = PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA / WARP_SIZE;

	__shared__ float4 sLinWarpAccumulator[WARP_SIZE];
	__shared__ float4 sAngWarpAccumulator[WARP_SIZE];
	__shared__ PxU64 sWarpRigidId[WARP_SIZE];

	__shared__ float4 sLinBlockAccumulator;
	__shared__ float4 sAngBlockAccumulator;
	__shared__ PxU64 sBlockRigidId;

	const PxU32 tNumContacts = *numContacts;

	const PxU32 nbBlocksRequired = (tNumContacts + blockDim.x - 1) / blockDim.x;
	const PxU32 nbIterationsPerBlock = (nbBlocksRequired + gridDim.x - 1) / gridDim.x;

	const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE - 1);
	const PxU32 warpIndex = threadIdx.x / WARP_SIZE;

	if (threadIdx.x < 4)
	{
		float* tLinBlockAccumulator = reinterpret_cast<float*>(&sLinBlockAccumulator.x);
		tLinBlockAccumulator[threadIdx.x] = 0.f;

		float* tAngBlockAccumulator = reinterpret_cast<float*>(&sAngBlockAccumulator.x);
		tAngBlockAccumulator[threadIdx.x] = 0.f;

		if (threadIdx.x == 0)
			sBlockRigidId = 0x8fffffffffffffff;
	}

	__syncthreads();

	for (PxU32 i = 0; i < nbIterationsPerBlock; ++i)
	{
		const PxU32 workIndex = blockDim.x * (blockIdx.x * nbIterationsPerBlock + i) + threadIdx.x;

		PxU64 rigidId = 0x8fffffffffffffff;
		float4 linDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
		float4 angDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);

		if (workIndex < tNumContacts)
		{
			rigidId = sortedRigidIds[workIndex];
			sRigidId[threadIdx.x] = rigidId;

			linDeltaV = deltaV[workIndex];
			angDeltaV = deltaV[workIndex + tNumContacts];
		}

		__syncthreads();

		//Warp-level segmented inclusive scan: add the value reductionRadius lanes
		//to the left whenever it belongs to the same rigid body.
		for (PxU32 reductionRadius = 1; reductionRadius < WARP_SIZE; reductionRadius <<= 1)
		{
			const PxU32 lane = threadIndexInWarp - reductionRadius;

			float4 linVal = shuffle(FULL_MASK, linDeltaV, lane);
			float4 angVal = shuffle(FULL_MASK, angDeltaV, lane);

			//workIndex < tNumContacts guarantees that sRigidId[WARP_SIZE * warpIndex + lane]
			//always points to initialized memory, since lane is always smaller than threadIndexInWarp.
			if (threadIndexInWarp >= reductionRadius && workIndex < tNumContacts &&
				rigidId == sRigidId[WARP_SIZE * warpIndex + lane])
			{
				linDeltaV += linVal;
				angDeltaV += angVal;
			}
		}

		if (threadIndexInWarp == (WARP_SIZE - 1))
		{
			sLinWarpAccumulator[warpIndex] = linDeltaV;
			sAngWarpAccumulator[warpIndex] = angDeltaV;
			sWarpRigidId[warpIndex] = rigidId;
		}

		const float4 prevLinBlockAccumulator = sLinBlockAccumulator;
		const float4 prevAngBlockAccumulator = sAngBlockAccumulator;
		const PxU64 prevBlockRigidId = sBlockRigidId;

		//Don't allow write until we've finished all reading...
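		//The snapshots above hold the running block totals from the previous loop
		//iteration; they are reused after the barrier so threads can add the
		//carry-in without racing against warp 0's update of the shared copies.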
		__syncthreads();

		if (warpIndex == 0)
		{
			float4 linDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
			float4 angDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
			PxU64 warpRigidId = 0x8fffffffffffffff;

			if (threadIndexInWarp < numWarpsPerBlock)
			{
				linDeltaV = sLinWarpAccumulator[threadIndexInWarp];
				angDeltaV = sAngWarpAccumulator[threadIndexInWarp];
				warpRigidId = sWarpRigidId[threadIndexInWarp];
			}

			float4 tLinDeltaV = linDeltaV;
			float4 tAngDeltaV = angDeltaV;

			for (PxU32 reductionRadius = 1; reductionRadius < numWarpsPerBlock; reductionRadius <<= 1)
			{
				const PxU32 lane = threadIndexInWarp - reductionRadius;

				float4 linVal = shuffle(FULL_MASK, tLinDeltaV, lane);
				float4 angVal = shuffle(FULL_MASK, tAngDeltaV, lane);

				if (threadIndexInWarp >= reductionRadius && warpRigidId == sWarpRigidId[lane])
				{
					tLinDeltaV += linVal;
					tAngDeltaV += angVal;
				}
			}

			if (threadIndexInWarp == (numWarpsPerBlock - 1))
			{
				if (sBlockRigidId != warpRigidId)
				{
					//need to clear the block accumulators in case the previous
					//iteration stored a different sBlockRigidId
					sLinBlockAccumulator = make_float4(0.f, 0.f, 0.f, 0.f);
					sAngBlockAccumulator = make_float4(0.f, 0.f, 0.f, 0.f);
				}

				sLinBlockAccumulator += tLinDeltaV;
				sAngBlockAccumulator += tAngDeltaV;
				sBlockRigidId = warpRigidId;
			}

			sLinWarpAccumulator[threadIndexInWarp] = tLinDeltaV;
			sAngWarpAccumulator[threadIndexInWarp] = tAngDeltaV;
		}

		__syncthreads();

		if (workIndex < tNumContacts)
		{
			float4 accumLin = make_float4(0.f, 0.f, 0.f, 0.f);
			float4 accumAng = make_float4(0.f, 0.f, 0.f, 0.f);

			//if rigidId matches the last rigid id of the previous warp, we need to add
			//that warp's accumulated velocity to the current rigid body
			if (warpIndex > 0 && rigidId == sWarpRigidId[warpIndex - 1])
			{
				accumLin = sLinWarpAccumulator[warpIndex - 1];
				accumAng = sAngWarpAccumulator[warpIndex - 1];
			}

			if (i != 0 && rigidId == prevBlockRigidId)
			{
				accumLin += prevLinBlockAccumulator;
				accumAng += prevAngBlockAccumulator;
			}

			//Now output both offsets...
			deltaV[workIndex] = linDeltaV + accumLin;
			deltaV[workIndex + tNumContacts] = angDeltaV + accumAng;
		}
	}

	if (threadIdx.x == 0)
	{
		blockDeltaV[blockIdx.x] = sLinBlockAccumulator;
		blockDeltaV[blockIdx.x + PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA] = sAngBlockAccumulator;
		blockRigidId[blockIdx.x] = sBlockRigidId;
	}
}
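//A minimal host-side sketch of how the two launches are meant to be chained
//(illustration only; the device buffer names, the stream, and isTGS are
//assumptions, while the kernel names and launch dimensions come from this file):
#if 0
void accumulateRigidDeltaV(cudaStream_t stream)
{
	accumulateDeltaVRigidFirstLaunch<<<PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA,
		PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA, 0, stream>>>(
		dSortedRigidIds, dNumContacts, dDeltaV, dBlockDeltaV, dBlockRigidId);

	accumulateDeltaVRigidSecondLaunch<<<PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA,
		PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA, 0, stream>>>(
		dSortedRigidIds, dNumContacts, dDeltaV, dBlockDeltaV, dBlockRigidId,
		dPrePrepDesc, dSolverCoreDesc, dArtiCoreDesc, dSharedDesc, isTGS);
}
#endif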
//32 blocks. Each block computes the exclusive running sum over the per-block offsets.
extern "C" __global__ void accumulateDeltaVRigidSecondLaunch(
	const PxU64* sortedRigidIds,	//input
	const PxU32* numContacts,		//input
	const float4* deltaV,			//input
	const float4* blockDeltaV,		//input
	const PxU64* blockRigidId,		//input
	PxgPrePrepDesc* prePrepDesc,
	PxgSolverCoreDesc* solverCoreDesc,
	PxgArticulationCoreDesc* artiCoreDesc,
	PxgSolverSharedDesc<IterativeSolveData>* sharedDesc,
	const bool isTGS
)
{
	__shared__ float4 sBlockLinDeltaV[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
	__shared__ float4 sBlockAngDeltaV[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
	__shared__ PxU64 sBlockRigidId[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];

	__shared__ PxU64 sRigidId[PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + 1];

	const PxU32 tNumContacts = *numContacts;

	const PxU32 nbBlocksRequired = (tNumContacts + blockDim.x - 1) / blockDim.x;
	const PxU32 nbIterationsPerBlock = (nbBlocksRequired + gridDim.x - 1) / gridDim.x;

	const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE - 1);

	float4 linBlockDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
	float4 angBlockDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
	PxU64 tBlockRigidId = 0x8fffffffffffffff;

	if (threadIdx.x < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA)
	{
		linBlockDeltaV = blockDeltaV[threadIdx.x];
		angBlockDeltaV = blockDeltaV[threadIdx.x + PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
		tBlockRigidId = blockRigidId[threadIdx.x];

		sBlockLinDeltaV[threadIdx.x] = linBlockDeltaV;
		sBlockAngDeltaV[threadIdx.x] = angBlockDeltaV;
		sBlockRigidId[threadIdx.x] = tBlockRigidId;
	}

	__syncthreads(); //sBlockRigidId is written above and read below

	float4 tLinDeltaV = linBlockDeltaV;
	float4 tAngDeltaV = angBlockDeltaV;

	//add on block deltaV if the block rigid ids match
	for (PxU32 reductionRadius = 1; reductionRadius < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA; reductionRadius <<= 1)
	{
		const PxU32 lane = threadIndexInWarp - reductionRadius;

		float4 linVal = shuffle(FULL_MASK, tLinDeltaV, lane);
		float4 angVal = shuffle(FULL_MASK, tAngDeltaV, lane);

		if (threadIndexInWarp >= reductionRadius && tBlockRigidId == sBlockRigidId[lane])
		{
			tLinDeltaV += linVal;
			tAngDeltaV += angVal;
		}
	}

	__syncthreads(); //sBlockRigidId is read above and written below

	if (threadIdx.x < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA)
	{
		sBlockLinDeltaV[threadIdx.x] = tLinDeltaV;
		sBlockAngDeltaV[threadIdx.x] = tAngDeltaV;
		sBlockRigidId[threadIdx.x] = blockRigidId[threadIdx.x];
	}

	__syncthreads();

	float4* solverBodyDeltaVel = sharedDesc->iterativeData.solverBodyVelPool + solverCoreDesc->accumulatedBodyDeltaVOffset;
	//float4* initialVel = solverCoreDesc->outSolverVelocity;
	const PxU32 numSolverBodies = solverCoreDesc->numSolverBodies;

	PxgArticulationBlockData* artiData = artiCoreDesc->mArticulationBlocks;
	PxgArticulationBlockLinkData* artiLinkData = artiCoreDesc->mArticulationLinkBlocks;
	const PxU32 maxLinks = artiCoreDesc->mMaxLinksPerArticulation;

	for (PxU32 i = 0; i < nbIterationsPerBlock; ++i)
	{
		__syncthreads(); //sRigidId is read and written in the same loop - read and write must be separated by syncs

		const PxU32 workIndex = blockDim.x * (blockIdx.x * nbIterationsPerBlock + i) + threadIdx.x;

		PxU64 rigidId = 0x8fffffffffffffff;

		if (workIndex < tNumContacts)
		{
			rigidId = sortedRigidIds[workIndex];

			if (threadIdx.x > 0)
				sRigidId[threadIdx.x - 1] = rigidId;

			if (workIndex == tNumContacts - 1)
			{
				sRigidId[threadIdx.x] = 0x8fffffffffffffff;
			}
			else if (threadIdx.x == PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA - 1)
			{
				//the last thread in the block must load the first rigid id of the next range
				sRigidId[threadIdx.x] = sortedRigidIds[workIndex + 1];
			}
		}

		__syncthreads();

		if (workIndex < tNumContacts)
		{
			float4 accumLin = make_float4(0.f, 0.f, 0.f, 0.f);
			float4 accumAng = make_float4(0.f, 0.f, 0.f, 0.f);

			//only the last entry of each rigid body's segment carries the full sum
			if (rigidId != sRigidId[threadIdx.x])
			{
				float4 linVel = deltaV[workIndex];
				float4 angVel = deltaV[workIndex + tNumContacts];

				PxU64 preBlockRigidId = blockIdx.x > 0 ? sBlockRigidId[blockIdx.x - 1] : 0x8fffffffffffffff;
				if (rigidId == preBlockRigidId)
				{
					linVel += sBlockLinDeltaV[blockIdx.x - 1];
					angVel += sBlockAngDeltaV[blockIdx.x - 1];
				}

				//nodeIndex
				const PxNodeIndex nodeId = reinterpret_cast<const PxNodeIndex&>(rigidId);

				PxU32 solverBodyIndex = 0;
				if (!nodeId.isStaticBody())
				{
					PxU32 nodeIndex = nodeId.index();
					solverBodyIndex = prePrepDesc->solverBodyIndices[nodeIndex];

					if (nodeId.isArticulation())
					{
						//solverBodyIndex is the globalThreadIndex for the active articulation in the block format
						const PxU32 blockIndex = solverBodyIndex / WARP_SIZE;

						PxgArticulationBlockData& articulation = artiData[blockIndex];
						PxgArticulationBlockLinkData* artiLinks = &artiLinkData[blockIndex * maxLinks];

						const PxU32 artiIndexInBlock = solverBodyIndex % WARP_SIZE;

						articulation.mStateDirty[artiIndexInBlock] = PxgArtiStateDirtyFlag::eHAS_IMPULSES;

						const PxU32 linkID = nodeId.articulationLinkId();

						//linVel.w carries the accumulated per-contact weight; dividing by it averages the sum
						const PxReal denom = PxMax(1.0f, linVel.w);
						PxReal ratio = 1.f / denom;
						linVel.w = 0.f;

						//for articulations, linVel and angVel accumulate an impulse
						Cm::UnAlignedSpatialVector impulse;
						impulse.top = PxVec3(linVel.x, linVel.y, linVel.z);
						impulse.bottom = PxVec3(angVel.x, angVel.y, angVel.z);

						impulse.top *= ratio;
						impulse.bottom *= ratio;

						/*printf("blockIndex %i artiIndexInBlock %i linkID %i ratio %f impulse linear(%f, %f, %f) angular(%f, %f, %f)\n",
							blockIndex, artiIndexInBlock, linkID, ratio, impulse.top.x, impulse.top.y, impulse.top.z,
							impulse.bottom.x, impulse.bottom.y, impulse.bottom.z);*/

						storeSpatialVector(artiLinks[linkID].mScratchImpulse, -impulse, artiIndexInBlock);
					}
					else
					{
						float4 linearVelocity = solverBodyDeltaVel[solverBodyIndex];
						float4 angularVelocity = solverBodyDeltaVel[solverBodyIndex + numSolverBodies];

						//linVel.w carries the accumulated per-contact weight; dividing by it averages the sum
						const PxReal denom = PxMax(1.0f, linVel.w);
						PxReal ratio = 1.f / denom;
						linVel.w = 0.f;

						if (isTGS)
						{
							linearVelocity.x += linVel.x * ratio;
							linearVelocity.y += linVel.y * ratio;
							linearVelocity.z += linVel.z * ratio;
							linearVelocity.w += angVel.x * ratio;
							angularVelocity.x += angVel.y * ratio;
							angularVelocity.y += angVel.z * ratio;
							//The rest is the delta position buffer
						}
						else
						{
							/*assert(PxIsFinite(linVel.x)); assert(PxIsFinite(linVel.y)); assert(PxIsFinite(linVel.z));
							assert(PxIsFinite(angVel.x)); assert(PxIsFinite(angVel.y)); assert(PxIsFinite(angVel.z));
							assert(PxIsFinite(ratio));*/

							/*printf("Accum linVelDelta = (%f, %f, %f, %f), angVelDelta = (%f, %f, %f, %f), ratio = %f, denom = %f, globalRelax = %f\n",
								linVel.x, linVel.y, linVel.z, linVel.w, angVel.x, angVel.y, angVel.z, angVel.w, ratio, denom, globalRelaxationCoefficient);*/

							linearVelocity += linVel * ratio;
							angularVelocity += angVel * ratio;
						}

						solverBodyDeltaVel[solverBodyIndex] = linearVelocity;
						solverBodyDeltaVel[solverBodyIndex + numSolverBodies] = angularVelocity;
					}

					//printf("solverBodyIndex %i\n", solverBodyIndex);
					//printf("linearVelocity(%f, %f, %f, %f)\n", linearVelocity.x, linearVelocity.y, linearVelocity.z, linearVelocity.w);
					//printf("angularVelocity(%f, %f, %f, %f)\n", angularVelocity.x, angularVelocity.y, angularVelocity.z, angularVelocity.w);
				}
			}
		}
	}
}
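//In both write-back paths above, the w channel of the summed linear delta acts
//as an accumulated contact count/weight, so the body receives an averaged delta
//rather than the raw sum. Illustrative numbers: with three contacts each
//contributing (1, 0, 0, 1), the segmented sum is linVel = (3, 0, 0, 3),
//denom = max(1, 3) = 3, and the applied delta is (1, 0, 0).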
//32 blocks. Clears the accumulated denominators (per articulation link and per
//solver body) before the multi-stage accumulation below.
extern "C" __global__ void clearDeltaVRigidSecondLaunchMulti(
	PxU64* sortedRigidIds,	//input
	PxU32* numContacts,		//input
	PxgPrePrepDesc* prePrepDesc,
	PxgSolverCoreDesc* solverCoreDesc,
	PxgArticulationCoreDesc* artiCoreDesc,
	PxReal* tempDenom
)
{
	__shared__ PxU64 sRigidId[PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + 1];

	const PxU32 tNumContacts = *numContacts;
	const PxU32 idx = threadIdx.x;

	const PxU32 totalBlockRequired = (tNumContacts + (PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA - 1)) / PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA;
	const PxU32 numIterationPerBlock = (totalBlockRequired + (PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA - 1)) / PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA;

	PxgArticulationBlockLinkData* artiLinkData = artiCoreDesc->mArticulationLinkBlocks;
	const PxU32 maxLinks = artiCoreDesc->mMaxLinksPerArticulation;

	for (PxU32 i = 0; i < numIterationPerBlock; ++i)
	{
		const PxU32 workIndex = i * PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + idx + numIterationPerBlock * blockIdx.x * blockDim.x;

		PxU64 rigidId = 0x8fffffffffffffff;

		if (workIndex < tNumContacts)
		{
			rigidId = sortedRigidIds[workIndex];

			if (idx > 0)
				sRigidId[idx - 1] = rigidId;

			if (workIndex == tNumContacts - 1)
			{
				sRigidId[idx] = 0x8fffffffffffffff;
			}
			else if (threadIdx.x == PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA - 1)
			{
				//the last thread in the block must load the first rigid id of the next range
				sRigidId[idx] = sortedRigidIds[workIndex + 1];
			}
		}

		__syncthreads();

		if (workIndex < tNumContacts)
		{
			if (rigidId != sRigidId[idx])
			{
				//nodeIndex
				const PxNodeIndex nodeId = reinterpret_cast<const PxNodeIndex&>(rigidId);

				PxU32 solverBodyIndex = 0;
				if (!nodeId.isStaticBody())
				{
					PxU32 nodeIndex = nodeId.index();
					solverBodyIndex = prePrepDesc->solverBodyIndices[nodeIndex];

					if (nodeId.isArticulation())
					{
						//solverBodyIndex is the globalThreadIndex for the active articulation in the block format
						const PxU32 blockIndex = solverBodyIndex / WARP_SIZE;
						PxgArticulationBlockLinkData* artiLinks = &artiLinkData[blockIndex * maxLinks];
						const PxU32 artiIndexInBlock = solverBodyIndex % WARP_SIZE;
						const PxU32 linkID = nodeId.articulationLinkId();

						artiLinks[linkID].mDeltaScale[artiIndexInBlock] = 0.f;
					}
					else
					{
						tempDenom[solverBodyIndex] = 0.f;
					}
				}
			}
		}
	}
}
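//A minimal host-side sketch of the intended multi-stage sequence (illustration
//only; the buffer names, the stream, and the relaxation arguments are
//assumptions, while the kernel names and launch dimensions come from this
//file): clear the denominators, accumulate them in stage 1, then apply the
//scaled deltas in stage 2.
#if 0
void accumulateRigidDeltaVMulti(cudaStream_t stream)
{
	clearDeltaVRigidSecondLaunchMulti<<<PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA,
		PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA, 0, stream>>>(
		dSortedRigidIds, dNumContacts, dPrePrepDesc, dSolverCoreDesc, dArtiCoreDesc, dTempDenom);

	accumulateDeltaVRigidSecondLaunchMultiStage1<<<PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA,
		PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA, 0, stream>>>(
		dSortedRigidIds, dNumContacts, dDeltaV, dBlockDeltaV, dBlockRigidId,
		dPrePrepDesc, dSolverCoreDesc, dArtiCoreDesc, dSharedDesc, dTempDenom,
		useLocalRelax, globalRelaxationCoefficient, isTGS);

	accumulateDeltaVRigidSecondLaunchMultiStage2<<<PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA,
		PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA, 0, stream>>>(
		dSortedRigidIds, dNumContacts, dDeltaV, dBlockDeltaV, dBlockRigidId,
		dPrePrepDesc, dSolverCoreDesc, dArtiCoreDesc, dSharedDesc, dTempDenom,
		useLocalRelax, globalRelaxationCoefficient, isTGS);
}
#endif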
//32 blocks. Stage 1: completes the cross-block sum of the w channel (the
//relaxation denominator) and atomically accumulates it per articulation link /
//per solver body.
extern "C" __global__ void accumulateDeltaVRigidSecondLaunchMultiStage1(
	PxU64* sortedRigidIds,	//input
	PxU32* numContacts,		//input
	float4* deltaV,			//input
	float4* blockDeltaV,	//input
	PxU64* blockRigidId,	//input
	PxgPrePrepDesc* prePrepDesc,
	PxgSolverCoreDesc* solverCoreDesc,
	PxgArticulationCoreDesc* artiCoreDesc,
	PxgSolverSharedDesc<IterativeSolveData>* sharedDesc,
	PxReal* tempDenom,
	const bool useLocalRelax,
	const float globalRelaxationCoefficient,
	bool isTGS
)
{
	__shared__ PxReal sBlockLinDeltaVW[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
	__shared__ PxU64 sBlockRigidId[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];

	__shared__ PxU64 sRigidId[PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + 1];

	const PxU32 tNumContacts = *numContacts;
	const PxU32 idx = threadIdx.x;
	const PxU32 threadIndexInWarp = idx & (WARP_SIZE - 1);

	const PxU32 totalBlockRequired = (tNumContacts + (PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA - 1)) / PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA;

	float4 linBlockDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
	float4 angBlockDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
	PxU64 tBlockRigidId = 0x8fffffffffffffff;

	if (idx < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA)
	{
		linBlockDeltaV = blockDeltaV[idx];
		tBlockRigidId = blockRigidId[idx];

		sBlockLinDeltaVW[idx] = linBlockDeltaV.w;
		sBlockRigidId[idx] = tBlockRigidId;
	}

	__syncthreads(); //sBlockRigidId is written above and read below

	PxReal tLinDeltaVW = linBlockDeltaV.w;

	//add on block deltaV if the block rigid ids match
	for (PxU32 reductionRadius = 1; reductionRadius < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA; reductionRadius <<= 1)
	{
		const PxU32 lane = threadIndexInWarp - reductionRadius;

		PxReal w = __shfl_sync(FULL_MASK, tLinDeltaVW, lane);

		if (threadIndexInWarp >= reductionRadius && tBlockRigidId == sBlockRigidId[lane])
		{
			tLinDeltaVW += w;
		}
	}

	__syncthreads(); //sBlockRigidId is read above and written below

	if (idx < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA)
	{
		sBlockLinDeltaVW[idx] = tLinDeltaVW;
		sBlockRigidId[idx] = blockRigidId[idx];
	}

	const PxU32 numIterationPerBlock = (totalBlockRequired + (PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA - 1)) / PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA;

	__syncthreads();

	PxgArticulationBlockLinkData* artiLinkData = artiCoreDesc->mArticulationLinkBlocks;
	const PxU32 maxLinks = artiCoreDesc->mMaxLinksPerArticulation;

	for (PxU32 i = 0; i < numIterationPerBlock; ++i)
	{
		const PxU32 workIndex = i * PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + idx + numIterationPerBlock * blockIdx.x * blockDim.x;

		PxU64 rigidId = 0x8fffffffffffffff;

		if (workIndex < tNumContacts)
		{
			rigidId = sortedRigidIds[workIndex];

			if (idx > 0)
				sRigidId[idx - 1] = rigidId;

			if (workIndex == tNumContacts - 1)
			{
				sRigidId[idx] = 0x8fffffffffffffff;
			}
			else if (threadIdx.x == PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA - 1)
			{
				//the last thread in the block must load the first rigid id of the next range
				sRigidId[idx] = sortedRigidIds[workIndex + 1];
			}
		}

		__syncthreads();

		if (workIndex < tNumContacts)
		{
			if (rigidId != sRigidId[idx])
			{
				PxReal linVelW = deltaV[workIndex].w;

				PxU64 preBlockRigidId = blockIdx.x > 0 ? sBlockRigidId[blockIdx.x - 1] : 0x8fffffffffffffff;
				if (rigidId == preBlockRigidId)
				{
					linVelW += sBlockLinDeltaVW[blockIdx.x - 1];
				}

				//nodeIndex
				const PxNodeIndex nodeId = reinterpret_cast<const PxNodeIndex&>(rigidId);

				PxU32 solverBodyIndex = 0;
				if (!nodeId.isStaticBody())
				{
					PxU32 nodeIndex = nodeId.index();
					solverBodyIndex = prePrepDesc->solverBodyIndices[nodeIndex];

					PxReal denom = globalRelaxationCoefficient;
					if (useLocalRelax)
						denom = PxMax(denom, linVelW);

					if (nodeId.isArticulation())
					{
						//solverBodyIndex is the globalThreadIndex for the active articulation in the block format
						const PxU32 blockIndex = solverBodyIndex / WARP_SIZE;
						PxgArticulationBlockLinkData* artiLinks = &artiLinkData[blockIndex * maxLinks];
						const PxU32 artiIndexInBlock = solverBodyIndex % WARP_SIZE;
						const PxU32 linkID = nodeId.articulationLinkId();

						atomicAdd(&artiLinks[linkID].mDeltaScale[artiIndexInBlock], denom);
					}
					else
					{
						atomicAdd(&tempDenom[solverBodyIndex], denom);
					}
				}
			}
		}
	}
}

//32 blocks. Stage 2: completes the cross-block segmented sum of the deltas and
//applies them, scaled by the denominators accumulated in stage 1.
extern "C" __global__ void accumulateDeltaVRigidSecondLaunchMultiStage2(
	PxU64* sortedRigidIds,	//input
	PxU32* numContacts,		//input
	float4* deltaV,			//input
	float4* blockDeltaV,	//input
	PxU64* blockRigidId,	//input
	PxgPrePrepDesc* prePrepDesc,
	PxgSolverCoreDesc* solverCoreDesc,
	PxgArticulationCoreDesc* artiCoreDesc,
	PxgSolverSharedDesc<IterativeSolveData>* sharedDesc,
	PxReal* tempDenom,
	const bool useLocalRelax,
	const float globalRelaxationCoefficient,
	bool isTGS
)
{
	__shared__ float4 sBlockLinDeltaV[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
	__shared__ float4 sBlockAngDeltaV[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
	__shared__ PxU64 sBlockRigidId[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];

	__shared__ PxU64 sRigidId[PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + 1];

	const PxU32 tNumContacts = *numContacts;
	const PxU32 idx = threadIdx.x;
	const PxU32 threadIndexInWarp = idx & (WARP_SIZE - 1);

	const PxU32 totalBlockRequired = (tNumContacts + (PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA - 1)) / PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA;

	float4 linBlockDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
	float4 angBlockDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
	PxU64 tBlockRigidId = 0x8fffffffffffffff;

	if (idx < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA)
	{
		linBlockDeltaV = blockDeltaV[idx];
		angBlockDeltaV = blockDeltaV[idx + PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
		tBlockRigidId = blockRigidId[idx];

		sBlockLinDeltaV[idx] = linBlockDeltaV;
		sBlockAngDeltaV[idx] = angBlockDeltaV;
		sBlockRigidId[idx] = tBlockRigidId;
	}

	__syncthreads(); //sBlockRigidId is written above and read below

	float4 tLinDeltaV = linBlockDeltaV;
	float4 tAngDeltaV = angBlockDeltaV;

	//add on block deltaV if the block rigid ids match
	for (PxU32 reductionRadius = 1; reductionRadius < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA; reductionRadius <<= 1)
	{
		const PxU32 lane = threadIndexInWarp - reductionRadius;

		float4 linVal = shuffle(FULL_MASK, tLinDeltaV, lane);
		float4 angVal = shuffle(FULL_MASK, tAngDeltaV, lane);

		if (threadIndexInWarp >= reductionRadius && tBlockRigidId == sBlockRigidId[lane])
		{
			tLinDeltaV += linVal;
			tAngDeltaV += angVal;
		}
	}

	__syncthreads(); //sBlockRigidId is read above and written below

	if (idx < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA)
	{
		sBlockLinDeltaV[idx] = tLinDeltaV;
		sBlockAngDeltaV[idx] = tAngDeltaV;
		sBlockRigidId[idx] = blockRigidId[idx];
	}

	const PxU32 numIterationPerBlock = (totalBlockRequired + (PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA - 1)) / PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA;
	__syncthreads();

	float4* solverBodyDeltaVel = sharedDesc->iterativeData.solverBodyVelPool + solverCoreDesc->accumulatedBodyDeltaVOffset;
	//float4* initialVel = solverCoreDesc->outSolverVelocity;
	const PxU32 numSolverBodies = solverCoreDesc->numSolverBodies;

	PxgArticulationBlockData* artiData = artiCoreDesc->mArticulationBlocks;
	PxgArticulationBlockLinkData* artiLinkData = artiCoreDesc->mArticulationLinkBlocks;
	const PxU32 maxLinks = artiCoreDesc->mMaxLinksPerArticulation;

	for (PxU32 i = 0; i < numIterationPerBlock; ++i)
	{
		const PxU32 workIndex = i * PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + idx + numIterationPerBlock * blockIdx.x * blockDim.x;

		PxU64 rigidId = 0x8fffffffffffffff;

		if (workIndex < tNumContacts)
		{
			rigidId = sortedRigidIds[workIndex];

			if (idx > 0)
				sRigidId[idx - 1] = rigidId;

			if (workIndex == tNumContacts - 1)
			{
				sRigidId[idx] = 0x8fffffffffffffff;
			}
			else if (threadIdx.x == PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA - 1)
			{
				//the last thread in the block must load the first rigid id of the next range
				sRigidId[idx] = sortedRigidIds[workIndex + 1];
			}
		}

		__syncthreads();

		if (workIndex < tNumContacts)
		{
			float4 accumLin = make_float4(0.f, 0.f, 0.f, 0.f);
			float4 accumAng = make_float4(0.f, 0.f, 0.f, 0.f);

			//only the last entry of each rigid body's segment carries the full sum
			if (rigidId != sRigidId[idx])
			{
				float4 linVel = deltaV[workIndex];
				float4 angVel = deltaV[workIndex + tNumContacts];

				PxU64 preBlockRigidId = blockIdx.x > 0 ? sBlockRigidId[blockIdx.x - 1] : 0x8fffffffffffffff;
				if (rigidId == preBlockRigidId)
				{
					linVel += sBlockLinDeltaV[blockIdx.x - 1];
					angVel += sBlockAngDeltaV[blockIdx.x - 1];
				}

				//nodeIndex
				const PxNodeIndex nodeId = reinterpret_cast<const PxNodeIndex&>(rigidId);

				PxU32 solverBodyIndex = 0;
				if (!nodeId.isStaticBody())
				{
					PxU32 nodeIndex = nodeId.index();
					solverBodyIndex = prePrepDesc->solverBodyIndices[nodeIndex];

					if (nodeId.isArticulation())
					{
						//solverBodyIndex is the globalThreadIndex for the active articulation in the block format
						const PxU32 blockIndex = solverBodyIndex / WARP_SIZE;

						PxgArticulationBlockData& articulation = artiData[blockIndex];
						PxgArticulationBlockLinkData* artiLinks = &artiLinkData[blockIndex * maxLinks];

						const PxU32 artiIndexInBlock = solverBodyIndex % WARP_SIZE;

						articulation.mStateDirty[artiIndexInBlock] = PxgArtiStateDirtyFlag::eHAS_IMPULSES;

						const PxU32 linkID = nodeId.articulationLinkId();

						//denominator accumulated by stage 1
						PxReal denom = artiLinks[linkID].mDeltaScale[artiIndexInBlock];
						PxReal ratio = 1.f / denom;

						//for articulations, linVel and angVel accumulate an impulse
						Cm::UnAlignedSpatialVector impulse;
						impulse.top = PxVec3(linVel.x, linVel.y, linVel.z);
						impulse.bottom = PxVec3(angVel.x, angVel.y, angVel.z);

						impulse.top *= ratio;
						impulse.bottom *= ratio;

						/*printf("blockIndex %i artiIndexInBlock %i linkID %i ratio %f impulse linear(%f, %f, %f) angular(%f, %f, %f)\n",
							blockIndex, artiIndexInBlock, linkID, ratio, impulse.top.x, impulse.top.y, impulse.top.z,
							impulse.bottom.x, impulse.bottom.y, impulse.bottom.z);*/

						atomicAddSpatialVector(artiLinks[linkID].mScratchImpulse, -impulse, artiIndexInBlock);
					}
					else
					{
						//denominator accumulated by stage 1
						PxReal denom = tempDenom[solverBodyIndex]; //globalRelaxationCoefficient;
						PxReal ratio = 1.f / denom;

						if (isTGS)
						{
							atomicAdd(&solverBodyDeltaVel[solverBodyIndex].x, linVel.x * ratio);
							atomicAdd(&solverBodyDeltaVel[solverBodyIndex].y, linVel.y * ratio);
							atomicAdd(&solverBodyDeltaVel[solverBodyIndex].z, linVel.z * ratio);
							atomicAdd(&solverBodyDeltaVel[solverBodyIndex].w, angVel.x * ratio);
							atomicAdd(&solverBodyDeltaVel[solverBodyIndex + numSolverBodies].x, angVel.y * ratio);
							atomicAdd(&solverBodyDeltaVel[solverBodyIndex + numSolverBodies].y, angVel.z * ratio);
							//The rest is the delta position buffer
						}
						else
						{
							atomicAdd(&solverBodyDeltaVel[solverBodyIndex].x, linVel.x * ratio);
							atomicAdd(&solverBodyDeltaVel[solverBodyIndex].y, linVel.y * ratio);
							atomicAdd(&solverBodyDeltaVel[solverBodyIndex].z, linVel.z * ratio);
							atomicAdd(&solverBodyDeltaVel[solverBodyIndex + numSolverBodies].x, angVel.x * ratio);
							atomicAdd(&solverBodyDeltaVel[solverBodyIndex + numSolverBodies].y, angVel.y * ratio);
							atomicAdd(&solverBodyDeltaVel[solverBodyIndex + numSolverBodies].z, angVel.z * ratio);
						}
					}

					//printf("solverBodyIndex %i\n", solverBodyIndex);
					//printf("linearVelocity(%f, %f, %f, %f)\n", linearVelocity.x, linearVelocity.y, linearVelocity.z, linearVelocity.w);
					//printf("angularVelocity(%f, %f, %f, %f)\n", angularVelocity.x, angularVelocity.y, angularVelocity.z, angularVelocity.w);
				}
			}
		}
	}
}