// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  

#include "common/PxPhysXCommonConfig.h"
#include <cuda.h>
#include <sm_35_intrinsics.h>
#include "PxgSolverBody.h"
#include "PxgSolverConstraintBlock1D.h"
#include "PxgSolverConstraintDesc.h"
#include "PxgConstraint.h"
#include "PxgConstraintBlock.h"
#include "PxgIslandContext.h"
#include "PxgSolverContext.h"
#include "cutil_math.h"
#include "PxgSolverCoreDesc.h"
#include "solverBlock.cuh"
#include "PxgSolverKernelIndices.h"
#include "PxgDynamicsConfiguration.h"
#include "PxgIntrinsics.h"
#include "stdio.h"
#include "assert.h"
#include "reduction.cuh"
#include "solver.cuh"
#include "PxgArticulationCoreDesc.h"

using namespace physx;

extern "C" __host__ void initSolverKernels6() {}

PX_CUDA_CALLABLE PX_FORCE_INLINE PxU32 ComputeBodyBatchStartIndex(const PxU32 index)
{
	//return 2*(index & (~31)) + (index&31);
	return index;
}

extern "C" __global__ void ZeroBodies(const PxgSolverCoreDesc* constraintPrepDesc, const PxgSolverSharedDesc<IterativeSolveData>* sharedDesc)
{
	__shared__ float4* bodyVelocities;
	__shared__ float4* motionVelocities;
	__shared__ uint totalNumBodies;
	__shared__ uint totalNumBodiesConstraints;
	__shared__ uint offset;

	if(threadIdx.x == 0)
	{
		bodyVelocities = sharedDesc->iterativeData.solverBodyVelPool;
		motionVelocities = constraintPrepDesc->motionVelocityArray;
		totalNumBodies = constraintPrepDesc->numSolverBodies*2;
		totalNumBodiesConstraints = (constraintPrepDesc->numBatches + constraintPrepDesc->numArticBatches) * 32 * 2 *2;
		offset = constraintPrepDesc->accumulatedBodyDeltaVOffset;

	}

	const uint blockStride = (blockDim.x * gridDim.x);

	//This identifies which warp a specific thread is in
	const uint threadIndex = (threadIdx.x + blockIdx.x * blockDim.x);

	__syncthreads();

	const float4 zero = make_float4(0.f);
	//(1) Set all motion velocities to zero.
	//(2) Set the delta velocities in the accumulation offset to zero
	for(uint a = threadIndex; a < totalNumBodies; a+=blockStride)
	{
		motionVelocities[a]					= zero;
		bodyVelocities[offset + a]			= zero;
	}
	
	//(2) Set all velocities to zero. Strictly, we only need to set the first instance of each body
	// in the solver to 0 but, for now, we'll just initialize them all to zero...
	for(uint a = threadIndex; a < totalNumBodiesConstraints; a+= blockStride)
	{
		bodyVelocities[a]					= zero;
	}
}

__device__ __inline__ float4 loadFloat4(const float4* PX_RESTRICT address)
{
	/*float4 ret;

	asm("ld.global.cg.v4.f32 {%0, %1, %2, %3}, [%4];" : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : "r"(address));
	return ret;*/
	return *address;
}

PX_FORCE_INLINE __device__ float4 cudeShuffle3(const PxU32 syncMask, bool condition, float4 reg0, float4 reg1, PxU32 shuffleMask)
{
	float4 ret0, ret1;

	ret0.x = __shfl_sync(syncMask, reg0.x, shuffleMask);
	ret0.y = __shfl_sync(syncMask, reg0.y, shuffleMask);
	ret0.z = __shfl_sync(syncMask, reg0.z, shuffleMask);

	ret1.x = __shfl_sync(syncMask, reg1.x, shuffleMask);
	ret1.y = __shfl_sync(syncMask, reg1.y, shuffleMask);
	ret1.z = __shfl_sync(syncMask, reg1.z, shuffleMask);

	return condition ? ret0 : ret1;
}

// Marking active slabs loosely following "solveBlockPartition"
static __device__ void markActiveSlab_rigidBodyPGS(
	const PxgSolverCoreDesc* PX_RESTRICT solverDesc, 
	const PxgSolverSharedDesc<IterativeSolveData>* PX_RESTRICT sharedDesc,
	const PxU32 islandIndex, const PxU32 lastPartition)
{
	const PxgIslandContext& island = solverDesc->islandContextPool[islandIndex];
	const PxU32 startPartitionIndex = island.mStartPartitionIndex;

	const PxU32 startIndex = island.mBatchStartIndex;
	const PxU32 endIndex = solverDesc->constraintsPerPartition[lastPartition + startPartitionIndex];

	const IterativeSolveData& iterativeData = sharedDesc->iterativeData;

	const PxU32 globalWarpIndex = blockIdx.x * blockDim.y + threadIdx.y;
	const PxU32 threadIndexInWarp = threadIdx.x;
	const PxU32 batchIndex = startIndex + globalWarpIndex;

	if (batchIndex < endIndex)
	{
		const PxgBlockConstraintBatch& batch = iterativeData.blockConstraintBatch[batchIndex];

		if (threadIndexInWarp < batch.mDescStride)
		{
			const PxU32 bodyOffset = island.mBodyStartIndex;
			const PxU32 slabId = batch.slabId[threadIndexInWarp];

			const PxU32 outputOffset = solverDesc->accumulatedBodyDeltaVOffset; //deltaVOffset
			const PxU32 numDynamicBodies = solverDesc->islandContextPool->mBodyCount; //nbBodies minus offset!
			const PxU32 totalBodiesIncKinematics = numDynamicBodies + bodyOffset;
			
			const PxU32 bodyIdA = batch.bodyAIndex[threadIndexInWarp];
			const PxU32 bodyIdB = batch.bodyBIndex[threadIndexInWarp];

			const PxU32 numArticulations = solverDesc->islandContextPool->mArticulationCount;
			const PxU32 numTotalBodies = bodyOffset + numDynamicBodies + numArticulations;

			const PxU32 slabIndexOffset = (slabId / 32) * numTotalBodies;
			const PxU32 encodedSlabIndex = (1u << (slabId % 32));

			bool isActiveSlab = false;

			if (batch.constraintType == PxgSolverConstraintDesc::eCONSTRAINT_1D)
			{
				// For joint constraints, simply mark slabs as active for efficiency.
				// This does not significantly affect or increase the reference count in practice.
				isActiveSlab = true;
			}
			else
			{
				const PxU32 finalIdA = outputOffset + bodyIdA;
				const PxU32 finalIdB = outputOffset + bodyIdB;

				const float4 linVel0 = Pxldcg(iterativeData.solverBodyVelPool[finalIdA]);
				const float4 angVel0 = Pxldcg(iterativeData.solverBodyVelPool[finalIdA + totalBodiesIncKinematics]);
				const float4 linVel1 = Pxldcg(iterativeData.solverBodyVelPool[finalIdB]);
				const float4 angVel1 = Pxldcg(iterativeData.solverBodyVelPool[finalIdB + totalBodiesIncKinematics]);

				PxVec3 lv0(linVel0.x, linVel0.y, linVel0.z);
				PxVec3 lv1(linVel1.x, linVel1.y, linVel1.z);
				PxVec3 av0(angVel0.x, angVel0.y, angVel0.z);
				PxVec3 av1(angVel1.x, angVel1.y, angVel1.z);

				// Check if the contact/normal constraint is active.
				isActiveSlab = checkActiveContactBlock(batch, lv0, av0, lv1, av1, threadIndexInWarp, iterativeData.blockContactHeaders,
													   iterativeData.blockContactPoints);
			}

			// Encode which slab is active in a 32-bit index. When querying reference counts, count the number of active
			// slabs encoded in solverEncodedReferenceCount. solverEncodedReferenceCount contains bitwise/slab-wise
			// activation information.
			if (isActiveSlab & (bodyIdA >= bodyOffset))
			{
				atomicOr(&iterativeData.solverEncodedReferenceCount[slabIndexOffset + bodyIdA], encodedSlabIndex);
			}
			if (isActiveSlab & (bodyIdB >= bodyOffset))
			{
				atomicOr(&iterativeData.solverEncodedReferenceCount[slabIndexOffset + bodyIdB], encodedSlabIndex);
			}
		}
	}
}

// Marking active slabs loosely following "artiSolveBlockPartition"
static __device__ void markActiveSlab_articulationPGS(
	const PxgSolverCoreDesc* PX_RESTRICT solverDesc, const PxgSolverSharedDesc<IterativeSolveData>* PX_RESTRICT sharedDesc,
	const PxU32 islandIndex, const PxU32 lastPartition, const PxgArticulationCoreDesc* const PX_RESTRICT artiDesc)
{
	const PxgIslandContext& island = solverDesc->islandContextPool[islandIndex];
	const PxU32 startPartitionIndex = island.mStartPartitionIndex;

	const PxU32 startIndex = island.mArtiBatchStartIndex;
	const PxU32 articulationBatchOffset = solverDesc->islandContextPool->mBatchCount;
	const PxU32 endIndex = solverDesc->artiConstraintsPerPartition[lastPartition + startPartitionIndex] + articulationBatchOffset;

	const IterativeSolveData& iterativeData = sharedDesc->iterativeData;

	const PxU32 globalWarpIndex = blockIdx.x * blockDim.y + threadIdx.y;
	const PxU32 threadIndexInWarp = threadIdx.x;
	const PxU32 batchIndex = startIndex + globalWarpIndex + articulationBatchOffset;

	const PxU32 bodyOffset = island.mBodyStartIndex;
	const PxU32 numDynamicBodies = solverDesc->islandContextPool->mBodyCount; //nbBodies minus offset!

	if (batchIndex < endIndex)
	{
		const PxgBlockConstraintBatch& batch = iterativeData.blockConstraintBatch[batchIndex];

		if (threadIndexInWarp < batch.mDescStride)
		{
			const PxNodeIndex igNodeIndexA = batch.bodyANodeIndex[threadIndexInWarp];
			const PxNodeIndex igNodeIndexB = batch.bodyBNodeIndex[threadIndexInWarp];

			const PxU32 slabId = batch.slabId[threadIndexInWarp];

			const PxU32 nodeIndexA = igNodeIndexA.index();
			const PxU32 nodeIndexB = igNodeIndexB.index();

			const PxU32 bodyIdA = batch.bodyAIndex[threadIndexInWarp];
			const PxU32 bodyIdB = batch.bodyBIndex[threadIndexInWarp];

			PxU32 linkIndexA = igNodeIndexA.articulationLinkId();
			PxU32 linkIndexB = igNodeIndexB.articulationLinkId();

			Cm::UnAlignedSpatialVector vel0, vel1;

			const PxU32 numArticulations = solverDesc->islandContextPool->mArticulationCount;
			const PxU32 numTotalBodies = bodyOffset + numDynamicBodies + numArticulations;

			const PxU32 slabIndexOffset = (slabId / 32) * numTotalBodies;
			const PxU32 encodedSlabIndex = (1u << (slabId % 32));


			bool isActiveSlab = false;

			const PxU32 constraintType = batch.constraintType;
	
			if (constraintType == PxgSolverConstraintDesc::eARTICULATION_CONSTRAINT_1D) // joint
			{
				// For joint constraints, simply mark slabs as active for efficiency.
				// This does not significantly affect or increase the reference count in practice.
				isActiveSlab = true;
			}
			else // contact
			{
				const PxU32 readIndex = batchIndex * 128 + threadIndexInWarp;

				const PxU32 outputOffset = solverDesc->accumulatedBodyDeltaVOffset;
				const PxU32 totalBodiesIncKinematics = numDynamicBodies + bodyOffset;

				Cm::UnAlignedSpatialVector vel0, vel1;
				if (igNodeIndexA.isArticulation())
				{
					// For articulations, read velocities using readIndex as done in artiSolveBlockPartition.
					const float4 lin = Pxldcg(iterativeData.solverBodyVelPool[readIndex]);
					const float4 ang = Pxldcg(iterativeData.solverBodyVelPool[readIndex + 32]);
					vel0 = Cm::UnAlignedSpatialVector(PxVec3(ang.x, ang.y, ang.z), PxVec3(lin.x, lin.y, lin.z));
				}
				else
				{
					// For rigid bodies, use the original rigid body velocity, not a slab velocity, in case velocities
					// at readIndex are not set.
					const PxU32 finalIdA = outputOffset + bodyIdA;
					const float4 lin = Pxldcg(iterativeData.solverBodyVelPool[finalIdA]);
					const float4 ang = Pxldcg(iterativeData.solverBodyVelPool[finalIdA + totalBodiesIncKinematics]);
					vel0 = Cm::UnAlignedSpatialVector(PxVec3(ang.x, ang.y, ang.z), PxVec3(lin.x, lin.y, lin.z));
				}

				if (igNodeIndexB.isArticulation())
				{
					// For articulations, read velocities using readIndex as done in artiSolveBlockPartition.
					const float4 lin = Pxldcg(iterativeData.solverBodyVelPool[readIndex + 64]);
					const float4 ang = Pxldcg(iterativeData.solverBodyVelPool[readIndex + 96]);
					vel1 = Cm::UnAlignedSpatialVector(PxVec3(ang.x, ang.y, ang.z), PxVec3(lin.x, lin.y, lin.z));
				}
				else
				{
					// For rigid bodies, use the original rigid body velocity, not a slab velocity, in case velocities
					// at readIndex are not set.
					const PxU32 finalIdB = outputOffset + bodyIdB;
					const float4 lin = Pxldcg(iterativeData.solverBodyVelPool[finalIdB]);
					const float4 ang = Pxldcg(iterativeData.solverBodyVelPool[finalIdB + totalBodiesIncKinematics]);
					vel1 = Cm::UnAlignedSpatialVector(PxVec3(ang.x, ang.y, ang.z), PxVec3(lin.x, lin.y, lin.z));
				}

				// Check if the contact/normal constraint is active.
				isActiveSlab = checkExtActiveContactBlock(batch, vel0, vel1, iterativeData.blockContactHeaders,
					iterativeData.blockContactPoints, iterativeData.artiResponse, threadIndexInWarp);
			}

			if (isActiveSlab)
			{
				// Encode which slab is active in a 32-bit index. When querying reference counts, count the number of
				// active slabs encoded in solverEncodedReferenceCount. solverEncodedReferenceCount contains
				// bitwise/slab-wise activation information.
				if (igNodeIndexA.isArticulation()) // articulation
				{
					const PxU32 articulationBodyIdA = batch.remappedBodyAIndex[threadIndexInWarp];

					// Articulation IDs are at the back of rigid body IDs.
					const PxU32 globalBodyIdA = articulationBodyIdA + numDynamicBodies + bodyOffset;

					atomicOr(&iterativeData.solverEncodedReferenceCount[slabIndexOffset + globalBodyIdA], encodedSlabIndex);
				}
				else if (bodyIdA >= bodyOffset) // rigid
				{
					atomicOr(&iterativeData.solverEncodedReferenceCount[slabIndexOffset + bodyIdA], encodedSlabIndex);
				}

				if (igNodeIndexB.isArticulation()) // articulation
				{
					const PxU32 articulationBodyIdB = batch.remappedBodyBIndex[threadIndexInWarp];

					// Articulation IDs are at the back of rigid body IDs.
					const PxU32 globalBodyIdB = articulationBodyIdB + numDynamicBodies + bodyOffset;

					atomicOr(&iterativeData.solverEncodedReferenceCount[slabIndexOffset + globalBodyIdB], encodedSlabIndex);
				}
				else if (bodyIdB >= bodyOffset) // rigid
				{
					atomicOr(&iterativeData.solverEncodedReferenceCount[slabIndexOffset + bodyIdB], encodedSlabIndex);
				}
			}
		}
	}
}

// Marking active rigid body slabs, loosely following "solveBlockPartition" and "artiSolveBlockPartition"
extern "C" __global__
__launch_bounds__(PxgKernelBlockDim::SOLVE_BLOCK_PARTITION, 8)
void markActiveSlabPGS(const PxgSolverCoreDesc * PX_RESTRICT solverDesc,
	const PxgSolverSharedDesc<IterativeSolveData>* PX_RESTRICT sharedDesc,
	const PxU32 islandIndex, const PxU32 lastPartition, const PxgArticulationCoreDesc* const PX_RESTRICT artiDesc)
{
	if (blockIdx.y == 0)
	{
		markActiveSlab_rigidBodyPGS(solverDesc, sharedDesc, islandIndex, lastPartition);
	}
	else
	{
		markActiveSlab_articulationPGS(solverDesc, sharedDesc, islandIndex, lastPartition, artiDesc);
	}
}

extern "C" __global__
//__launch_bounds__(PxgKernelBlockDim::SOLVE_BLOCK_PARTITION, 16)
void solveBlockPartition(
	PxgSolverCoreDesc* PX_RESTRICT solverDesc, const PxgSolverSharedDesc<IterativeSolveData>* PX_RESTRICT sharedDesc,
	const PxU32 islandIndex, const PxU32 partitionIndex, bool doFriction)
{	
	const PxgIslandContext& island = solverDesc->islandContextPool[islandIndex];

	const PxU32 startPartitionIndex = island.mStartPartitionIndex;

	PxU32 startIndex = partitionIndex == 0 ? island.mBatchStartIndex : solverDesc->constraintsPerPartition[partitionIndex + startPartitionIndex - 1];

	PxU32 endIndex = solverDesc->constraintsPerPartition[partitionIndex + startPartitionIndex];

	const uint warpSize = 32;

	//This identifies which warp a specific thread is in
	const uint warpIndex = (threadIdx.x + blockIdx.x * blockDim.x)/warpSize;

	//This identifies which thread within a warp a specific thread is
	const uint threadIndexInWarp = threadIdx.x&(warpSize-1);

	__shared__ IterativeSolveData iterativeData;

	PxU32 idx = threadIdx.x;

	//if(threadIdx.x < sizeof(iterativeData)/sizeof(float))
	while(idx < sizeof(iterativeData) / sizeof(float))
	{
		float* iterData = reinterpret_cast<float*>(&iterativeData);

		iterData[idx] = reinterpret_cast<const float*>(&sharedDesc->iterativeData)[idx];
		idx += warpSize;
	}

	__syncthreads();

	bool residualAccumulationEnabled = solverDesc->contactErrorAccumulator.mCounter >= 0;
	PxgErrorAccumulator error;

	//for(uint k = startIndex + warpIndex; k < endIndex; k+=blockStride)
	uint k = startIndex + warpIndex;
	if(k < endIndex)
	{
		assert(k < solverDesc->numBatches);

		const PxgBlockConstraintBatch& batch = iterativeData.blockConstraintBatch[k];

		const PxU32 readIndex = k*128 + threadIndexInWarp;

		//Pull out shared memory into float4 format in registers to solve constraints
		if(threadIndexInWarp < batch.mDescStride)
		{
			//The linear/angular velocity pair for body 0 for threads 0 - 15, loaded by threads 0 - 31 in the format (b0.linVel, b0.angVel, b0.linVel, b0.angVel...)
			float4 linVel0 = Pxldcg(iterativeData.solverBodyVelPool[readIndex]);
			//The linear/angular velocity pair for body 0 for threads 16-31, loaded by threads 0 - 31  in the format (b0.linVel, b0.angVel, b0.linVel, b0.angVel...)
			float4 angVel0 = Pxldcg(iterativeData.solverBodyVelPool[readIndex + 32]);
			//The linear/angular velocity pair for body 1 for threads 0 - 15, loaded by threads 0 - 31 in the format (b1.linVel, b1.angVel, b1.linVel, b1.angVel...)
			float4 linVel1 = Pxldcg(iterativeData.solverBodyVelPool[readIndex + 64]);
			//The linear/angular velocity pair for body 1 for threads 16 - 31, loaded by threads 0 - 31 in the format (b1.linVel, b1.angVel, b1.linVel, b1.angVel...)
			float4 angVel1 = Pxldcg(iterativeData.solverBodyVelPool[readIndex + 96]);

			//printf("Rigid ReadIndex = %i, linVel0=(%f, %f, %f), angVel0=(%f, %f, %f)\n", readIndex, linVel0.x, linVel0.y, linVel0.z, angVel0.x, angVel0.y, angVel0.z);

			//printf("Rigid ReadIndex = %i, linVel0=(%f, %f, %f), angVel0=(%f, %f, %f)\n", readIndex+64, linVel1.x, linVel1.y, linVel1.z, angVel1.x, angVel1.y, angVel1.z);

			PxVec3 lv0(linVel0.x, linVel0.y, linVel0.z);
			PxVec3 lv1(linVel1.x, linVel1.y, linVel1.z);
			PxVec3 av0(angVel0.x, angVel0.y, angVel0.z);
			PxVec3 av1(angVel1.x, angVel1.y, angVel1.z);

			// Reference counts to be used for the current sub-timestep or iteration.
			PxReal curRef0 = 1.0f;
			PxReal curRef1 = 1.0f;

			const PxU32 bodyOffset = solverDesc->islandContextPool->mBodyStartIndex;

			const PxU32 numDynamicBodies = solverDesc->islandContextPool->mBodyCount; //nbBodies minus offset!
			const PxU32 numArticulations = solverDesc->islandContextPool->mArticulationCount;
			const PxU32 numTotalBodies = bodyOffset + numDynamicBodies + numArticulations;

			const PxU32* const PX_RESTRICT encodedReferenceCount = sharedDesc->iterativeData.solverEncodedReferenceCount;

			if(batch.bodyAIndex[threadIndexInWarp] >= bodyOffset)
			{
				// Counting the number of active slabs
				curRef0 = static_cast<PxReal>(
					countActiveSlabs(batch.bodyAIndex[threadIndexInWarp], solverDesc->numSlabs, numTotalBodies, encodedReferenceCount));
			}

			if(batch.bodyBIndex[threadIndexInWarp] >= bodyOffset)
			{
				// Counting the number of active slabs
				curRef1 = static_cast<PxReal>(
					countActiveSlabs(batch.bodyBIndex[threadIndexInWarp], solverDesc->numSlabs, numTotalBodies, encodedReferenceCount));
			}

			if (batch.constraintType == PxgSolverConstraintDesc::eCONTACT)
				solveContactBlock(batch, lv0, av0, lv1, av1, doFriction, threadIndexInWarp, iterativeData.blockContactHeaders, iterativeData.blockFrictionHeaders,
					iterativeData.blockContactPoints, iterativeData.blockFrictions, residualAccumulationEnabled ? &error : NULL, 
					curRef0, curRef1);
			else
				solve1DBlock(batch, lv0, av0, lv1, av1, threadIndexInWarp, iterativeData.blockJointConstraintHeaders, iterativeData.blockJointConstraintRowsCon,
					iterativeData.blockJointConstraintRowsMod, solverDesc->contactErrorAccumulator.mCounter >= 0, curRef0, curRef1);


			const PxU32 remapA = batch.remappedBodyAIndex[threadIndexInWarp];
			const PxU32 remapB = batch.remappedBodyBIndex[threadIndexInWarp];
			
			const PxU32 indexA = ComputeBodyBatchStartIndex(remapA);
			const PxU32 indexB = ComputeBodyBatchStartIndex(remapB);

			Pxstcg(&iterativeData.solverBodyVelPool[indexA], make_float4(lv0.x, lv0.y, lv0.z, 0.f));
			Pxstcg(&iterativeData.solverBodyVelPool[indexA + 32], make_float4(av0.x, av0.y, av0.z, 0.f));
			Pxstcg(&iterativeData.solverBodyVelPool[indexB], make_float4(lv1.x, lv1.y, lv1.z, 0.f));
			Pxstcg(&iterativeData.solverBodyVelPool[indexB + 32], make_float4(av1.x, av1.y, av1.z, 0.f));
		}

		//KS - Even threads in a warp output the linear velocities, odd threads output the angular velocities.
		//We output for threadIndexInWarp/2 and 16 + threadIndexInWarp/2
		
#if 0
		const PxU32 firstWarpOutput = threadIndexInWarp/2;
		const PxU32 secondWarpOutput = threadIndexInWarp/2 + 16;

		float4 indexAOutput0 = cudeShuffle3(threadIndexInWarp & 1, angVel0, linVel0, firstWarpOutput);
		float4 indexBOutput0 = cudeShuffle3(threadIndexInWarp & 1, angVel1, linVel1, firstWarpOutput);

		float4 indexAOutput1 = cudeShuffle3(threadIndexInWarp & 1, angVel0, linVel0, secondWarpOutput);
		float4 indexBOutput1 = cudeShuffle3(threadIndexInWarp & 1, angVel1, linVel1, secondWarpOutput);

		if(firstWarpOutput < batch.mDescStride)
		{
			const PxU32 indexA = (2*batch.remappedBodyAIndex[firstWarpOutput] + (threadIdx.x&1));
			const PxU32 indexB = (2*batch.remappedBodyBIndex[firstWarpOutput] + (threadIdx.x&1));

			iterativeData.solverBodyVelPool[indexA] = indexAOutput0;
			iterativeData.solverBodyVelPool[indexB] = indexBOutput0;
		}

		if(secondWarpOutput < batch.mDescStride)
		{
			const PxU32 indexA = (2*batch.remappedBodyAIndex[secondWarpOutput] + (threadIdx.x&1));
			const PxU32 indexB = (2*batch.remappedBodyBIndex[secondWarpOutput] + (threadIdx.x&1));

			iterativeData.solverBodyVelPool[indexA] = indexAOutput1;
			iterativeData.solverBodyVelPool[indexB] = indexBOutput1;
		}
#else

#endif
	}

	if (residualAccumulationEnabled)
	{
		error.accumulateErrorGlobalFullWarp(solverDesc->contactErrorAccumulator, threadIndexInWarp);
	}
}

extern "C" __global__ void writebackBlocks(
	PxgSolverCoreDesc* constraintPrepDesc, 
	PxgSolverSharedDesc<IterativeSolveData>* sharedDesc, const PxU32 islandIndex)
{	
	PxgIslandContext& island = constraintPrepDesc->islandContextPool[islandIndex];

	const PxU32 startIndex = island.mBatchStartIndex;
	const PxU32 endIndex = island.mBatchCount + island.mArtiBatchCount + island.mStaticArtiBatchCount + 
		island.mSelfArtiBatchCount + island.mStaticRigidBatchCount + startIndex;

	const uint warpSize = 32;

	//const uint blockStride = (blockDim.x * gridDim.x)/warpSize;

	//This identifies which warp a specific thread is in
	const uint warpIndex = (threadIdx.x + blockIdx.x * blockDim.x)/warpSize;

	const uint warpIndexInBlock = threadIdx.x/warpSize;

	//This identifies which thread within a warp a specific thread is
	const uint threadIndexInWarp = threadIdx.x&(warpSize-1);

	PxgBlockConstraintBatch* batchHeaders = sharedDesc->iterativeData.blockConstraintBatch;

	PxgBlockSolverConstraint1DHeader* jointHeaders = sharedDesc->iterativeData.blockJointConstraintHeaders;
	PxgBlockSolverConstraint1DCon* jointRowsCon = sharedDesc->iterativeData.blockJointConstraintRowsCon;
	PxgBlockSolverConstraint1DMod* jointRowsMod = sharedDesc->iterativeData.blockJointConstraintRowsMod;

	PxgConstraintWriteback* constraintWriteBack = constraintPrepDesc->constraintWriteBack;

	PxgSolverBodyData* solverBodyDatas = constraintPrepDesc->solverBodyDataPool;

	PxgBlockSolverContactHeader* contactHeaders = sharedDesc->iterativeData.blockContactHeaders;
	PxgBlockSolverFrictionHeader* frictionHeaders = sharedDesc->iterativeData.blockFrictionHeaders;
	PxgBlockSolverContactPoint* contactPoints = sharedDesc->iterativeData.blockContactPoints;
	PxgBlockSolverContactFriction* frictions = sharedDesc->iterativeData.blockFrictions;

	PxgBlockFrictionPatch* baseFrictionPatches = sharedDesc->blockCurrentFrictionPatches;
	PxF32* baseWritebackForceBuffer = constraintPrepDesc->forceBuffer;

	PxgFrictionPatchGPU* frictionPatches = reinterpret_cast<PxgFrictionPatchGPU*>(constraintPrepDesc->frictionPatches);

	//__shared__  Dy::ThresholdStreamElement elems[PxgKernelBlockDim::WRITEBACK_BLOCKS];
	__shared__  PxU8 elemsMem[sizeof(Dy::ThresholdStreamElement)*PxgKernelBlockDim::WRITEBACK_BLOCKS];
	Dy::ThresholdStreamElement* elems = reinterpret_cast<Dy::ThresholdStreamElement*>(elemsMem);
	__shared__  PxI32 index[PxgKernelBlockDim::WRITEBACK_BLOCKS/warpSize];

	Dy::ThresholdStreamElement* startAddress = &elems[32*warpIndexInBlock];

	//for(uint k = startIndex + warpIndex; k < endIndex; k+=blockStride)
	uint k = startIndex + warpIndex;
	if(k < endIndex)
	{
		if(threadIndexInWarp == 0)
			index[warpIndexInBlock] = 0;

		__syncwarp();

		//Get block header
		const PxgBlockConstraintBatch& batch = batchHeaders[k];

		if(threadIndexInWarp < batch.mDescStride)
		{
			if(batch.constraintType==PxgSolverConstraintDesc::eCONTACT || batch.constraintType == PxgSolverConstraintDesc::eARTICULATION_CONTACT)
			{
				writeBackContactBlock(batch, threadIndexInWarp, solverBodyDatas, startAddress, &index[warpIndexInBlock], contactHeaders, frictionHeaders, contactPoints, frictions,
					baseWritebackForceBuffer, baseFrictionPatches[batch.mConstraintBatchIndex], frictionPatches);
			}
			else
			{
				//Do nothing (for now)
				writeBack1DBlock(batch, threadIndexInWarp, jointHeaders, jointRowsCon, jointRowsMod, constraintWriteBack);
			}
		}

		//__syncthreads();

		__syncwarp();

		PxI32 ind = index[warpIndexInBlock];

		if(ind > 0)
		{
			__shared__ PxI32 startIndex[PxgKernelBlockDim::WRITEBACK_BLOCKS/warpSize];
			if(threadIndexInWarp == 0)
			{
				startIndex[warpIndexInBlock] = atomicAdd(&constraintPrepDesc->sharedThresholdStreamIndex, ind);
			}

			__syncwarp();

			if(threadIndexInWarp < ind)
			{
				//((float4*)constraintPrepDesc->thresholdStream)[startIndex[warpIndexInBlock]+threadIndexInWarp] = ((float4*)startAddress)[threadIndexInWarp];
				constraintPrepDesc->thresholdStream[startIndex[warpIndexInBlock]+threadIndexInWarp] = startAddress[threadIndexInWarp];
			}
		}
	}
}

extern "C" __global__ void concludeBlocks(
	PxgSolverCoreDesc* constraintPrepDesc,
	PxgSolverSharedDesc<IterativeSolveData>* sharedDesc, const PxU32 islandIndex)
{	
	PxgIslandContext& island = constraintPrepDesc->islandContextPool[islandIndex];

	const PxU32 startIndex = island.mBatchStartIndex;
	const PxU32 endIndex = island.mBatchCount + island.mArtiBatchCount + island.mStaticArtiBatchCount 
		+ island.mSelfArtiBatchCount + island.mStaticRigidBatchCount + startIndex;

	
	const uint warpSize = 32;

	//const uint blockStride = (blockDim.x * gridDim.x)/warpSize;

	//This identifies which warp a specific thread is in
	const uint warpIndex = (threadIdx.x + blockIdx.x * blockDim.x)/warpSize;

	//This identifies which thread within a warp a specific thread is
	const uint threadIndexInWarp = threadIdx.x&(warpSize-1);

	PxgBlockConstraintBatch* batchHeaders = sharedDesc->iterativeData.blockConstraintBatch;

	PxgBlockSolverConstraint1DHeader* jointHeaders = sharedDesc->iterativeData.blockJointConstraintHeaders;
	PxgBlockSolverConstraint1DMod* jointRowsMod = sharedDesc->iterativeData.blockJointConstraintRowsMod;

	PxgBlockSolverContactHeader* contactHeaders = sharedDesc->iterativeData.blockContactHeaders;
	PxgBlockSolverFrictionHeader* frictionHeaders = sharedDesc->iterativeData.blockFrictionHeaders;
	PxgBlockSolverContactPoint* contactPoints = sharedDesc->iterativeData.blockContactPoints;
	PxgBlockSolverContactFriction* frictions = sharedDesc->iterativeData.blockFrictions;

	//for(uint k = startIndex + warpIndex; k < endIndex; k+=blockStride)
	uint k = startIndex + warpIndex; 
	if(k < endIndex)
	{
		//Get block header
		const PxgBlockConstraintBatch& batch = batchHeaders[k];

		if(threadIndexInWarp < batch.mDescStride)
		{
			if(batch.constraintType==PxgSolverConstraintDesc::eCONTACT || batch.constraintType == PxgSolverConstraintDesc::eARTICULATION_CONTACT)
			{
				concludeContactBlock(batch, threadIndexInWarp, contactHeaders, frictionHeaders, contactPoints, frictions);
			}
			else
			{
				conclude1DBlock(batch, threadIndexInWarp, jointHeaders, jointRowsMod);
			}
		}
	}
}

extern "C" __global__ void writeBackBodies(
	const PxgSolverCoreDesc* constraintPrepDesc,
	const PxgSolverSharedDesc<IterativeSolveData>* sharedDesc, const PxU32 islandIndex)
{	
	__shared__ float4* bodyVelocities;
	__shared__ float4* motionVelocities;
	__shared__ PxU32 bodyStartIndex;
	__shared__ PxU32 bodyEndIndex;
	__shared__ PxU32 totalBodyCount;
	__shared__ PxU32 outputOffsetIndex;

	if(threadIdx.x == 0)
	{
		PxgIslandContext& island = constraintPrepDesc->islandContextPool[islandIndex];
		bodyVelocities = sharedDesc->iterativeData.solverBodyVelPool;
		motionVelocities = constraintPrepDesc->motionVelocityArray;
		bodyStartIndex = island.mBodyStartIndex;
		totalBodyCount = constraintPrepDesc->numSolverBodies;
		bodyEndIndex = totalBodyCount;
		outputOffsetIndex = sharedDesc->deltaOutOffset;
	}
	__syncthreads();

	//const uint blockStride = (blockDim.x * gridDim.x);

	//This identifies which warp a specific thread is in
	const uint threadIndex = (threadIdx.x + blockIdx.x * blockDim.x);

	//for(uint a = threadIndex+bodyStartIndex; a < bodyEndIndex; a+=blockStride)
	uint a = threadIndex+bodyStartIndex;
	if(a < bodyEndIndex)
	{
		motionVelocities[a]						= bodyVelocities[a + outputOffsetIndex];		//Linear velocity
		motionVelocities[a+totalBodyCount]		= bodyVelocities[a + outputOffsetIndex + totalBodyCount];	//Angular velocity stored after all linear velocities
	}
}

extern "C" __global__ void computeAverageSolverBodyVelocity(
	const PxgSolverCoreDesc* const PX_RESTRICT solverDesc,
	const PxgSolverSharedDesc<IterativeSolveData>* PX_RESTRICT sharedDesc)
{
	//we need to fill in the writeIndex for the solver body data
	const PxgSolverReferences* const PX_RESTRICT solverReferences = solverDesc->solverBodyReferences;
	float4* bodyVelocities = sharedDesc->iterativeData.solverBodyVelPool;

	PxU32* encodedReferenceCount = sharedDesc->iterativeData.solverEncodedReferenceCount;
	const PxU32 totalBodyCount = solverDesc->islandContextPool->mBodyCount; //nbBodies minus offset!
	const PxU32 bodyOffset = solverDesc->islandContextPool->mBodyStartIndex;
	const PxU32 numSlabs = solverDesc->numSlabs;
	const PxU32 numBatches = solverDesc->numBatches;
	const PxU32 numArticBatches = solverDesc->numArticBatches;
	const PxU32 deltaVOffset = solverDesc->accumulatedBodyDeltaVOffset;

	const PxU32 averageOutputOffsets = (numBatches + numArticBatches) * PXG_BATCH_SIZE * 2 * 2;

	const PxU32 numThreadsPerBody = PxMin(isPowerOfTwo(numSlabs) ? numSlabs : nextPowerOfTwo(numSlabs), 32u);
	const PxU32 numBodiesPerWarp = WARP_SIZE / numThreadsPerBody;

	const uint warpIndex = (blockIdx.x * blockDim.y + threadIdx.y);

	const uint warpBodyStartIndex = warpIndex * numBodiesPerWarp;

	const PxU32 threadIdInWorkUnit = threadIdx.x & (numThreadsPerBody - 1);
	const PxU32 subWarpIndex = threadIdx.x / numThreadsPerBody;

	const PxU32 bodyInWarp = threadIdx.x / numThreadsPerBody;

	const PxU32 maskHigh = ((1 << ((subWarpIndex + 1) * numThreadsPerBody)) - 1);
	const PxU32 maskLow = ((1 << ((subWarpIndex)*numThreadsPerBody)) - 1);
	const PxU32 mask = maskHigh - maskLow;

	const uint bodyId = warpBodyStartIndex + bodyInWarp;

	if (bodyId < totalBodyCount)
	{
		float4 linDelta = make_float4(0.f);
		float4 angDelta = make_float4(0.f);

		const PxU32 outputBody = deltaVOffset + bodyId + bodyOffset;

		//Store linear and angular velocity!!!
		const float4 lastLinearVel = bodyVelocities[outputBody];
		const float4 lastAngularVel = bodyVelocities[outputBody + totalBodyCount + bodyOffset];

		bool hasRef = false;
		//get out velocity for the correponding body in a slab and put on weight to calculate the final velocity for that body
		for (uint b = threadIdInWorkUnit; b < numSlabs; b += numThreadsPerBody)
		{
			const uint bodyIndex = bodyId * numSlabs + b;
			PxgSolverReferences reference = solverReferences[bodyIndex];

			PxU32 remappedBodyIndex = reference.mRemappedBodyIndex;

			if (remappedBodyIndex != 0xFFFFFFFF)
			{
				//Outputs are grouped into little clusters of 32...
				const uint velIndex = averageOutputOffsets + ComputeAverageBodyBatchStartIndex(bodyIndex);

				const float4 curLinearDeltaV = bodyVelocities[velIndex];
				const float4 curAngularDeltaV = bodyVelocities[velIndex + 32];

				float4 lDelta = (curLinearDeltaV - lastLinearVel);
				float4 aDelta = (curAngularDeltaV - lastAngularVel);

				{
					linDelta += lDelta;
					angDelta += aDelta;
				}
				hasRef = true;
			}
		}

		//Now do the reduction...
#pragma unroll
		for (PxU32 reductionRadius = numThreadsPerBody >> 1; reductionRadius > 0; reductionRadius >>= 1)
		{
			linDelta.x += __shfl_xor_sync(mask, linDelta.x, reductionRadius, numThreadsPerBody);
			linDelta.y += __shfl_xor_sync(mask, linDelta.y, reductionRadius, numThreadsPerBody);
			linDelta.z += __shfl_xor_sync(mask, linDelta.z, reductionRadius, numThreadsPerBody);

			angDelta.x += __shfl_xor_sync(mask, angDelta.x, reductionRadius, numThreadsPerBody);
			angDelta.y += __shfl_xor_sync(mask, angDelta.y, reductionRadius, numThreadsPerBody);
			angDelta.z += __shfl_xor_sync(mask, angDelta.z, reductionRadius, numThreadsPerBody);
		}

		hasRef = (__ballot_sync(mask, hasRef) & mask);


		if (hasRef)
		{
			if (threadIdInWorkUnit == 0)
			{
				// Counting the number of active slabs
				const PxU32 numDynamicBodies = solverDesc->islandContextPool->mBodyCount; //nbBodies minus offset!
				const PxU32 numArticulations = solverDesc->islandContextPool->mArticulationCount;
				const PxU32 numTotalBodies = bodyOffset + numDynamicBodies + numArticulations;

				const PxU32 referenceCount =
				    countActiveSlabs(bodyOffset + bodyId, solverDesc->numSlabs, numTotalBodies, encodedReferenceCount);
				const PxReal recipRefs = 1.f / static_cast<PxReal>(referenceCount);

				linDelta = linDelta * recipRefs;
				angDelta = angDelta * recipRefs;

				// Resetting rigid body reference count
				resetSlabCount(bodyOffset + bodyId, solverDesc->numSlabs, numTotalBodies, encodedReferenceCount);

				//Store linear and angular velocity!!!
				bodyVelocities[outputBody] = lastLinearVel + linDelta;
				bodyVelocities[outputBody + totalBodyCount + bodyOffset] = lastAngularVel + angDelta;
			}
		}
	}
}

extern "C" __global__ void propagateSolverBodyVelocity(
	const PxgSolverCoreDesc* solverDesc,
	const PxgSolverSharedDesc<IterativeSolveData>* sharedDesc)
{
	//we need to fill in the writeIndex for the solver body data
	PxgSolverReferences* solverReferences = solverDesc->solverBodyReferences;
	float4* bodyVelocities = sharedDesc->iterativeData.solverBodyVelPool;
	const PxU32 totalBodyCount = solverDesc->islandContextPool->mBodyCount; //nbBodies minus offset!
	const PxU32 bodyOffset = solverDesc->islandContextPool->mBodyStartIndex; //nbBodies minus offset!
	const PxU32 numSlabs = solverDesc->numSlabs;
	const PxU32 deltaVOffset = solverDesc->accumulatedBodyDeltaVOffset;

	//This identifies which warp a specific thread is in
	const uint threadIndex = (threadIdx.x + blockIdx.x * blockDim.x);

	const PxU32 numThreadsPerBody = PxMin(isPowerOfTwo(numSlabs) ? numSlabs : nextPowerOfTwo(numSlabs), 32u);
	const PxU32 numBodiesPerWarp = WARP_SIZE / numThreadsPerBody;

	const PxU32 warpIndex = threadIndex / WARP_SIZE;

	const uint warpBodyStartIndex = warpIndex * numBodiesPerWarp;

	const PxU32 threadIdInWorkUnit = threadIdx.x&(numThreadsPerBody - 1);

	const PxU32 bodyInWarp = (threadIdx.x&31) / numThreadsPerBody;

	const uint bodyId = warpBodyStartIndex + bodyInWarp;

	if(bodyId < totalBodyCount)
	{
		const PxU32 outputBody = deltaVOffset + bodyId + bodyOffset;

		//Store linear and angular velocity!!!
		float4 linVel = bodyVelocities[outputBody];
		float4 angVel = bodyVelocities[outputBody + totalBodyCount + bodyOffset];
		linVel.w = 0.f;

		for(uint b=threadIdInWorkUnit; b<numSlabs; b += numThreadsPerBody)
		{
			const uint bodyIndex = (bodyId)*numSlabs + b;

			PxgSolverReferences reference = solverReferences[bodyIndex];
			PxU32 remappedBodyIndex = reference.mRemappedBodyIndex;
			if(remappedBodyIndex != 0xFFFFFFFF)
			{
				const uint velIndex = ComputeBodyBatchStartIndex(remappedBodyIndex);
				bodyVelocities[velIndex] = linVel;
				bodyVelocities[velIndex + 32] = angVel;
			}
		}
	}
}

extern "C" __global__ void dmaBackChangedElems(const PxgSolverCoreDesc* solverDesc, Dy::ThresholdStreamElement* hostChangedElems)
{
	Dy::ThresholdStreamElement* changeElems = solverDesc->forceChangeThresholdElements;
	PxU32 nbElemsChanges = solverDesc->nbForceChangeElements;

	PxU32 nbThreadsRequired = (sizeof(Dy::ThresholdStreamElement) * nbElemsChanges)/sizeof(PxU32);

	PxU32* src = reinterpret_cast<PxU32*>(changeElems);
	PxU32* dst = reinterpret_cast<PxU32*>(hostChangedElems);

	PxU32 globalThreadIdx = threadIdx.x + blockIdx.x * blockDim.x;

	for(PxU32 i = globalThreadIdx; i < nbThreadsRequired; i+= blockDim.x * gridDim.x)
	{
		dst[i] = src[i];
	}
}

extern "C" __global__ void dmaConstraintResidual(const PxgConstraintWriteback* writebacks, PxReal* residuals, PxU32 count)
{
	PxU32 globalThreadIdx = threadIdx.x + blockIdx.x * blockDim.x;

	if (globalThreadIdx < count)
	{
		residuals[globalThreadIdx] = writebacks[globalThreadIdx].angularImpulse_residual.w;
	}
}

extern "C" __global__
//__launch_bounds__(PxgKernelBlockDim::SOLVE_BLOCK_PARTITION, 16)
void solveStaticBlock(
	PxgSolverCoreDesc* PX_RESTRICT solverDesc, const PxgSolverSharedDesc<IterativeSolveData>* PX_RESTRICT sharedDesc,
	const PxU32 islandIndex, const PxU32 nbStaticSlabs, const PxU32 maxStaticPartitions, bool doFriction)
{
	const PxgIslandContext& island = solverDesc->islandContextPool[islandIndex];
	const IterativeSolveData& iterativeData = sharedDesc->iterativeData;

	float4* PX_RESTRICT bodyVelocities = iterativeData.solverBodyVelPool;
	float4* PX_RESTRICT bodyOutVelocities = iterativeData.tempStaticBodyOutputPool;
	const PxU32 numDynamicBodies = island.mBodyCount; //nbBodies minus offset!
	const PxU32 bodyOffset = island.mBodyStartIndex;
	const PxU32 deltaVOffset = solverDesc->accumulatedBodyDeltaVOffset;
	const PxU32 totalBodiesIncKinematics = numDynamicBodies + bodyOffset;

	const PxU32 numDynamicBodiesRounded = (island.mBodyCount + 31)&(~31); //Rounded to a multiple of 32

	const uint warpSize = 32;

	//This identifies which warp a specific thread is in
	const uint globalThreadIdx = (threadIdx.x + blockIdx.x * blockDim.x);

	//This identifies which thread within a warp a specific thread is
	const uint threadIndexInWarp = threadIdx.x&(warpSize - 1);

	const PxU32 nbDynamicBodiesToSolve = numDynamicBodiesRounded * nbStaticSlabs;

	const uint bodyIndex = globalThreadIdx % numDynamicBodiesRounded;
	const PxU32 slabIdx = (globalThreadIdx / numDynamicBodiesRounded);
	const PxU32 startIndex = slabIdx * maxStaticPartitions;

	bool residualAccumulationEnabled = solverDesc->contactErrorAccumulator.mCounter >= 0;
	PxgErrorAccumulator error;

	if (globalThreadIdx < nbDynamicBodiesToSolve)
	{
		if (bodyIndex < numDynamicBodies)
		{
			PxU32 contactCount = PxMin(maxStaticPartitions, PxMax(startIndex, solverDesc->mRigidStaticContactCounts[bodyIndex]) - startIndex);
			PxU32 jointCount = PxMin(maxStaticPartitions, PxMax(startIndex, solverDesc->mRigidStaticJointCounts[bodyIndex]) - startIndex);

			const PxU32 outputBody = slabIdx * totalBodiesIncKinematics * 2;

			//printf("BodyIndex = %i, globalThreadIdx = %i, nbDynamicBodiesToSolve = %i, nbStaticSlabs = %i, contactCount = %i, startIndex = %i, outputBody = %i\n", bodyIndex, globalThreadIdx,
			//	nbDynamicBodiesToSolve, nbStaticSlabs, contactCount, startIndex, outputBody);

			if (contactCount != 0 || jointCount != 0)
			{
				//We have some constraints to solve...

				const PxU32 startContactIndex = solverDesc->mRigidStaticContactStartIndices[bodyIndex] + startIndex;
				const PxU32 startJointIndex = solverDesc->mRigidStaticJointStartIndices[bodyIndex] + startIndex;

				assert(startContactIndex >= solverDesc->numBatches);

				const PxU32 inputBody = deltaVOffset + bodyIndex + bodyOffset;

				//Load in velocity data...
				float4 linVel = bodyVelocities[inputBody];
				float4 angVel = bodyVelocities[inputBody + totalBodiesIncKinematics];

				PxVec3 lv0(linVel.x, linVel.y, linVel.z);
				PxVec3 lv1(0.f);
				PxVec3 av0(angVel.x, angVel.y, angVel.z);
				PxVec3 av1(0.f);

				for (PxU32 i = 0; i < jointCount; ++i)
				{
					const PxgBlockConstraintBatch& batch = iterativeData.blockConstraintBatch[startJointIndex + i];

					assert(batch.constraintType == PxgSolverConstraintDesc::eCONSTRAINT_1D);

					PxU32 idx = warpScanExclusive(batch.mask, threadIndexInWarp);

					// For interaction with static objects, mass-splitting is not used; thus, reference counts are 1.
					solve1DBlock(batch, lv0, av0, lv1, av1, idx, iterativeData.blockJointConstraintHeaders, iterativeData.blockJointConstraintRowsCon,
						iterativeData.blockJointConstraintRowsMod, solverDesc->contactErrorAccumulator.mCounter >= 0, 1.f, 1.f);
				}

				for (PxU32 i = 0; i < contactCount; ++i)
				{
					const PxgBlockConstraintBatch& batch = iterativeData.blockConstraintBatch[startContactIndex + i];

					assert(batch.constraintType == PxgSolverConstraintDesc::eCONTACT);

					PxU32 idx = warpScanExclusive(batch.mask, threadIndexInWarp);

					// For interaction with static objects, mass-splitting is not used; thus, reference counts are 1.
					solveContactBlock(batch, lv0, av0, lv1, av1, doFriction, idx, iterativeData.blockContactHeaders, iterativeData.blockFrictionHeaders,
						iterativeData.blockContactPoints, iterativeData.blockFrictions, residualAccumulationEnabled ? &error : NULL, 
						1.f, 1.f);
				}

				//if (globalThreadIdx == 33)
				////if(startContactIndex == (solverDesc->numBatches))
				////if(warpScanExclusive(iterativeData.blockConstraintBatch[startContactIndex].mask, threadIndexInWarp) == 0)
				//{
				//	printf("%i: NumContacts = %i, beforeVel = (%f, %f, %f), afterlinVel = (%f, %f, %f), lv1 (%f, %f, %f), av1(%f, %f, %f), startContactIndex = %i, numBatches = %i\n", 
				//		globalThreadIdx, contactCount, linVel.x, linVel.y, linVel.z, lv0.x, lv0.y, lv0.z, lv1.x, lv1.y, lv1.z, av1.x, av1.y, av1.z, startContactIndex, solverDesc->numBatches);
				//}

				linVel.x = lv0.x; linVel.y = lv0.y; linVel.z = lv0.z;
				angVel.x = av0.x; angVel.y = av0.y; angVel.z = av0.z;

				/*printf("%i: BodyOutVelocities[%i] = (%f, %f, %f, %f), bodyOutVelocities[%i] = (%f, %f, %f, %f)\n",
					globalThreadIdx, bodyIndex + outputBody, linVel.x, linVel.y, linVel.z, linVel.w,
					bodyIndex + outputBody + totalBodiesIncKinematics, angVel.x, angVel.y, angVel.z, angVel.w);*/

				bodyOutVelocities[bodyIndex + outputBody] = linVel;
				bodyOutVelocities[bodyIndex + outputBody + totalBodiesIncKinematics] = angVel;

			}
		}
	}
	if (residualAccumulationEnabled)
	{
		error.accumulateErrorGlobalFullWarp(solverDesc->contactErrorAccumulator, threadIndexInWarp);
	}
}


extern "C" __global__
//__launch_bounds__(PxgKernelBlockDim::SOLVE_BLOCK_PARTITION, 16)
void propagateStaticSolverBodyVelocities(
	const PxgSolverCoreDesc* PX_RESTRICT solverDesc, const PxgSolverSharedDesc<IterativeSolveData>* PX_RESTRICT sharedDesc,
	const PxU32 islandIndex, const PxU32 nbStaticSlabs, const PxU32 maxStaticPartitions)
{
	const PxgIslandContext& island = solverDesc->islandContextPool[islandIndex];
	const IterativeSolveData& iterativeData = sharedDesc->iterativeData;

	float4* PX_RESTRICT bodyVelocities = iterativeData.solverBodyVelPool;
	float4* PX_RESTRICT bodyOutVelocities = iterativeData.tempStaticBodyOutputPool;
	const PxU32 numDynamicBodies = island.mBodyCount; //nbBodies minus offset!
	const PxU32 bodyOffset = island.mBodyStartIndex;
	const PxU32 deltaVOffset = solverDesc->accumulatedBodyDeltaVOffset;
	const PxU32 totalBodiesIncKinematics = numDynamicBodies + bodyOffset;

	//This identifies which warp a specific thread is in
	const uint globalThreadIdx = (threadIdx.x + blockIdx.x * blockDim.x);

	PxU32 contactCount = 0, jointCount = 0;

	if (globalThreadIdx < numDynamicBodies)
	{
		contactCount = solverDesc->mRigidStaticContactCounts[globalThreadIdx];
		jointCount = solverDesc->mRigidStaticJointCounts[globalThreadIdx];

		if (contactCount || jointCount)
		{
			const PxU32 maxOutputs = PxMax(contactCount, jointCount);

			//We have velocity changes we need to propagate!
			const PxU32 outputBody = deltaVOffset + globalThreadIdx + bodyOffset;

			PxReal scale = 0.f;
			float4 vel0 = make_float4(0.f);
			float4 vel1 = make_float4(0.f);
			for (PxU32 i = 0, index = globalThreadIdx; i < maxOutputs; i += maxStaticPartitions,
				index += totalBodiesIncKinematics * 2, scale += 1.0f)
			{
				//We have velocity changes we need to propagate!
				vel0 += bodyOutVelocities[index];
				vel1 += bodyOutVelocities[index + totalBodiesIncKinematics];
			}

			scale = 1.f / scale;

			/*printf("BodyOutVelocities[%i] = (%f, %f, %f, %f), bodyOutVelocities[%i] = (%f, %f, %f, %f), scale = %f\n",
				globalThreadIdx + bodyOffset, vel0.x, vel0.y, vel0.z, vel0.w,
				globalThreadIdx + bodyOffset + totalBodiesIncKinematics, vel1.x, vel1.y, vel1.z, vel1.w, scale);*/

			bodyVelocities[outputBody] = vel0* scale;
			bodyVelocities[outputBody + totalBodiesIncKinematics] = vel1* scale;
		}
	}
}