XCEngine/engine/third_party/physx/source/gpuarticulation/src/CUDA/internalConstraints2.cu

// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#include "foundation/PxMath.h"
#include "foundation/PxSimpleTypes.h"

#include "DyCpuGpuArticulation.h"
#include "DyFeatherstoneArticulation.h"
#include "DyFeatherstoneArticulationUtils.h"
#include "PxSpatialMatrix.h"
#include "PxgArticulationCoreDesc.h"
#include "PxgArticulationCoreKernelIndices.h"
#include "PxgSolverConstraintDesc.h"
#include "PxgSolverCoreDesc.h"
#include "articulationDynamic.cuh"
#include "articulationImpulseResponse.cuh"
#include "reduction.cuh"
#include "solver.cuh"
#include "solverBlock.cuh"
#include "solverBlockTGS.cuh"
#include "utils.cuh"

using namespace physx;
using namespace Dy;

// Intentionally empty host function with C linkage.
// NOTE(review): presumably called from host code so that this translation unit (and its device code) is linked in - confirm against the caller.
extern "C" __host__ void initArticulationKernels2() {}

/**
\brief Propagate a spatial impulse across a single joint dof and translate the result by childToParent.
\param[in] isInvD is scaled by the projection of Z onto sa and subtracted from Z.
\param[in] childToParent is the offset passed to FeatherstoneArticulation::translateSpatialVector.
\param[in] sa is the motion vector of the dof; Z is projected onto it.
\param[in] Z is the impulse to propagate.
\param[out] qstZ receives the negated projection of Z onto sa.
\return The translated impulse (Z - isInvD * (sa . Z)).
*/
static __device__ Cm::UnAlignedSpatialVector propagateImpulseDofAligned(const Cm::UnAlignedSpatialVector& isInvD, const PxVec3& childToParent,
	const Cm::UnAlignedSpatialVector& sa, const Cm::UnAlignedSpatialVector& Z, PxReal& qstZ)
{
	//Component of the impulse acting along the joint dof.
	const PxReal axisImpulse = sa.innerProduct(Z);

	//Remove the part of the impulse that is absorbed by the joint dof.
	const Cm::UnAlignedSpatialVector remainingImpulse = Z - isInvD * axisImpulse;

	qstZ = -axisImpulse;

	//parent space's spatial zero acceleration impulse
	return FeatherstoneArticulation::translateSpatialVector(childToParent, remainingImpulse);
}

/**
\brief Compute the velocity change of a parent link and its child link for a pair of impulses, propagating the child impulse through a single dof only.
\param[in] impulse0 is the impulse applied to the parent link.
\param[in] impulse1 is the impulse applied to the child link.
\param[out] deltaV0 receives the velocity change of the parent link.
\param[out] deltaV1 receives the value returned by propagateAccelerationW for the child link.
\param[in] thisDof is the dof whose mIsInvDW term is used for the upward propagation.
\param[in] dofData is the first dof of the joint; it is forwarded to propagateAccelerationW.
\param[in] childToParent is the offset used when translating spatial vectors between the two links.
\param[in] parentSpatialResponse maps the net parent impulse to the parent velocity change.
\param[in] thisMotionVector is the motion vector of thisDof.
\param[in] dofCount is the number of dofs of the joint.
\param[in] dofIndex is the slot (0..2) of thisDof in the local qstZ array.
\param[in] threadIndexInWarp selects the lane of the block data that this thread works on.
*/
static __device__ void getImpulseSelfResponseDofAligned(
	const Cm::UnAlignedSpatialVector& impulse0,
	const Cm::UnAlignedSpatialVector& impulse1,
	Cm::UnAlignedSpatialVector& deltaV0,
	Cm::UnAlignedSpatialVector& deltaV1,
	const PxgArticulationBlockDofData& thisDof,
	const PxgArticulationBlockDofData* dofData,
	const PxVec3& childToParent,
	const PxSpatialMatrix& parentSpatialResponse,
	const Cm::UnAlignedSpatialVector& thisMotionVector,
	const PxU32 dofCount,
	const PxU32 dofIndex,
	const PxU32 threadIndexInWarp)
{
	//Negated child impulse.
	Cm::UnAlignedSpatialVector childZ(-impulse1.top, -impulse1.bottom);

	const Cm::UnAlignedSpatialVector dofIsInvD = loadSpatialVector(thisDof.mIsInvDW, threadIndexInWarp);

	//Only the slot of the active dof is written by the upward propagation; the other slots remain zero.
	PxReal jointTerms[3] = { 0.f, 0.f, 0.f };
	Cm::UnAlignedSpatialVector parentZ = propagateImpulseDofAligned(dofIsInvD, childToParent, thisMotionVector, childZ, jointTerms[dofIndex]);

	//Net impulse on the parent and the resulting parent velocity change.
	const Cm::UnAlignedSpatialVector parentDeltaV = parentSpatialResponse * (impulse0 - parentZ);

	//Carry the parent velocity change back down to the child.
	deltaV1 = propagateAccelerationW(childToParent, dofData, parentDeltaV, dofCount, threadIndexInWarp, jointTerms);

	deltaV0 = parentDeltaV;
}

/**
\brief Compute the velocity change of a parent link and its child link for a pair of impulses, propagating the child impulse through all dofs of the joint.
\param[in] impulse0 is the impulse applied to the parent link.
\param[in] impulse1 is the impulse applied to the child link.
\param[out] deltaV0 receives the velocity change of the parent link.
\param[out] deltaV1 receives the value returned by propagateAccelerationW for the child link.
\param[in] dofData is the first dof of the joint connecting the two links.
\param[in] childToParent is the offset used when translating spatial vectors between the two links.
\param[in] parentSpatialResponse maps the net parent impulse to the parent velocity change.
\param[in] dofCount is the number of dofs of the joint.
\param[in] threadIndexInWarp selects the lane of the block data that this thread works on.
*/
static __device__ void getImpulseSelfResponse(
	const Cm::UnAlignedSpatialVector& impulse0,
	const Cm::UnAlignedSpatialVector& impulse1,
	Cm::UnAlignedSpatialVector& deltaV0,
	Cm::UnAlignedSpatialVector& deltaV1,
	const PxgArticulationBlockDofData* dofData,
	const PxVec3& childToParent,
	const PxSpatialMatrix& parentSpatialResponse,
	const PxU32 dofCount,
	const PxU32 threadIndexInWarp)
{
	//Negated child impulse.
	Cm::UnAlignedSpatialVector childZ(-impulse1.top, -impulse1.bottom);

	//Per-dof terms filled in by the upward propagation and consumed by the downward propagation.
	PxReal jointTerms[3] = { 0.f, 0.f, 0.f };
	Cm::UnAlignedSpatialVector parentZ = propagateImpulseW_1(childToParent, dofData, childZ, NULL, dofCount, threadIndexInWarp, jointTerms);

	//Net impulse on the parent and the resulting parent velocity change.
	const Cm::UnAlignedSpatialVector parentDeltaV = parentSpatialResponse * (impulse0 - parentZ);

	//Carry the parent velocity change back down to the child.
	deltaV1 = propagateAccelerationW(childToParent, dofData, parentDeltaV, dofCount, threadIndexInWarp, jointTerms);

	deltaV0 = parentDeltaV;
}

/**
\brief Set up the internal joint constraint data (response rows, drive, limit error and friction terms) for every dof of every non-root link.
\param[in] artiBlock provides the link count and the articulation flags.
\param[in] artiLinks is the per-link data; poses, bias force, cfm and the parent's spatial response matrix are read from it.
\param[in,out] artiDofs is the per-dof data; the mConstraintData of each dof is written.
\param[in] stepDt scales the friction coefficient and is the drive timestep when isTGSSolver is true.
\param[in] dt is the drive timestep when isTGSSolver is false and scales the maximum drive force when eDRIVE_LIMITS_ARE_FORCES is set.
\param[in] invDt is unused.
\param[in] isTGSSolver is forwarded to computeImplicitDriveParams and selects the drive timestep.
\param[in] threadIndexInWarp selects the lane of the block data that this thread works on.
*/
static __device__ void setupInternalConstraints(PxgArticulationBlockData& artiBlock,
	PxgArticulationBlockLinkData* PX_RESTRICT artiLinks,
	PxgArticulationBlockDofData* PX_RESTRICT artiDofs,
	const PxReal stepDt, const PxReal dt, const PxReal invDt, bool isTGSSolver, const PxU32 threadIndexInWarp)
{
	const PxU32 numLinks = artiBlock.mNumLinks[threadIndexInWarp];

	const PxReal maxForceScale = artiBlock.mFlags[threadIndexInWarp] & PxArticulationFlag::eDRIVE_LIMITS_ARE_FORCES ? dt : 1.f;

	//Walks through the dofs of all links; advanced by nbDofs after each link.
	PxgArticulationBlockDofData* PX_RESTRICT dofs = artiDofs;

	//One PxVec3 slot per lane, indexed by threadIndexInWarp.
	//NOTE(review): the slot index ignores the warp index, so this assumes that at most one warp per block
	//executes this function at a time - confirm against the kernel launch configuration.
	__shared__ char sDriveError[sizeof(PxVec3) * WARP_SIZE];
	PxVec3* driveError = reinterpret_cast<PxVec3*>(sDriveError);

	//KS - we skip link 0 because it does not have any joints
	for (PxU32 linkID = 1; linkID < numLinks; ++linkID)
	{
		PxgArticulationBlockLinkData& link = artiLinks[linkID];

		const PxU32 nbDofs = link.mDofs[threadIndexInWarp];
		const PxU32 parent = link.mParents[threadIndexInWarp];

		if (nbDofs)
		{
			const PxVec3 child2Parent(link.mRw_x[threadIndexInWarp], link.mRw_y[threadIndexInWarp], link.mRw_z[threadIndexInWarp]);

			//Shared memory is not initialised and the slot still holds the value of the previous link.
			//Reset it so that an angular dof whose error is not computed below reads zero rather than stale data.
			driveError[threadIndexInWarp] = PxVec3(0.f);

			//KS - maxFrictionForce stores the friction coefficient...
			const PxReal frictionCoefficient = dofs->mConstraintData.mFrictionCoefficient[threadIndexInWarp] * stepDt;

			const PxU32 jointType = link.mJointType[threadIndexInWarp];
			const bool isAngularConstraint = jointType == PxArticulationJointType::eREVOLUTE || jointType == PxArticulationJointType::eREVOLUTE_UNWRAPPED || jointType == PxArticulationJointType::eSPHERICAL;

			if (isAngularConstraint)
			{
				if (nbDofs > 1)
				{
					//Multi-dof angular joint: compute the drive error from the relative rotation of the joint frames.
					const float4 cRot = link.mAccumulatedPose.q[threadIndexInWarp];
					const float4 pRot = artiLinks[parent].mAccumulatedPose.q[threadIndexInWarp];
					const float4 parentQ = link.mParentPose.q[threadIndexInWarp];
					const float4 childQ = link.mChildPose.q[threadIndexInWarp];

					const PxQuat cA2w = PxQuat(pRot.x, pRot.y, pRot.z, pRot.w) * PxQuat(parentQ.x, parentQ.y, parentQ.z, parentQ.w);
					const PxQuat cB2w = PxQuat(cRot.x, cRot.y, cRot.z, cRot.w) * PxQuat(childQ.x, childQ.y, childQ.z, childQ.w);

					PxQuat qB2qA = cA2w.getConjugate() * cB2w;

					//Accumulate the target rotation as axis * angle over all driven dofs.
					PxVec3 driveAxis(0.f);

					bool hasAngularDrives = false;

					for (PxU32 i = 0; i < nbDofs; ++i)
					{
						const bool hasDrive = dofs[i].mMotion[threadIndexInWarp] != PxArticulationMotion::eLOCKED &&
							dofs[i].mConstraintData.mDriveType[threadIndexInWarp] != PxArticulationDriveType::eNONE;

						if (hasDrive)
						{
							const float4 top = dofs[i].mLocalMotionMatrix.mTopxyz_bx[threadIndexInWarp];
							const PxReal targetPos = dofs[i].mConstraintData.mDriveTargetPos[threadIndexInWarp];
							driveAxis += PxVec3(top.x, top.y, top.z)*targetPos;
							hasAngularDrives = true;
						}
					}

					if (hasAngularDrives)
					{
						PxReal angle = driveAxis.normalize();

						//Fall back to an identity rotation about an arbitrary axis when the target is (almost) zero.
						if (angle < 1e-12f)
						{
							driveAxis = PxVec3(1.f, 0.f, 0.f);
							angle = 0.f;
						}

						PxQuat targetQ = PxQuat(angle, driveAxis);

						//Pick the target quaternion on the same hemisphere as the current relative rotation.
						if (targetQ.dot(qB2qA) < 0.f)
							targetQ = -targetQ;

						driveError[threadIndexInWarp] = -2.f * (targetQ.getConjugate() * qB2qA).getImaginaryPart();
					}
				}
				else
				{
					//Single-dof angular joint: the drive error is the difference between target and joint position.
					if (dofs[0].mMotion[threadIndexInWarp] != PxArticulationMotion::eLOCKED)
					{
						const PxU32 singleDofId = dofs[0].mDofIds[threadIndexInWarp];
						const PxReal jointPos = dofs[0].mJointPositions[threadIndexInWarp];
						driveError[threadIndexInWarp][singleDofId] = dofs[0].mConstraintData.mDriveTargetPos[threadIndexInWarp] - jointPos;
					}
				}
			}

			//Per-link terms shared by all dofs of the joint.
			PxSpatialMatrix parentResponse;
			loadSpatialMatrix(artiLinks[parent].mSpatialResponseMatrix, threadIndexInWarp, parentResponse);
			Cm::UnAlignedSpatialVector bias = loadSpatialVector(link.mBiasForce, threadIndexInWarp);
			const PxReal transmissionForce = bias.magnitude() * frictionCoefficient;
			const PxReal cfm = PxMax(artiLinks[parent].mCfm[threadIndexInWarp], link.mCfm[threadIndexInWarp]);

			for (PxU32 i = 0; i < nbDofs; ++i)
			{
				const PxU32 motion = dofs[i].mMotion[threadIndexInWarp];
				const PxU32 dofId = dofs[i].mDofIds[threadIndexInWarp];

				const PxReal maxForce = dofs[i].mConstraintData.mMaxForce[threadIndexInWarp];
				const PxReal maxEffort = dofs[i].mConstraintData.mMaxEffort[threadIndexInWarp];

				const PxReal stiffness = dofs[i].mConstraintData.mDriveStiffness[threadIndexInWarp];
				const PxReal damping = dofs[i].mConstraintData.mDamping[threadIndexInWarp];

				//Generate the response vectors...
				const Cm::UnAlignedSpatialVector worldMotionVector = loadSpatialVector(dofs[i].mWorldMotionMatrix, threadIndexInWarp);

				Cm::UnAlignedSpatialVector axis0, axis1;
				const bool isLinear = dofId >= PxArticulationAxis::eX;
				const PxReal position = dofs[i].mJointPositions[threadIndexInWarp];
				PxReal error;
				if (!isLinear)
				{
					//Angular constraint...
					axis0.top = PxVec3(0.f);
					axis0.bottom = worldMotionVector.top;
					axis1.top = PxVec3(0.f);
					axis1.bottom = -worldMotionVector.top;
					error = driveError[threadIndexInWarp][dofId];
				}
				else
				{
					//KS - the following are only required for linear constraints or locked axes...
					const float4 cRot = link.mAccumulatedPose.q[threadIndexInWarp];
					const float4 pRot = artiLinks[parent].mAccumulatedPose.q[threadIndexInWarp];

					const float4 parentP = link.mParentPose.p[threadIndexInWarp];
					const float4 childP = link.mChildPose.p[threadIndexInWarp];
					const PxVec3 cA2w = PxQuat(pRot.x, pRot.y, pRot.z, pRot.w).rotate(PxVec3(parentP.x, parentP.y, parentP.z));
					const PxVec3 cB2w = PxQuat(cRot.x, cRot.y, cRot.z, cRot.w).rotate(PxVec3(childP.x, childP.y, childP.z));

					const PxVec3 axis = worldMotionVector.bottom;
					const PxVec3 ang0 = cA2w.cross(axis);
					const PxVec3 ang1 = cB2w.cross(axis);

					axis0.top = axis;
					axis0.bottom = ang0;
					axis1.top = -axis;
					axis1.bottom = -ang1;

					//Use the drive target of this dof (not of dof 0) so that target and position refer to the same dof.
					error = dofs[i].mConstraintData.mDriveTargetPos[threadIndexInWarp] - position;
				}

				Cm::UnAlignedSpatialVector deltaVA, deltaVB;

				getImpulseSelfResponseDofAligned(axis0, axis1,
					deltaVA, deltaVB, dofs[i], dofs, child2Parent, parentResponse,
					worldMotionVector, nbDofs, i, threadIndexInWarp);

				const PxReal r0 = deltaVA.innerProduct(axis0);
				const PxReal r1 = deltaVB.innerProduct(axis1);

				const PxReal unitResponse = r1 + r0;

				const PxReal recipResponse = 1.0f / (unitResponse + cfm);

				dofs[i].mConstraintData.mMaxFrictionForce[threadIndexInWarp] = transmissionForce;

				//Set up limits...
				if (motion == PxArticulationMotion::eLIMITED)
				{
					const float2 limit = dofs[i].mConstraintData.mLimits_LowLimitX_highLimitY[threadIndexInWarp];

					dofs[i].mConstraintData.mLimitError_LowX_highY[threadIndexInWarp] = make_float2(position - limit.x, limit.y - position);
				}

				//Set up drive...
				if ((maxForce > 0.0f || maxEffort > 0.0f) && (stiffness > 0.0f || damping > 0.0f))
				{
					const PxReal targetVelocity = dofs[i].mConstraintData.mDriveTargetVel[threadIndexInWarp];
					const PxArticulationDriveType::Enum type = PxArticulationDriveType::Enum(dofs[i].mConstraintData.mDriveType[threadIndexInWarp]);
					dofs[i].mConstraintData.setImplicitDriveDesc(threadIndexInWarp,
						computeImplicitDriveParams(
								type, stiffness, damping,
								isTGSSolver ? stepDt : dt, dt,
								unitResponse, recipResponse,
								error, targetVelocity,
								isTGSSolver));
					dofs[i].mConstraintData.mConstraintMaxImpulse[threadIndexInWarp] = maxForce * maxForceScale;
				}
				else
				{
					//Zero drives...
					dofs[i].mConstraintData.setImplicitDriveDesc(threadIndexInWarp, ArticulationImplicitDriveDesc(PxZero));
					dofs[i].mConstraintData.mConstraintMaxImpulse[threadIndexInWarp] = 0.f;
				}

				//Store the constraint rows and responses and reset the accumulated impulses.
				dofs[i].mConstraintData.mDriveImpulse[threadIndexInWarp] = 0.f;
				storeSpatialVector(dofs[i].mConstraintData.mDeltaVA, deltaVA, threadIndexInWarp);
				storeSpatialVector(dofs[i].mConstraintData.mDeltaVB, deltaVB, threadIndexInWarp);
				storeSpatialVector(dofs[i].mConstraintData.mRow0, axis0, threadIndexInWarp);
				storeSpatialVector(dofs[i].mConstraintData.mRow1, -axis1, threadIndexInWarp);
				dofs[i].mConstraintData.mRecipResponse[threadIndexInWarp] = recipResponse;
				dofs[i].mConstraintData.mResponse[threadIndexInWarp] = unitResponse;
				dofs[i].mConstraintData.mAccumulatedFrictionImpulse[threadIndexInWarp] = 0.0f;
				dofs[i].mConstraintData.mLowImpulse[threadIndexInWarp] = 0.f;
				dofs[i].mConstraintData.mHighImpulse[threadIndexInWarp] = 0.f;
			}
		}
		dofs += nbDofs;
	}
}

/**
\brief Gather the per-dof terms of one joint that are needed to propagate impulses inwards and accelerations outwards.
\param[in] artiDofs is the dof array of the articulation.
\param[in] jointOffset is the index of the first dof of the joint in artiDofs.
\param[in] nbDofs is the number of dofs of the joint.
\param[in] threadIndexInWarp selects the lane of the block data that this thread works on.
\param[out] motionMatrixW receives mWorldMotionMatrix of each dof.
\param[out] IsInvSTISW receives mIsInvDW of each dof.
\param[out] ISW receives mIsW of each dof.
\param[out] invSTISW receives mInvStIsT_x/y/z of dof i in invStIs[0][i], invStIs[1][i] and invStIs[2][i].
*/
static __device__ void loadSpatialVectorsForPropagationInwardsAndOutwards
(const PxgArticulationBlockDofData* PX_RESTRICT artiDofs, const PxU32 jointOffset, const PxU32 nbDofs, const PxU32 threadIndexInWarp,
 Cm::UnAlignedSpatialVector* PX_RESTRICT motionMatrixW, Cm::SpatialVectorF* PX_RESTRICT IsInvSTISW, Cm::SpatialVectorF* PX_RESTRICT ISW, InvStIs& invSTISW)
{
	//Gather some important terms in the form we need.
	//Note propagateAccelerationW cpu/gpu computes jAccel[i] = invSTISW[0][i] * j[0] + invSTISW[1][i] * j[1] + invSTISW[2][i] * j[2].
	//Note 	propagateAccelerationWNoJVelUpdate computes jAccel[i] = invStIsT[i].x * j[0] + invStIsT[i].y * j[1] + invStIsT[i].z * j[2]
	//We can conclude:  invStIsT[i].x has index [0][i], invStIsT[i].y has index [1][i] and invStIsT[i].z has index [2][i]
	for(PxU32 dofIndex = 0; dofIndex < nbDofs; ++dofIndex)
	{
		const PxgArticulationBlockDofData& dof = artiDofs[jointOffset + dofIndex];

		motionMatrixW[dofIndex] = loadSpatialVector(dof.mWorldMotionMatrix, threadIndexInWarp);
		IsInvSTISW[dofIndex] = loadSpatialVectorF(dof.mIsInvDW, threadIndexInWarp);
		invSTISW.invStIs[0][dofIndex] = dof.mInvStIsT_x[threadIndexInWarp];
		invSTISW.invStIs[1][dofIndex] = dof.mInvStIsT_y[threadIndexInWarp];
		invSTISW.invStIs[2][dofIndex] = dof.mInvStIsT_z[threadIndexInWarp];
		ISW[dofIndex] = loadSpatialVectorF(dof.mIsW, threadIndexInWarp);
	}
}

/**
\brief Gather the per-dof terms of one joint that are needed to propagate impulses inwards.
\param[in] artiDofs is the dof array of the articulation.
\param[in] jointOffset is the index of the first dof of the joint in artiDofs.
\param[in] nbDofs is the number of dofs of the joint.
\param[in] threadIndexInWarp selects the lane of the block data that this thread works on.
\param[out] motionMatrixW receives mWorldMotionMatrix of each dof.
\param[out] IsInvSTISW receives mIsInvDW of each dof.
*/
static __device__ void loadSpatialVectorsForPropagationInwards
(const PxgArticulationBlockDofData* PX_RESTRICT artiDofs, const PxU32 jointOffset, const PxU32 nbDofs, const PxU32 threadIndexInWarp,
 Cm::UnAlignedSpatialVector* PX_RESTRICT motionMatrixW, Cm::SpatialVectorF* IsInvSTISW)
{
	for(PxU32 dofIndex = 0; dofIndex < nbDofs; ++dofIndex)
	{
		const PxgArticulationBlockDofData& dof = artiDofs[jointOffset + dofIndex];

		motionMatrixW[dofIndex] = loadSpatialVector(dof.mWorldMotionMatrix, threadIndexInWarp);
		IsInvSTISW[dofIndex] = loadSpatialVectorF(dof.mIsInvDW, threadIndexInWarp);
	}
}

/**
\brief Gather the per-dof terms of one joint that are needed to propagate accelerations outwards.
\param[in] artiDofs is the dof array of the articulation.
\param[in] jointOffset is the index of the first dof of the joint in artiDofs.
\param[in] nbDofs is the number of dofs of the joint.
\param[in] threadIndexInWarp selects the lane of the block data that this thread works on.
\param[out] motionMatrixW receives mWorldMotionMatrix of each dof.
\param[out] ISW receives mIsW of each dof.
\param[out] invSTISW receives mInvStIsT_x/y/z of dof i in invStIs[0][i], invStIs[1][i] and invStIs[2][i].
*/
static __device__ void loadSpatialVectorsForPropagationOutwards
(const PxgArticulationBlockDofData* PX_RESTRICT artiDofs, const PxU32 jointOffset, const PxU32 nbDofs, const PxU32 threadIndexInWarp,
 Cm::UnAlignedSpatialVector* PX_RESTRICT motionMatrixW, Cm::SpatialVectorF* ISW, InvStIs& invSTISW)
{
	//Note propagateAccelerationW cpu/gpu computes jAccel[i] = invSTISW[0][i] * j[0] + invSTISW[1][i] * j[1] + invSTISW[2][i] * j[2].
	//Note 	propagateAccelerationWNoJVelUpdate computes jAccel[i] = invStIsT[i].x * j[0] + invStIsT[i].y * j[1] + invStIsT[i].z * j[2]
	//We can conclude:  invStIsT[i].x has index [0][i], invStIsT[i].y has index [1][i] and invStIsT[i].z has index [2][i]
	for(PxU32 dofIndex = 0; dofIndex < nbDofs; ++dofIndex)
	{
		const PxgArticulationBlockDofData& dof = artiDofs[jointOffset + dofIndex];

		motionMatrixW[dofIndex] = loadSpatialVector(dof.mWorldMotionMatrix, threadIndexInWarp);
		invSTISW.invStIs[0][dofIndex] = dof.mInvStIsT_x[threadIndexInWarp];
		invSTISW.invStIs[1][dofIndex] = dof.mInvStIsT_y[threadIndexInWarp];
		invSTISW.invStIs[2][dofIndex] = dof.mInvStIsT_z[threadIndexInWarp];
		ISW[dofIndex] = loadSpatialVectorF(dof.mIsW, threadIndexInWarp);
	}
}

/**
\brief Compute the deltaQDot response of a joint dof to a unit joint impulse applied to that same joint dof.
\param[in] linkIndex is the index of the child link of the joint under consideration.
\param[in] dof is the joint dof that receives the test impulse.
\param[in] artiLinks has pre-computed values that will be used in the computation.
\param[in] artiDofs has pre-computed values that will be used in the computation.
\param[in] threadIndexInWarp is an index in range (0,32) describing the index inside a warp of 32 threads.
\return The deltaQDot response of the specified joint and dof to a test joint impulse applied to the specified joint and dof.
\note dof is in range (0,3) because articulation joints only support 3 degrees of freedom.
*/
static __device__ PxReal computeMimicJointSelfResponse
(const PxU32 linkIndex, const PxU32 dof, const PxgArticulationBlockLinkData* PX_RESTRICT artiLinks, const PxgArticulationBlockDofData* PX_RESTRICT artiDofs, const PxU32 threadIndexInWarp)
{
	const PxgArticulationBlockLinkData& childLink = artiLinks[linkIndex];

	const PxU32 parentLinkIndex = childLink.mParents[threadIndexInWarp];

	//childLinkPos - parentLinkPos
	const PxVec3 parentLinkToChildLink(
		childLink.mRw_x[threadIndexInWarp],
		childLink.mRw_y[threadIndexInWarp],
		childLink.mRw_z[threadIndexInWarp]);

	const PxU32 jointOffset = childLink.mJointOffset[threadIndexInWarp];
	const PxU8 dofCount = childLink.mDofs[threadIndexInWarp];

	//Unit test impulse on the requested dof.
	const PxReal unitImpulses[3][3] = {{1, 0, 0}, {0, 1, 0}, {0, 0, 1}};
	const PxReal* testJointImpulse = unitImpulses[dof];

	//Gather terms from the gpu data layout.
	Cm::UnAlignedSpatialVector motionMatrixW[3];
	Cm::SpatialVectorF IsInvSTISW[3];
	Cm::SpatialVectorF ISW[3];
	InvStIs invSTISW;
	loadSpatialVectorsForPropagationInwardsAndOutwards(artiDofs, jointOffset, dofCount, threadIndexInWarp, motionMatrixW, IsInvSTISW, ISW, invSTISW);

	//(1) Propagate joint impulse (and zero link impulse) to parent
	PxReal QMinusStZ[3] = { 0.f, 0.f, 0.f };
	const Cm::SpatialVectorF Zp = propagateImpulseW(
		parentLinkToChildLink,
		Cm::SpatialVectorF(PxVec3(0, 0, 0), PxVec3(0, 0, 0)),
		testJointImpulse, IsInvSTISW, motionMatrixW, dofCount,
		QMinusStZ);

	//(2) Get deltaV response for parent
	PxSpatialMatrix parentResponse;
	loadSpatialMatrix(artiLinks[parentLinkIndex].mSpatialResponseMatrix, threadIndexInWarp, parentResponse);
	const Cm::UnAlignedSpatialVector parentImpulse(Zp.top, Zp.bottom);
	const Cm::UnAlignedSpatialVector parentDeltaV = parentResponse * (-parentImpulse);
	Cm::SpatialVectorF deltaVParent = Cm::SpatialVectorF(parentDeltaV.top, parentDeltaV.bottom);

	//(3) Propagate parent deltaV and apply test impulse (encoded in QMinusStZ).
	PxReal jointDeltaQDot[3] = {0, 0, 0};
	propagateAccelerationW(
		parentLinkToChildLink, deltaVParent,
		invSTISW, motionMatrixW, ISW, QMinusStZ, dofCount,
		jointDeltaQDot);

	return jointDeltaQDot[dof];
}

/**
\brief Compute the deltaQDot response of a joint dof given a unit impulse applied to a different joint and dof.
\param[in] linkA is the link whose inbound joint receives the test impulse.
\param[in] dofA is the relevant dof of the inbound joint of linkA.
\param[in] linkB is the link whose inbound joint receives the deltaQDot arising from the unit impulse applied to the inbound joint of linkA.
\param[in] dofB is the relevant dof of the inbound joint of linkB.
\param[in] artiBlock provides the total dof count used to clear the cached QMinusSTZ values.
\param[in] artiLinks has pre-computed values that will be used in the computation.
\param[in] artiDofs has pre-computed values that will be used in the computation.
\param[in] artiPathToRootBitFields stores the bitfields describing the path to root for each link of an articulation.
\param[in] artiPathToRootBitFieldWordCount is the number of bitfields (with bitfield==PxU64) required to store the path to root for a link.
\param[in] threadIndexInWarp is an index in range (0,32) describing the index inside a warp of 32 threads.
\return The deltaQDot response of the specified joint and dof corresponding to linkB and dofB.
\note dofA and dofB are in range (0,3) because articulation joints only support 3 degrees of freedom.
\note artiDofs is not const because we cache temporary data in QMinusSTZ (mDeferredQstZ) owned by artiDofs.
The data will be cleared at the end of the computation because the buffer is used later in the solver
to accumulate link impulses from contact and constraint.
*/
static __device__ PxReal computeMimicJointCrossResponse
(const PxU32 linkA, const PxU32 dofA, const PxU32 linkB, const PxU32 dofB,
 const PxgArticulationBlockData& artiBlock,
 const PxgArticulationBlockLinkData* PX_RESTRICT artiLinks,
 PxgArticulationBlockDofData* PX_RESTRICT artiDofs,
 const PxgArticulationBitFieldData* PX_RESTRICT artiPathToRootBitFields, const PxU32 artiPathToRootBitFieldWordCount,
 const PxU32 threadIndexInWarp)
{
	//Compute the test impulse to apply the inbound joint of linkA.
	const PxReal testJointImpulses[3][3] = {{1, 0, 0}, {0, 1, 0}, {0, 0, 1}};

	//Zero QMinusSTZ before using it.
	for(PxU32 i = 0; i < artiBlock.mTotalDofs[threadIndexInWarp]; i++)
		artiDofs[i].mDeferredQstZ[threadIndexInWarp] = 0.0f;

	//(1) Iterate from linkA to root, visiting the highest link index first.
	//Zp carries the impulse propagated so far; it is first written when linkA itself is visited.
	Cm::SpatialVectorF Zp;
	for (PxI32 j = artiPathToRootBitFieldWordCount-1, bitOffset = (artiPathToRootBitFieldWordCount-1)*64; j >= 0; j--, bitOffset -= 64)
	{
		ArticulationBitField word = artiPathToRootBitFields[linkA*artiPathToRootBitFieldWordCount + j].bitField[threadIndexInWarp];
		while (word)
		{
			const PxU32 bitIndex = articulationHighestSetBit(word);
			const PxU32 linkIndex = bitIndex + bitOffset;
			word &= (~(1ull << bitIndex)); //Clear this bit

			//The root is included in pathToRoot but we cannot propagate an impulse upwards from the root.
			if(0 != linkIndex)
			{
				const PxReal* jointImpulse = NULL;
				Cm::SpatialVectorF linkImpulse(PxVec3(0,0,0), PxVec3(0,0,0));
				if(linkA == linkIndex)
				{
					//Propagate joint impulse to parent
					jointImpulse = testJointImpulses[dofA];
					linkImpulse = Cm::SpatialVectorF(PxVec3(0,0,0), PxVec3(0,0,0));
				}
				else
				{
					//Propagate link impulse to parent.
					jointImpulse = NULL;
					linkImpulse = Zp;
				}

				//childLinkPos - parentLinkPos
				const float parentLinkToChildLinkx = artiLinks[linkIndex].mRw_x[threadIndexInWarp];
				const float parentLinkToChildLinky = artiLinks[linkIndex].mRw_y[threadIndexInWarp];
				const float parentLinkToChildLinkz = artiLinks[linkIndex].mRw_z[threadIndexInWarp];

				const PxU32 jointOffset = artiLinks[linkIndex].mJointOffset[threadIndexInWarp];
				const PxU8 dofCount = artiLinks[linkIndex].mDofs[threadIndexInWarp];

				Cm::UnAlignedSpatialVector motionMatrixW[3];
				Cm::SpatialVectorF IsInvSTISW[3];
				loadSpatialVectorsForPropagationInwards(artiDofs, jointOffset, dofCount, threadIndexInWarp, motionMatrixW, IsInvSTISW);

				PxReal QMinusSTZ[3] = {0, 0, 0};
				Zp = propagateImpulseW(
						PxVec3(parentLinkToChildLinkx, parentLinkToChildLinky, parentLinkToChildLinkz),
						linkImpulse,
						jointImpulse, IsInvSTISW, motionMatrixW, dofCount,
						QMinusSTZ);

				//Copy QMinusSTZ to persistent array so we can use it again when we propagate deltaV downwards.
				for(PxU32 i = 0; i < dofCount; i++)
					artiDofs[jointOffset + i].mDeferredQstZ[threadIndexInWarp] = QMinusSTZ[i];
			}
		}
	}

	//(2) Get deltaV response for root
	 Cm::SpatialVectorF deltaVRoot;
	{
		PxSpatialMatrix mat;
		loadSpatialMatrix(artiLinks[0].mSpatialResponseMatrix, threadIndexInWarp, mat);
		const Cm::UnAlignedSpatialVector ZpUnaligned(Zp.top, Zp.bottom);
		const Cm::UnAlignedSpatialVector deltaVRootUnagligned = mat * (-ZpUnaligned);
		deltaVRoot = Cm::SpatialVectorF(deltaVRootUnagligned.top, deltaVRootUnagligned.bottom);
	}

	//(3) Propagate deltaVRoot from root to linkB, visiting the lowest link index first.
	//jointVelocity is overwritten at every visited link, so it ends up holding the values of the last link visited.
	PxReal jointVelocity[3] = {0, 0, 0};
	Cm::SpatialVectorF deltaVParent = deltaVRoot;
	for (PxI32 j = 0, bitOffset = 0; j < artiPathToRootBitFieldWordCount; j++, bitOffset += 64)
	{
		ArticulationBitField word = artiPathToRootBitFields[linkB*artiPathToRootBitFieldWordCount + j].bitField[threadIndexInWarp];
		while (word)
		{
			const PxU32 bitIndex = articulationLowestSetBit(word);
			const PxU32 linkIndex = bitIndex + bitOffset;
			word &= (~(1ull << bitIndex)); //Clear this bit

			//The root is included in pathToRoot but it is skipped here: deltaVRoot is already the root's deltaV.
			if(linkIndex != 0)
			{
				//childLinkPos - parentLinkPos
				const float parentLinkToChildLinkx = artiLinks[linkIndex].mRw_x[threadIndexInWarp];
				const float parentLinkToChildLinky = artiLinks[linkIndex].mRw_y[threadIndexInWarp];
				const float parentLinkToChildLinkz = artiLinks[linkIndex].mRw_z[threadIndexInWarp];

				const PxU32 jointOffset = artiLinks[linkIndex].mJointOffset[threadIndexInWarp];
				const PxU8 dofCount = artiLinks[linkIndex].mDofs[threadIndexInWarp];

				Cm::UnAlignedSpatialVector motionMatrixW[3];
				Cm::SpatialVectorF ISW[3];
				InvStIs invSTISW;
				loadSpatialVectorsForPropagationOutwards(artiDofs, jointOffset, dofCount, threadIndexInWarp, motionMatrixW, ISW, invSTISW);

				//Load the persistent QMinusSTZ that we cached when propagating upwards.
				//Links that were not visited on the way up from linkA still hold the zero written at the start.
				PxReal QMinusSTZ[3] = {0, 0, 0};
				for(PxU32 i = 0; i < dofCount; i++)
				{
					QMinusSTZ[i] = artiDofs[jointOffset + i].mDeferredQstZ[threadIndexInWarp];
				}

				PxReal childJointSpeed[3] = {0, 0, 0};
				deltaVParent = propagateAccelerationW(
					PxVec3(parentLinkToChildLinkx, parentLinkToChildLinky, parentLinkToChildLinkz), deltaVParent,
					invSTISW, motionMatrixW, ISW, QMinusSTZ, dofCount,
					childJointSpeed);

				jointVelocity[0] = childJointSpeed[0];
				jointVelocity[1] = childJointSpeed[1];
				jointVelocity[2] = childJointSpeed[2];
			}
		}
	}

	//Zero QMinusSTZ before exiting.
	for(PxU32 i = 0; i < artiBlock.mTotalDofs[threadIndexInWarp]; i++)
		artiDofs[i].mDeferredQstZ[threadIndexInWarp] = 0.0f;

	//Now pick out the dof associated with joint B.
	const PxReal r = jointVelocity[dofB];
	return r;
}

/**
\brief For every mimic joint, store the dof indices of the two coupled joints and the reciprocal effective inertia of the coupling.
\param[in] artiBlock provides the number of mimic joints and is forwarded to the cross-response computation.
\param[in] artiLinks has pre-computed values that will be used in the computation.
\param[in] artiPathToRootBitFields stores the bitfields describing the path to root for each link of an articulation.
\param[in] artiPathToRootBitFieldWordCount is the number of bitfields required to store the path to root for a link.
\param[in,out] artiDofs has pre-computed values; it is not const because the cross-response computation caches temporary data in it.
\param[in,out] mimicJoints is the mimic joint array; mInternalData of each entry is written.
\param[in] threadIndexInWarp selects the lane of the block data that this thread works on.
*/
static  __device__ void setupInternalMimicJointConstraints
(const PxgArticulationBlockData& artiBlock,
 const PxgArticulationBlockLinkData* PX_RESTRICT artiLinks,
 const PxgArticulationBitFieldData* PX_RESTRICT artiPathToRootBitFields, const PxU32 artiPathToRootBitFieldWordCount,
 PxgArticulationBlockDofData* PX_RESTRICT artiDofs,
 PxgArticulationBlockMimicJointData* PX_RESTRICT mimicJoints,
 const PxU32 threadIndexInWarp)
{
	const PxU32 mimicJointCount = artiBlock.mNumMimicJoints[threadIndexInWarp];
	for(PxU32 mimicIndex = 0; mimicIndex < mimicJointCount; ++mimicIndex)
	{
		PxgArticulationBlockMimicJointData& mimicJoint = mimicJoints[mimicIndex];

		//The coupled joints are the inbound joints of linkA and linkB.
		const PxU32 linkA = mimicJoint.mLinkA[threadIndexInWarp];
		const PxU32 linkB = mimicJoint.mLinkB[threadIndexInWarp];

		//Translate the coupled axes into dof indices and store them.
		const PxU32 axisA = mimicJoint.mAxisA[threadIndexInWarp];
		const PxU32 axisB = mimicJoint.mAxisB[threadIndexInWarp];
		const PxU32 dofA = artiLinks[linkA].mInvDofIds[axisA][threadIndexInWarp];
		const PxU32 dofB = artiLinks[linkB].mInvDofIds[axisB][threadIndexInWarp];
		mimicJoint.mInternalData.mDofA[threadIndexInWarp] = dofA;
		mimicJoint.mInternalData.mDofB[threadIndexInWarp] = dofB;

		//Self responses of the two joints.
		const PxReal rAA = computeMimicJointSelfResponse(linkA, dofA, artiLinks, artiDofs, threadIndexInWarp);
		const PxReal rBB = computeMimicJointSelfResponse(linkB, dofB, artiLinks, artiDofs, threadIndexInWarp);

		//Response of joint B to an impulse on joint A.
		const PxReal rBA = computeMimicJointCrossResponse(
			linkA, dofA, linkB, dofB,
			artiBlock, artiLinks, artiDofs, artiPathToRootBitFields, artiPathToRootBitFieldWordCount,
			threadIndexInWarp);

		//Response of joint A to an impulse on joint B.
		const PxReal rAB = computeMimicJointCrossResponse(
			linkB, dofB, linkA, dofA,
			artiBlock, artiLinks, artiDofs, artiPathToRootBitFields, artiPathToRootBitFieldWordCount,
			threadIndexInWarp);

		const PxReal gearRatio = mimicJoint.mGearRatio[threadIndexInWarp];
		mimicJoint.mInternalData.recipEffectiveInertia[threadIndexInWarp] = computeRecipMimicJointEffectiveInertia(rAA, rAB, rBB, rBA, gearRatio);
	}
}


/**
\brief Compute the velocity change of two arbitrary links of the same articulation for a pair of impulses.
Both impulses are propagated up to the common ancestor of the two links, the response of the common ancestor
is computed, and the resulting velocity change is propagated back down each branch.
\param[in] linkID0_ is the index of the first link.
\param[in] linkID1_ is the index of the second link.
\param[in] impulse0 is the impulse applied to the first link.
\param[in] impulse1 is the impulse applied to the second link.
\param[out] deltaV0 receives the velocity change of the first link.
\param[out] deltaV1 receives the velocity change of the second link.
\param[in] artiBlock provides the number of links.
\param[in] linkData is the per-link data.
\param[in,out] dofData is the per-dof data; mTmpQstZ of every dof is zeroed here before the propagation helpers are called.
\param[in] threadIndexInWarp selects the lane of the block data that this thread works on.
\note The visited links are recorded in a local array of DY_ARTICULATION_TENDON_MAX_SIZE entries; no bounds check is performed.
*/
static __device__ void getImpulseSelfResponseSlow(
	const PxU32 linkID0_,
	const PxU32 linkID1_,
	const Cm::UnAlignedSpatialVector& impulse0,
	const Cm::UnAlignedSpatialVector& impulse1,
	Cm::UnAlignedSpatialVector& deltaV0,
	Cm::UnAlignedSpatialVector& deltaV1,
	const PxgArticulationBlockData& artiBlock,
	const PxgArticulationBlockLinkData* linkData,
	PxgArticulationBlockDofData* dofData,
	const PxU32 threadIndexInWarp)
{
	//Links visited on the way up: branch 0 occupies [0, branch0End), branch 1 occupies [branch0End, branch1End).
	PxU32 visited[DY_ARTICULATION_TENDON_MAX_SIZE];

	//Find the common ancestor: repeatedly move the link with the larger index to its parent.
	PxU32 cursor0 = linkID0_;
	PxU32 cursor1 = linkID1_;
	while (cursor0 != cursor1)
	{
		if (cursor0 < cursor1)
			cursor1 = linkData[cursor1].mParents[threadIndexInWarp];
		else
			cursor0 = linkData[cursor0].mParents[threadIndexInWarp];
	}
	const PxU32 common = cursor0;

	Cm::UnAlignedSpatialVector Z0(-impulse0.top, -impulse0.bottom);
	Cm::UnAlignedSpatialVector Z1(-impulse1.top, -impulse1.bottom);

	//initialize tmp qstz to be zero
	const PxU32 numLinks = artiBlock.mNumLinks[threadIndexInWarp];
	for (PxU32 linkIndex = 0; linkIndex < numLinks; ++linkIndex)
	{
		const PxU32 jointOffset = linkData[linkIndex].mJointOffset[threadIndexInWarp];
		const PxU32 dofCount = linkData[linkIndex].mDofs[threadIndexInWarp];
		PxgArticulationBlockDofData* linkDofs = &dofData[jointOffset];
		for (PxU32 dofIndex = 0; dofIndex < dofCount; ++dofIndex)
		{
			linkDofs[dofIndex].mTmpQstZ[threadIndexInWarp] = 0.f;
		}
	}

	//Propagate the first impulse from link 0 up to the common ancestor.
	PxU32 branch0End = 0;
	for (PxU32 id = linkID0_; id != common; id = linkData[id].mParents[threadIndexInWarp])
	{
		const PxgArticulationBlockLinkData& link = linkData[id];
		const PxU32 jointOffset = link.mJointOffset[threadIndexInWarp];
		const PxU32 dofCount = link.mDofs[threadIndexInWarp];
		const PxVec3 childToParent(link.mRw_x[threadIndexInWarp], link.mRw_y[threadIndexInWarp], link.mRw_z[threadIndexInWarp]);

		Z0 = propagateImpulseWTemp(childToParent, dofData + jointOffset, Z0, dofCount, threadIndexInWarp);

		visited[branch0End++] = id;
	}

	//Propagate the second impulse from link 1 up to the common ancestor.
	PxU32 branch1End = branch0End;
	for (PxU32 id = linkID1_; id != common; id = linkData[id].mParents[threadIndexInWarp])
	{
		const PxgArticulationBlockLinkData& link = linkData[id];
		const PxU32 jointOffset = link.mJointOffset[threadIndexInWarp];
		const PxU32 dofCount = link.mDofs[threadIndexInWarp];
		const PxVec3 childToParent(link.mRw_x[threadIndexInWarp], link.mRw_y[threadIndexInWarp], link.mRw_z[threadIndexInWarp]);

		Z1 = propagateImpulseWTemp(childToParent, dofData + jointOffset, Z1, dofCount, threadIndexInWarp);

		visited[branch1End++] = id;
	}

	//Velocity change of the common ancestor for the sum of both propagated impulses.
	Cm::UnAlignedSpatialVector ZZ = Z0 + Z1;

	PxSpatialMatrix spatialResponse;
	loadSpatialMatrix(linkData[common].mSpatialResponseMatrix, threadIndexInWarp, spatialResponse);

	const Cm::UnAlignedSpatialVector v = (spatialResponse * (-ZZ));

	//Walk back down branch 1 in reverse visiting order.
	Cm::UnAlignedSpatialVector dv1 = v;
	for (PxU32 index = branch1End; index > branch0End;)
	{
		--index;

		const PxgArticulationBlockLinkData& link = linkData[visited[index]];
		const PxU32 jointOffset = link.mJointOffset[threadIndexInWarp];
		const PxU32 dofCount = link.mDofs[threadIndexInWarp];
		const PxVec3 childToParent(link.mRw_x[threadIndexInWarp], link.mRw_y[threadIndexInWarp], link.mRw_z[threadIndexInWarp]);

		dv1 = propagateAccelerationWTemp(childToParent, dofData + jointOffset, dv1, dofCount, threadIndexInWarp);
	}

	//Walk back down branch 0 in reverse visiting order.
	Cm::UnAlignedSpatialVector dv0 = v;
	for (PxU32 index = branch0End; index > 0;)
	{
		--index;

		const PxgArticulationBlockLinkData& link = linkData[visited[index]];
		const PxU32 jointOffset = link.mJointOffset[threadIndexInWarp];
		const PxU32 dofCount = link.mDofs[threadIndexInWarp];
		const PxVec3 childToParent(link.mRw_x[threadIndexInWarp], link.mRw_y[threadIndexInWarp], link.mRw_z[threadIndexInWarp]);

		dv0 = propagateAccelerationWTemp(childToParent, dofData + jointOffset, dv0, dofCount, threadIndexInWarp);
	}

	deltaV0.bottom = dv0.bottom;
	deltaV0.top = dv0.top;

	deltaV1.bottom = dv1.bottom;
	deltaV1.top = dv1.top;
}

// Computes the velocity responses (deltaV0, deltaV1) at two links of one articulation
// to the impulse pair (impulse0, impulse1).
//  - When link1's parent is linkId0, the response is evaluated through the pairwise
//    overload using link0's spatial response matrix and link1's joint data.
//  - For any other pair of links the work is forwarded to getImpulseSelfResponseSlow.
// threadIndexInWarp selects this thread's entry in the block-format link/dof data.
static __device__ void getImpulseSelfResponse(
	const PxU32 linkId0,
	const PxU32 linkId1,
	PxgArticulationBlockLinkData& link0,
	PxgArticulationBlockLinkData& link1,
	const Cm::UnAlignedSpatialVector& impulse0,
	const Cm::UnAlignedSpatialVector& impulse1,
	Cm::UnAlignedSpatialVector& deltaV0,
	Cm::UnAlignedSpatialVector& deltaV1,
	const PxgArticulationBlockData& artiBlock,
	const PxgArticulationBlockLinkData* linkData,
	PxgArticulationBlockDofData* dofData,
	const PxU32 threadIndexInWarp)
{
	// Not a direct parent/child pair: take the general path and leave.
	if (link1.mParents[threadIndexInWarp] != linkId0)
	{
		getImpulseSelfResponseSlow(linkId0, linkId1, impulse0, impulse1, deltaV0, deltaV1, artiBlock, linkData, dofData, threadIndexInWarp);
		return;
	}

	// Joint data of the child link (link1).
	const PxU32 childJointOffset = link1.mJointOffset[threadIndexInWarp];
	const PxU32 childDofCount = link1.mDofs[threadIndexInWarp];

	// Offset vector stored on the child link (mRw_x/y/z).
	const PxVec3 childToParent(
		link1.mRw_x[threadIndexInWarp],
		link1.mRw_y[threadIndexInWarp],
		link1.mRw_z[threadIndexInWarp]);

	// Spatial response matrix of the parent link (link0).
	PxSpatialMatrix responseOfParent;
	loadSpatialMatrix(link0.mSpatialResponseMatrix, threadIndexInWarp, responseOfParent);

	getImpulseSelfResponse(impulse0, impulse1, deltaV0, deltaV1, dofData + childJointOffset, childToParent, responseOfParent, childDofCount, threadIndexInWarp);
}

// Builds the internal constraint rows for every spatial tendon of one articulation.
//
// For each tendon the attachments are walked as a tree: attachment 0 is the root,
// each attachment carries a 64-bit mask of its children (mChildrens) and the index
// of its parent (mParents). The walk keeps the pending attachments in a bit mask
// (bitStack) and always visits the highest set bit next. Attachments that have
// children only accumulate length; attachments without children (leaves) emit one
// constraint between the root attachment's link and the leaf attachment's link.
//
// artiBlock             - per-articulation block data (tendon count is read from it).
// artiLinks             - link block data of this articulation.
// artiDofs              - dof block data of this articulation (forwarded to the response query).
// artiTendons           - spatial tendon block data, one entry per tendon.
// artiTendonConstraints - output constraints, maxAttachments slots per tendon.
// artiAttachments       - attachment block data, maxAttachments slots per tendon.
// maxAttachments        - stride (in entries) between consecutive tendons.
// stepDt                - time step used for the spring/damper coefficients.
// dt, invDt             - not referenced in this function body.
// isTGSSolver           - selects the impulse multiplier convention (1 for TGS, 1 - x otherwise).
// threadIndexInWarp     - selects this thread's entry in the block-format data.
static __device__ void setupInternalSpatialTendonConstraints(
	PxgArticulationBlockData& artiBlock,
	PxgArticulationBlockLinkData* PX_RESTRICT artiLinks,
	PxgArticulationBlockDofData* PX_RESTRICT artiDofs,
	PxgArticulationBlockSpatialTendonData* PX_RESTRICT artiTendons,
	PxgArticulationInternalTendonConstraintData* PX_RESTRICT artiTendonConstraints,
	PxgArticulationBlockAttachmentData* PX_RESTRICT artiAttachments,
	const PxU32 maxAttachments, const PxReal stepDt, const PxReal dt,
	const PxReal invDt, bool isTGSSolver, const PxU32 threadIndexInWarp)
{
	const PxU32 numTendons = artiBlock.mNumSpatialTendons[threadIndexInWarp];


	// Accumulated tendon length per attachment, indexed by attachment index within the tendon.
	PxReal accumLength[DY_ARTICULATION_TENDON_MAX_SIZE];

	for (PxU32 i = 0; i < numTendons; ++i)
	{
		PxgArticulationBlockSpatialTendonData& tendonData = artiTendons[i];
		const PxReal stiffness = tendonData.mStiffness[threadIndexInWarp];
		const PxReal damping = tendonData.mDamping[threadIndexInWarp];
		const PxReal limitStiffness = tendonData.mLimitStiffness[threadIndexInWarp];
		const PxReal offset = tendonData.mOffset[threadIndexInWarp];

		// Attachments and constraints of tendon i both start at i * maxAttachments.
		PxgArticulationBlockAttachmentData* attachmentBlock = &artiAttachments[i * maxAttachments];
		PxgArticulationInternalTendonConstraintData* constraintBlock = &artiTendonConstraints[i * maxAttachments];

		// Seed the traversal with the children of the root attachment (index 0).
		const PxReal coefficient = attachmentBlock[0].mCoefficient[threadIndexInWarp];
		PxU64 bitStack = attachmentBlock[0].mChildrens[threadIndexInWarp];
		PxU32 stackCount = __popcll(bitStack);

		PxU32 parent = 0;

		PxU32 numConstraints = 0;

		// The root starts with the tendon offset scaled by the root coefficient.
		accumLength[parent] = offset * coefficient;

		// World-space data of the root attachment; reused for every constraint of this tendon.
		PxgArticulationBlockAttachmentData& rAttachmentData = attachmentBlock[parent];
		const PxU32 rAttachmentLinkIndex = rAttachmentData.mLinkIndex[threadIndexInWarp];
		PxgArticulationBlockLinkData& rAttachmentLink = artiLinks[rAttachmentLinkIndex];
		const PxTransform rAttachmentLinkBody2World = loadSpatialTransform(rAttachmentLink.mAccumulatedPose, threadIndexInWarp);
		const PxVec3 rRa = rAttachmentLinkBody2World.q.rotate(rAttachmentData.mRelativeOffset[threadIndexInWarp]);
		const PxVec3 rAttachPoint = rAttachmentLinkBody2World.p + rRa;
		// Root axis and root raXn; (re)computed whenever an attachment whose parent is the root is visited.
		PxVec3 rAxis, rRaXn;

		// Highest set bit of the pending mask is the first attachment to visit.
		// The value is only consumed inside the loop, which does not run when the mask is empty.
		PxU32 child = 63 - __clzll(bitStack);

		while (stackCount != 0)
		{
			stackCount--;

			PxgArticulationBlockAttachmentData& attachmentData = attachmentBlock[child];

			const PxU32 linkInd = attachmentData.mLinkIndex[threadIndexInWarp];
			PxgArticulationBlockLinkData& cLink = artiLinks[linkInd];

			const PxReal cfm = cLink.mCfm[threadIndexInWarp];

			// World-space attachment point on the current (child) attachment's link.
			const PxTransform cBody2World = loadSpatialTransform(cLink.mAccumulatedPose, threadIndexInWarp);
			const PxVec3 rb = cBody2World.q.rotate(attachmentData.mRelativeOffset[threadIndexInWarp]);
			const PxVec3 cAttachPoint = cBody2World.p + rb;


			PxReal distance = 0.f;
			PxVec3 dif(0.f);
			//if the current attachment's parent is the root, we need to compute root axis and root raXn
			if (parent == 0)
			{
				dif = rAttachPoint - cAttachPoint;
				const PxReal distanceSq = dif.magnitudeSquared();
				distance = PxSqrt(distanceSq);

				// Segments of length <= 0.001 get a zero axis instead of a normalized direction.
				rAxis = distance > 0.001f ? (dif / distance) : PxVec3(0.f);
				rRaXn = rRa.cross(rAxis);

			}
			else
			{
				// Otherwise only the segment length from the parent attachment to this one is needed.
				PxgArticulationBlockAttachmentData& pAttachmentData = attachmentBlock[parent];
				const PxU32 pLinkInd = pAttachmentData.mLinkIndex[threadIndexInWarp];
				PxgArticulationBlockLinkData& pLink = artiLinks[pLinkInd];

				const PxTransform pBody2World = loadSpatialTransform(pLink.mAccumulatedPose, threadIndexInWarp);

				const PxVec3 ra = pBody2World.q.rotate(pAttachmentData.mRelativeOffset[threadIndexInWarp]);

				const PxVec3 pAttachPoint = pBody2World.p + ra;

				dif = pAttachPoint - cAttachPoint;
				const PxReal distanceSq = dif.magnitudeSquared();
				distance = PxSqrt(distanceSq);
			}

			// Length accumulated up to this attachment: weighted segment length plus the parent's accumulated length.
			const PxReal u = distance * attachmentData.mCoefficient[threadIndexInWarp] + accumLength[parent];

			PxU64 children = attachmentData.mChildrens[threadIndexInWarp];

			if (children)
			{
				// Interior attachment: schedule its children and remember the length for them.
				const PxU32 numChildrens = __popcll(children);
				stackCount += numChildrens;

				accumLength[child] = u;
			}
			else
			{
				// Leaf attachment: emit a constraint between the root link and this link.


				const PxVec3 axis = distance > 0.001f ? (dif / distance) : PxVec3(0.f);

				const PxVec3 rbXn = rb.cross(axis);

				Cm::UnAlignedSpatialVector axis0(rAxis, rRaXn);
				Cm::UnAlignedSpatialVector axis1(-axis, -rbXn);

				Cm::UnAlignedSpatialVector deltaV0, deltaV1;

				// Velocity change at both links caused by applying axis0/axis1 as impulses.
				getImpulseSelfResponse(rAttachmentLinkIndex, linkInd, rAttachmentLink, cLink, axis0, axis1,
					deltaV0, deltaV1, artiBlock, artiLinks, artiDofs, threadIndexInWarp);


				const PxReal r0 = deltaV0.bottom.dot(rAxis) + deltaV0.top.dot(rRaXn);
				const PxReal r1 = deltaV1.bottom.dot(axis) + deltaV1.top.dot(rbXn);

				const PxReal unitResponse = r0 - r1;

				// Responses at or below DY_ARTICULATION_MIN_RESPONSE yield a zero reciprocal.
				const PxReal recipResponse = unitResponse > DY_ARTICULATION_MIN_RESPONSE ? (1.0f / (unitResponse + cfm))  : 0.0f;

				PxgArticulationInternalTendonConstraintData& constraint = constraintBlock[numConstraints++];

				//constraint.mDeltaVA[threadIndexInWarp] = r0;
				//storeSpatialVector(constraint.mDeltaVA, deltaV0, threadIndexInWarp);
				//storeSpatialVector(constraint.mDeltaVB, deltaV1, threadIndexInWarp);
				// mRow1 stores the negated axis1, i.e. (axis, rbXn).
				storeSpatialVector(constraint.mRow0, axis0, threadIndexInWarp);
				storeSpatialVector(constraint.mRow1, -axis1, threadIndexInWarp);

				//storeSpatialVector(constraint.mDeltaVB, deltaV1, threadIndexInWarp);
				constraint.mDeltaVA[threadIndexInWarp] = r0;

				//constraint.mResponse[threadIndexInWarp] = unitResponse;
				constraint.mRecipResponse[threadIndexInWarp] = recipResponse;

				// Spring/damper terms for the tendon (a) and for its limits (a2).
				const PxReal a = stepDt * (stepDt*stiffness + damping);

				const PxReal a2 = stepDt * (stepDt* limitStiffness + damping);

				// x and x2 are forced to zero when the response is not strictly positive.
				const PxReal x = unitResponse > 0.f ? 1.0f / (1.0f + a * unitResponse) : 0.f;

				const PxReal x2 = unitResponse > 0.f ? 1.0f / (1.0f + a2 * unitResponse) : 0.f;

				constraint.mVelMultiplier[threadIndexInWarp] = -x * a;
				constraint.mImpulseMultiplier[threadIndexInWarp] = isTGSSolver ? 1.f : 1.f - x;
				constraint.mBiasCoefficient[threadIndexInWarp] = (-stiffness * x * stepDt);
				constraint.mAppliedForce[threadIndexInWarp] = 0.f;

				constraint.mAccumulatedLength[threadIndexInWarp] = u;
				constraint.mLink0[threadIndexInWarp] = rAttachmentLinkIndex;
				constraint.mLink1[threadIndexInWarp] = linkInd;

				constraint.mLimitImpulseMultiplier[threadIndexInWarp] = isTGSSolver ? 1.f : 1.f - x2;
				constraint.mLimitBiasCoefficient[threadIndexInWarp] = (-limitStiffness * x2 * stepDt);
				constraint.mLimitAppliedForce[threadIndexInWarp] = 0.f;

				// Rest distance and limits are copied from the leaf attachment.
				constraint.mRestDistance[threadIndexInWarp] = attachmentData.mRestDistance[threadIndexInWarp];
				constraint.mLowLimit[threadIndexInWarp] = attachmentData.mLowLimit[threadIndexInWarp];
				constraint.mHighLimit[threadIndexInWarp] = attachmentData.mHighLimit[threadIndexInWarp];
			}

			if (stackCount > 0)
			{
				//clear child
				bitStack &= (~(1ull << child));

				//add on children to the stack
				bitStack |= children;

				//pop up the next child from stack
				child = 63 - __clzll(bitStack);

				//assign the parent with next child's parent
				parent = attachmentBlock[child].mParents[threadIndexInWarp];
			}

		}

		// Number of leaf constraints emitted for this tendon.
		tendonData.mNumConstraints[threadIndexInWarp] = numConstraints;

	}
}


// Builds the internal constraint rows for every fixed tendon of one articulation.
//
// For each tendon the tendon joints are walked as a tree: tendon joint 0 is the root,
// each tendon joint carries a 64-bit mask of its children (mChildrens) and the index
// of its parent (mParents). The walk keeps the pending tendon joints in a bit mask
// (bitStack) and always visits the highest set bit next. Every visited tendon joint
// emits one constraint between the root tendon joint's link and its own link, and
// records the constraint slot it used in mConstraintId.
//
// artiBlock             - per-articulation block data (tendon count is read from it).
// artiLinks             - link block data of this articulation.
// artiDofs              - dof block data of this articulation.
// artiTendons           - fixed tendon block data, one entry per tendon.
// artiTendonConstraints - output constraints, maxTendonJoints slots per tendon.
// artiTendonJoints      - tendon joint block data, maxTendonJoints slots per tendon.
// maxTendonJoints       - stride (in entries) between consecutive tendons.
// stepDt                - time step used for the spring/damper coefficients.
// dt, invDt             - not referenced in this function body.
// isTGSSolver           - selects the impulse multiplier convention (1 for TGS, 1 - x otherwise).
// threadIndexInWarp     - selects this thread's entry in the block-format data.
static __device__ void setupInternalFixedTendonConstraints(
	PxgArticulationBlockData& artiBlock,
	PxgArticulationBlockLinkData* PX_RESTRICT artiLinks,
	PxgArticulationBlockDofData* PX_RESTRICT artiDofs,
	PxgArticulationBlockFixedTendonData* PX_RESTRICT artiTendons,
	PxgArticulationInternalTendonConstraintData* PX_RESTRICT artiTendonConstraints,
	PxgArticulationBlockTendonJointData* PX_RESTRICT artiTendonJoints,
	const PxU32 maxTendonJoints, const PxReal stepDt, const PxReal dt,
	const PxReal invDt, bool isTGSSolver, const PxU32 threadIndexInWarp)
{
	const PxU32 numTendons = artiBlock.mNumFixedTendons[threadIndexInWarp];

	for (PxU32 i = 0; i < numTendons; ++i)
	{
		PxgArticulationBlockFixedTendonData& tendonData = artiTendons[i];

		const PxReal stiffness = tendonData.mStiffness[threadIndexInWarp];
		const PxReal damping = tendonData.mDamping[threadIndexInWarp];
		const PxReal limitStiffness = tendonData.mLimitStiffness[threadIndexInWarp];

		// Tendon joints and constraints of tendon i both start at i * maxTendonJoints.
		PxgArticulationBlockTendonJointData* tendonJointBlock = &artiTendonJoints[i * maxTendonJoints];
		PxgArticulationInternalTendonConstraintData* constraintBlock = &artiTendonConstraints[i * maxTendonJoints];

		// Seed the traversal with the children of the root tendon joint (index 0).
		PxU64 bitStack = tendonJointBlock[0].mChildrens[threadIndexInWarp];

		PxU32 stackCount = __popcll(bitStack);

		PxU32 parent = 0;

		PxU32 numConstraints = 0;

		// Link and pose of the root tendon joint; reused for every constraint of this tendon.
		PxgArticulationBlockTendonJointData& sTendonJointData = tendonJointBlock[parent];
		const PxU32 sLinkIndex = sTendonJointData.mLinkIndex[threadIndexInWarp];
		PxgArticulationBlockLinkData& sLink = artiLinks[sLinkIndex];
		const PxTransform sBody2World = loadSpatialTransform(sLink.mAccumulatedPose, threadIndexInWarp);

		// Root-side axis and raXn; (re)computed whenever a tendon joint whose parent is the root is visited.
		PxVec3 sAxis;
		PxVec3 sRaXn;


		// Highest set bit of the pending mask is the first tendon joint to visit.
		// The value is only consumed inside the loop, which does not run when the mask is empty.
		PxU32 child = 63 - __clzll(bitStack);

		while (stackCount != 0)
		{
			stackCount--;

			PxgArticulationBlockTendonJointData& tjData = tendonJointBlock[child];

			const PxU32 tjAxis = tjData.mAxis[threadIndexInWarp];

			const PxU32 cLinkInd = tjData.mLinkIndex[threadIndexInWarp];
			PxgArticulationBlockLinkData& cLink = artiLinks[cLinkInd];

			// Use the larger cfm of the tendon joint's link and that link's parent.
			const PxU32 parentLink = cLink.mParents[threadIndexInWarp];
			const PxReal cfm = PxMax(artiLinks[parentLink].mCfm[threadIndexInWarp], cLink.mCfm[threadIndexInWarp]);

			// World-space motion vector of the dof that this tendon joint's axis maps to.
			const PxU32 jointOffset = cLink.mJointOffset[threadIndexInWarp];
			const PxU8 dofIndex = cLink.mInvDofIds[tjAxis][threadIndexInWarp];
			PxgArticulationBlockDofData& dofData = artiDofs[jointOffset + dofIndex];

			const Cm::UnAlignedSpatialVector worldMotionVector = loadSpatialVector(dofData.mWorldMotionMatrix, threadIndexInWarp);

			//if the current tendon joint's parent is the root, we need to compute root axis and root raXn
			if (parent == 0)
			{

				// Axes below eX (the angular axes of PxArticulationAxis) have no linear part.
				if (tjAxis < PxArticulationAxis::eX)
				{

					sAxis = PxVec3(0.f);
					sRaXn = worldMotionVector.top;
				}
				else
				{
					// Linear axis: the angular part is the lever arm from the root link to the
					// joint's parent frame crossed with the axis.
					const float4 p = cLink.mParentPose.p[threadIndexInWarp];
					const PxQuat q = reinterpret_cast<PxQuat&>(cLink.mParentPose.q[threadIndexInWarp]);
					const PxTransform parentPose(PxVec3(p.x, p.y, p.z), q);
					const PxTransform cA2w = sBody2World.transform(parentPose);
					const PxVec3 ang0 = (cA2w.p - sBody2World.p).cross(worldMotionVector.bottom);
					sAxis = worldMotionVector.bottom;
					sRaXn = ang0;
				}
			}

			PxU64 children = tjData.mChildrens[threadIndexInWarp];

			// Child-side axis and rbXn, built the same way from the child link's own frame.
			PxVec3 axis, rbXn;
			if (tjAxis < PxArticulationAxis::eX)
			{
				axis = PxVec3(0.f);
				rbXn = worldMotionVector.top;
			}
			else
			{
				const PxTransform cBody2World = loadSpatialTransform(cLink.mAccumulatedPose, threadIndexInWarp);

				const float4 p = cLink.mChildPose.p[threadIndexInWarp];
				const PxQuat q = reinterpret_cast<PxQuat&>(cLink.mChildPose.q[threadIndexInWarp]);
				const PxTransform childPose(PxVec3(p.x, p.y, p.z), q);
				const PxTransform cB2w = cBody2World.transform(childPose);

				const PxVec3 tAxis = worldMotionVector.bottom;
				axis = tAxis;
				rbXn = (cB2w.p - cBody2World.p).cross(axis);
			}


			//create constraint
			Cm::UnAlignedSpatialVector axis0(sAxis, sRaXn);
			Cm::UnAlignedSpatialVector axis1(-axis, -rbXn);

			Cm::UnAlignedSpatialVector deltaV0, deltaV1;

			// Velocity change at both links caused by applying axis0/axis1 as impulses.
			getImpulseSelfResponse(sLinkIndex, cLinkInd, sLink, cLink, axis0, axis1,
				deltaV0, deltaV1, artiBlock, artiLinks, artiDofs, threadIndexInWarp);

			const PxReal r0 = deltaV0.bottom.dot(sAxis) + deltaV0.top.dot(sRaXn);
			const PxReal r1 = deltaV1.bottom.dot(axis) + deltaV1.top.dot(rbXn);

			const PxReal unitResponse = r0 - r1;

			// Guard the reciprocal exactly like the spatial tendon setup does: a degenerate
			// (zero / tiny) response would otherwise divide by zero when cfm is zero and store
			// inf in mRecipResponse.
			const PxReal recipResponse = unitResponse > DY_ARTICULATION_MIN_RESPONSE ? (1.0f / (unitResponse + cfm)) : 0.0f;

			PxgArticulationInternalTendonConstraintData& constraint = constraintBlock[numConstraints];

			// mRow1 stores the negated axis1, i.e. (axis, rbXn).
			constraint.mDeltaVA[threadIndexInWarp] = r0;
			storeSpatialVector(constraint.mDeltaVB, deltaV1, threadIndexInWarp);
			storeSpatialVector(constraint.mRow0, axis0, threadIndexInWarp);
			storeSpatialVector(constraint.mRow1, -axis1, threadIndexInWarp);

			constraint.mRecipResponse[threadIndexInWarp] = recipResponse;

			// Spring/damper terms for the tendon (a) and for its limits (a2).
			const PxReal a = stepDt * (stepDt*stiffness + damping);

			const PxReal a2 = stepDt * (stepDt*limitStiffness + damping);

			// x and x2 are forced to zero when the response is not strictly positive.
			const PxReal x = unitResponse > 0.f ? 1.0f / (1.0f + a * unitResponse) : 0.f;

			const PxReal x2 = unitResponse > 0.f ? 1.0f / (1.0f + a2 * unitResponse) : 0.f;

			constraint.mVelMultiplier[threadIndexInWarp] = -x * a;
			constraint.mImpulseMultiplier[threadIndexInWarp] = isTGSSolver ? 1.f : 1.f - x;
			constraint.mBiasCoefficient[threadIndexInWarp] = (-stiffness * x * stepDt);
			constraint.mAppliedForce[threadIndexInWarp] = 0.f;

			constraint.mLink0[threadIndexInWarp] = sLinkIndex;
			constraint.mLink1[threadIndexInWarp] = cLinkInd;

			constraint.mLimitImpulseMultiplier[threadIndexInWarp] = isTGSSolver ? 1.f : 1.f - x2;
			constraint.mLimitBiasCoefficient[threadIndexInWarp] = (-limitStiffness * x2 * stepDt);
			constraint.mLimitAppliedForce[threadIndexInWarp] = 0.f;


			//assign constraint index to tendon joint data
			tjData.mConstraintId[threadIndexInWarp] = numConstraints;
			numConstraints++;

			// Schedule this tendon joint's children, if any.
			if (children)
			{
				const PxU32 numChildrens = __popcll(children);
				stackCount += numChildrens;
			}

			if (stackCount > 0)
			{
				//clear child
				bitStack &= (~(1ull << child));

				//add on children to the stack
				bitStack |= children;

				//pop up the next child from stack
				child = 63 - __clzll(bitStack);

				//assign the parent with next child's parent
				parent = tendonJointBlock[child].mParents[threadIndexInWarp];
			}

		}

		// Number of constraints emitted for this tendon.
		tendonData.mNumConstraints[threadIndexInWarp] = numConstraints;

	}
}

// Kernel: one thread per articulation. Sets up, in this order, the internal joint
// constraints, the spatial tendon constraints, the fixed tendon constraints and the
// mimic joint constraints of that articulation.
//
// Indexing as used below: threadIdx.x (masked to WARP_SIZE) is the entry within a
// block-format record, threadIdx.y is the warp within the CUDA block, and
// blockIdx.x * blockDim.y + threadIdx.y selects the block-format record.
// Threads whose flat index is >= scDesc->nbArticulations do nothing.
extern "C" __global__ void setupInternalConstraintLaunch1T(
	PxgArticulationCoreDesc* scDesc, const PxReal stepDt, const PxReal dt,
	const PxReal invDt, bool isTGSSolver)
{
	const PxU32 lane = threadIdx.x & (WARP_SIZE - 1);
	const PxU32 blockSlot = blockIdx.x * blockDim.y + threadIdx.y;
	const PxU32 articulationIndex = blockSlot * WARP_SIZE + lane;

	// Nothing to do for threads past the last articulation.
	if (articulationIndex >= scDesc->nbArticulations)
		return;

	const PxU32 linkStride = scDesc->mMaxLinksPerArticulation;
	const PxU32 dofStride = scDesc->mMaxDofsPerArticulation;

	PxgArticulationBlockData& block = scDesc->mArticulationBlocks[blockSlot];
	PxgArticulationBlockLinkData* links = scDesc->mArticulationLinkBlocks + blockSlot * linkStride;
	PxgArticulationBlockDofData* dofs = scDesc->mArticulationDofBlocks + blockSlot * dofStride;

	// 1) Joint constraints.
	setupInternalConstraints(block, links, dofs, stepDt, dt, invDt, isTGSSolver, lane);

	// 2) Spatial tendons: tendons are strided by tendon count, constraints and
	//    attachments by tendon count * attachment count.
	{
		const PxU32 tendonStride = scDesc->mMaxSpatialTendonsPerArticulation;
		const PxU32 attachmentStride = scDesc->mMaxAttachmentPerArticulation;
		const PxU32 firstAttachment = blockSlot * tendonStride * attachmentStride;

		PxgArticulationBlockSpatialTendonData* tendons = &scDesc->mArticulationSpatialTendonBlocks[blockSlot * tendonStride];
		PxgArticulationInternalTendonConstraintData* tendonConstraints = &scDesc->mArticulationSpatialTendonConstraintBlocks[firstAttachment];
		PxgArticulationBlockAttachmentData* attachments = &scDesc->mArticulationAttachmentBlocks[firstAttachment];

		setupInternalSpatialTendonConstraints(block, links, dofs, tendons, tendonConstraints, attachments,
			attachmentStride, stepDt, dt, invDt, isTGSSolver, lane);
	}

	// 3) Fixed tendons: same layout with tendon joints in place of attachments.
	{
		const PxU32 tendonStride = scDesc->mMaxFixedTendonsPerArticulation;
		const PxU32 tendonJointStride = scDesc->mMaxTendonJointPerArticulation;
		const PxU32 firstTendonJoint = blockSlot * tendonStride * tendonJointStride;

		PxgArticulationBlockFixedTendonData* tendons = &scDesc->mArticulationFixedTendonBlocks[blockSlot * tendonStride];
		PxgArticulationInternalTendonConstraintData* tendonConstraints = &scDesc->mArticulationFixedTendonConstraintBlocks[firstTendonJoint];
		PxgArticulationBlockTendonJointData* tendonJoints = &scDesc->mArticulationTendonJointBlocks[firstTendonJoint];

		setupInternalFixedTendonConstraints(block, links, dofs, tendons, tendonConstraints, tendonJoints,
			tendonJointStride, stepDt, dt, invDt, isTGSSolver, lane);
	}

	// 4) Mimic joints.
	{
		//See comment accompanying mPathToRootBitFieldBlocks declaration for a quick reminder of the indexing of mPathToRootBitFieldBlocks.
		const PxU32 pathToRootWordCount = (linkStride + 63) / 64;
		PxgArticulationBitFieldData* pathToRootBitFields = &scDesc->mPathToRootBitFieldBlocks[blockSlot * linkStride * pathToRootWordCount];

		//Get the mimic joints for this articulation.
		const PxU32 mimicJointStride = scDesc->mMaxMimicJointsPerArticulation;
		PxgArticulationBlockMimicJointData* mimicJoints = &scDesc->mArticulationMimicJointBlocks[blockSlot * mimicJointStride];

		setupInternalMimicJointConstraints(
			block,
			links,
			pathToRootBitFields, pathToRootWordCount,
			dofs,
			mimicJoints,
			lane);
	}
}

//This is for PGS solver
// Solves the static joint batches and then the static contact batches attached to one link.
// vel is updated in place after every batch, the impulse applied to the articulation side is
// accumulated into impulse, and the total velocity change is added to deltaV when at least
// one batch was processed. linkID is not referenced in the body.
static __device__ PX_FORCE_INLINE void solveStaticConstraints(PxgArticulationCoreDesc* PX_RESTRICT scDesc, PxgArticulationBlockLinkData& PX_RESTRICT data,
	const PxgSolverSharedDesc<IterativeSolveData>* const PX_RESTRICT sharedDesc,
	Cm::UnAlignedSpatialVector& PX_RESTRICT vel, Cm::UnAlignedSpatialVector& PX_RESTRICT impulse, Cm::UnAlignedSpatialVector& PX_RESTRICT deltaV, PxU32 threadIndexInWarp, bool doFriction,
	PxReal /*minPen*/, PxReal /*elapsedTime*/, PxU32 linkID, PxU32 constraintCounts, PxgErrorAccumulator* PX_RESTRICT error)
{
	const IterativeSolveData& solveData = sharedDesc->iterativeData;

	const PxU32 firstJointBatch = data.mStaticJointStartIndex[threadIndexInWarp];
	const PxU32 firstContactBatch = data.mStaticContactStartIndex[threadIndexInWarp];
	const PxU32 contactCounts = data.mNbStaticContacts[threadIndexInWarp];

	// Velocity on entry, used at the end to compute the accumulated change.
	Cm::UnAlignedSpatialVector velAtEntry = vel;

	// ---- static joints ----
	for (PxU32 jointIdx = 0; jointIdx < constraintCounts; ++jointIdx)
	{
		const PxgBlockConstraintBatch& batch = solveData.blockConstraintBatch[firstJointBatch + jointIdx];

		PxU32 batchMask = batch.mask;
		PxU32 offset = warpScanExclusive(batchMask, threadIndexInWarp);

		// The articulation is either body A or body B of the batch; the other side is static.
		const PxNodeIndex nodeA = batch.bodyANodeIndex[offset];
		const bool artiIsBodyA = nodeA.isArticulation();
		assert(artiIsBodyA || batch.bodyBNodeIndex[offset].isArticulation());

		// Both sides start at zero; the articulation side then receives the current velocity.
		Cm::UnAlignedSpatialVector vel0 = Cm::UnAlignedSpatialVector::Zero();
		Cm::UnAlignedSpatialVector vel1 = Cm::UnAlignedSpatialVector::Zero();
		Cm::UnAlignedSpatialVector& artiVel = artiIsBodyA ? vel0 : vel1;
		artiVel = vel;

		Cm::UnAlignedSpatialVector impulse0 = Cm::UnAlignedSpatialVector::Zero();
		Cm::UnAlignedSpatialVector impulse1 = Cm::UnAlignedSpatialVector::Zero();

		assert(batch.constraintType == PxgSolverConstraintDesc::eARTICULATION_CONSTRAINT_1D);

		PxgArticulationBlockResponse* batchResponse = solveData.artiResponse + batch.mArticulationResponseIndex;

		// For interaction with static objects, mass-splitting is not used; thus, reference counts are 1 (default).
		solveExt1DBlock(batch, vel0, vel1, offset, solveData.blockJointConstraintHeaders,
			solveData.blockJointConstraintRowsCon, solveData.blockJointConstraintRowsMod,
			batchResponse, impulse0, impulse1, scDesc->mContactErrorAccumulator.mCounter >= 0);

		assert(artiIsBodyA || batch.bodyBNodeIndex[offset].isArticulation());

		// Fold the articulation side's result back into the running state.
		impulse += artiIsBodyA ? impulse0 : impulse1;
		vel = artiVel;
	}

	// ---- static contacts ----
	for (PxU32 contactIdx = 0; contactIdx < contactCounts; ++contactIdx)
	{
		const PxgBlockConstraintBatch& batch = solveData.blockConstraintBatch[firstContactBatch + contactIdx];

		PxU32 batchMask = batch.mask;
		PxU32 offset = warpScanExclusive(batchMask, threadIndexInWarp);

		const PxNodeIndex nodeA = batch.bodyANodeIndex[offset];
		const bool artiIsBodyA = nodeA.isArticulation();
		assert(artiIsBodyA || batch.bodyBNodeIndex[offset].isArticulation());

		Cm::UnAlignedSpatialVector vel0 = Cm::UnAlignedSpatialVector::Zero();
		Cm::UnAlignedSpatialVector vel1 = Cm::UnAlignedSpatialVector::Zero();
		Cm::UnAlignedSpatialVector& artiVel = artiIsBodyA ? vel0 : vel1;
		artiVel = vel;

		Cm::UnAlignedSpatialVector impulse0 = Cm::UnAlignedSpatialVector::Zero();
		Cm::UnAlignedSpatialVector impulse1 = Cm::UnAlignedSpatialVector::Zero();

		assert(batch.constraintType == PxgSolverConstraintDesc::eARTICULATION_CONTACT);

		// For interaction with static objects, mass-splitting is not used; thus, reference counts are 1 (default).
		solveExtContactsBlock(batch, vel0, vel1, doFriction, solveData.blockContactHeaders,
			solveData.blockFrictionHeaders, solveData.blockContactPoints,
			solveData.blockFrictions, solveData.artiResponse, impulse0,
			impulse1, offset, error);

		assert(artiIsBodyA || batch.bodyBNodeIndex[offset].isArticulation());

		impulse += artiIsBodyA ? impulse0 : impulse1;
		vel = artiVel;
	}

	// Only report a velocity change when something was actually solved.
	if ((constraintCounts + contactCounts) > 0)
	{
		deltaV += vel - velAtEntry;
	}
}

//This is for TGS solver
// Solves the static joint batches and then the static contact batches attached to one link.
// In addition to the velocity, the link's delta motion (and, for joints, its delta rotation)
// is threaded through every batch. The impulse applied to the articulation side is accumulated
// into impulse, and the total velocity change is added to deltaV when at least one batch was
// processed. linkID is not referenced in the body.
static __device__ PX_FORCE_INLINE void solveStaticConstraints(PxgArticulationCoreDesc* PX_RESTRICT scDesc, const PxgArticulationBlockLinkData& PX_RESTRICT data,
	const PxgSolverSharedDesc<IterativeSolveDataTGS>* const PX_RESTRICT sharedDesc,
	Cm::UnAlignedSpatialVector& PX_RESTRICT vel, Cm::UnAlignedSpatialVector& PX_RESTRICT impulse, Cm::UnAlignedSpatialVector& PX_RESTRICT deltaV, PxU32 threadIndexInWarp, bool doFriction,
	PxReal minPen, PxReal elapsedTime, PxU32 linkID, PxU32 constraintCounts, PxgErrorAccumulator* PX_RESTRICT error)
{
	const IterativeSolveDataTGS& solveData = sharedDesc->iterativeData;

	const PxU32 firstJointBatch = data.mStaticJointStartIndex[threadIndexInWarp];
	const PxU32 firstContactBatch = data.mStaticContactStartIndex[threadIndexInWarp];
	const PxU32 contactCounts = data.mNbStaticContacts[threadIndexInWarp];

	// Velocity on entry, used at the end to compute the accumulated change.
	Cm::UnAlignedSpatialVector velAtEntry = vel;

	Cm::UnAlignedSpatialVector delta = loadSpatialVector(data.mDeltaMotion, threadIndexInWarp);

	// The delta rotation is only loaded when there are joint batches. The contact loop
	// carries it along but does not pass it to the contact solver.
	PxQuat deltaQ;
	if (constraintCounts > 0)
	{
		deltaQ = loadQuat(data.mDeltaQ, threadIndexInWarp);
	}

	// ---- static joints ----
	for (PxU32 jointIdx = 0; jointIdx < constraintCounts; ++jointIdx)
	{
		const PxgBlockConstraintBatch& batch = solveData.blockConstraintBatch[firstJointBatch + jointIdx];

		PxU32 batchMask = batch.mask;
		PxU32 offset = warpScanExclusive(batchMask, threadIndexInWarp);

		// The articulation is either body A or body B of the batch; the other side is static.
		const PxNodeIndex nodeA = batch.bodyANodeIndex[offset];
		const bool artiIsBodyA = nodeA.isArticulation();
		assert(artiIsBodyA || batch.bodyBNodeIndex[offset].isArticulation());

		// Both sides start at rest/identity; the articulation side then receives the current state.
		Cm::UnAlignedSpatialVector vel0 = Cm::UnAlignedSpatialVector::Zero();
		Cm::UnAlignedSpatialVector vel1 = Cm::UnAlignedSpatialVector::Zero();
		Cm::UnAlignedSpatialVector delta0 = Cm::UnAlignedSpatialVector::Zero();
		Cm::UnAlignedSpatialVector delta1 = Cm::UnAlignedSpatialVector::Zero();
		PxQuat deltaQ0(PxIdentity);
		PxQuat deltaQ1(PxIdentity);

		Cm::UnAlignedSpatialVector& artiVel = artiIsBodyA ? vel0 : vel1;
		Cm::UnAlignedSpatialVector& artiDelta = artiIsBodyA ? delta0 : delta1;
		PxQuat& artiDeltaQ = artiIsBodyA ? deltaQ0 : deltaQ1;
		artiVel = vel;
		artiDelta = delta;
		artiDeltaQ = deltaQ;

		Cm::UnAlignedSpatialVector impulse0 = Cm::UnAlignedSpatialVector::Zero();
		Cm::UnAlignedSpatialVector impulse1 = Cm::UnAlignedSpatialVector::Zero();

		assert(batch.constraintType == PxgSolverConstraintDesc::eARTICULATION_CONSTRAINT_1D);

		// For interaction with static objects, mass-splitting is not used; thus, reference counts are 1 (default).
		solveExt1DBlockTGS(batch, vel0, vel1, delta0, delta1, offset, solveData.blockJointConstraintHeaders,
			solveData.blockJointConstraintRowsCon, solveData.artiResponse, deltaQ0, deltaQ1, elapsedTime, impulse0, impulse1,
			scDesc->mContactErrorAccumulator.mCounter >= 0);

		assert(artiIsBodyA || batch.bodyBNodeIndex[offset].isArticulation());

		// Fold the articulation side's result back into the running state.
		impulse += artiIsBodyA ? impulse0 : impulse1;
		vel = artiVel;
		delta = artiDelta;
		deltaQ = artiDeltaQ;
	}

	// ---- static contacts ----
	for (PxU32 contactIdx = 0; contactIdx < contactCounts; ++contactIdx)
	{
		const PxgBlockConstraintBatch& batch = solveData.blockConstraintBatch[firstContactBatch + contactIdx];

		PxU32 batchMask = batch.mask;
		PxU32 offset = warpScanExclusive(batchMask, threadIndexInWarp);

		const PxNodeIndex nodeA = batch.bodyANodeIndex[offset];
		const bool artiIsBodyA = nodeA.isArticulation();
		assert(artiIsBodyA || batch.bodyBNodeIndex[offset].isArticulation());

		Cm::UnAlignedSpatialVector vel0 = Cm::UnAlignedSpatialVector::Zero();
		Cm::UnAlignedSpatialVector vel1 = Cm::UnAlignedSpatialVector::Zero();
		Cm::UnAlignedSpatialVector delta0 = Cm::UnAlignedSpatialVector::Zero();
		Cm::UnAlignedSpatialVector delta1 = Cm::UnAlignedSpatialVector::Zero();
		PxQuat deltaQ0(PxIdentity);
		PxQuat deltaQ1(PxIdentity);

		Cm::UnAlignedSpatialVector& artiVel = artiIsBodyA ? vel0 : vel1;
		Cm::UnAlignedSpatialVector& artiDelta = artiIsBodyA ? delta0 : delta1;
		PxQuat& artiDeltaQ = artiIsBodyA ? deltaQ0 : deltaQ1;
		artiVel = vel;
		artiDelta = delta;
		artiDeltaQ = deltaQ;

		Cm::UnAlignedSpatialVector impulse0 = Cm::UnAlignedSpatialVector::Zero();
		Cm::UnAlignedSpatialVector impulse1 = Cm::UnAlignedSpatialVector::Zero();

		assert(batch.constraintType == PxgSolverConstraintDesc::eARTICULATION_CONTACT);

		// For interaction with static objects, mass-splitting is not used; thus, reference counts are 1 (default).
		solveExtContactBlockTGS(batch, vel0, vel1, delta0, delta1, offset,
			solveData.blockContactHeaders, solveData.blockFrictionHeaders, solveData.blockContactPoints,
			solveData.blockFrictions, solveData.artiResponse, elapsedTime, minPen, impulse0, impulse1, error);

		assert(artiIsBodyA || batch.bodyBNodeIndex[offset].isArticulation());

		impulse += artiIsBodyA ? impulse0 : impulse1;
		vel = artiVel;
		delta = artiDelta;
		deltaQ = artiDeltaQ;
	}

	// Only report a velocity change when something was actually solved.
	if ((constraintCounts + contactCounts) > 0)
	{
		deltaV += vel - velAtEntry;
	}
}

// Solves the internal (per-dof) joint constraints and the static contacts/joints of an articulation,
// one articulation per thread, by walking the link tree with an explicit per-lane stack.
//
// A thread with globalThreadIndex < nbArticulations works on lane threadIdx.x of the block-format data
// selected by blockIdx.x. For every link the forward visit
//   1) propagates the parent's delta velocity to the link (propagateAccelerationW),
//   2) for each non-locked dof of the inbound joint solves friction, drive, position limit and velocity limit,
//   3) solves the static constraints attached to the link (solveStaticConstraints),
//   4) pushes the link's children onto the stack.
// When the traversal comes back to a link (parent == linkID) the impulse accumulated on that link is
// propagated to its parent (propagateImpulseW_0). The impulse that finally reaches the root is added to
// blockData.mRootDeferredZ.
//
// Template parameters:
//   IterativeData             iterative-data type carried by sharedDesc.
//   isTGS                     selects the TGS variants of the drive/friction bookkeeping below.
//   residualReportingEnabled  when true, residuals are accumulated locally and written to the articulation's
//                             internal/contact residual accumulators.
//
// Parameters:
//   scDesc                    articulation core descriptor (block data, counts, dt, impulses, ...).
//   dt                        used as the effective time step when perSubset is true, and passed to computeLimitImpulse.
//   invDt                     passed to computeLimitImpulse.
//   elapsedTime               passed to computeDriveImpulse and solveStaticConstraints.
//   isVelIter                 velocity-iteration flag; sets minPen to 0 (otherwise -PX_MAX_F32) and alters drive handling for TGS.
//   staticContactUniqueIds    not referenced in this function body.
//   staticJointUniqueIds      not referenced in this function body.
//   sharedDesc                forwarded to solveStaticConstraints.
//   erp                       passed to computeLimitImpulse.
//   doFriction                forwarded to solveStaticConstraints.
//   isExternalForceEveryStep  together with isTGS and !isVelIter selects the per-substep ("perSubset") scaling.
template <typename IterativeData, const bool isTGS, const bool residualReportingEnabled>
static __device__ void artiSolveInternalConstraints1T(PxgArticulationCoreDesc* PX_RESTRICT scDesc, const PxReal dt,
	const PxReal invDt, const PxReal elapsedTime, const bool isVelIter, const PxU32* const PX_RESTRICT staticContactUniqueIds,
	const PxU32* const PX_RESTRICT staticJointUniqueIds,
	const PxgSolverSharedDesc<IterativeData>* const PX_RESTRICT sharedDesc,
	const PxReal erp,
	bool doFriction, bool isExternalForceEveryStep )
{
	const PxU32 nbSlabs = scDesc->nbSlabs; // # articulation slabs
	const PxU32 nbArticulations = scDesc->nbArticulations;

	const PxU32 blockStride = blockDim.x;// / WARP_SIZE;

	//This identifies which warp a specific thread is in, we treat all warps in all blocks as a flatten warp array
	//and we are going to index the work based on that
	//const PxU32 warpIndex = threadIdx.y;
	const PxU32 globalThreadIndex = blockIdx.x * blockStride + threadIdx.x;

	// Velocity iterations pass 0 as minPen to the static constraint solve; other iterations pass -PX_MAX_F32.
	const PxReal minPen = isVelIter ? 0.f : -PX_MAX_F32;

	// Thread-local residual accumulators: "error" for the internal dof constraints, "contactError" for static constraints.
	PxgErrorAccumulator error;
	PxgErrorAccumulator contactError;

	if (globalThreadIndex < nbArticulations)
	{
		// use remap because of reinsertion of articulations
		const PxgBodySim* const PX_RESTRICT gBodySim = scDesc->mBodySimBufferDeviceData;
		const PxNodeIndex* const PX_RESTRICT gIslandNodeIndex = scDesc->islandNodeIndices;
		const PxU32 articulationOffset = scDesc->articulationOffset;
		const PxU32 nodeIndex = gIslandNodeIndex[globalThreadIndex + articulationOffset].index();
		const PxgBodySim& bodySim = gBodySim[nodeIndex];
		PxgArticulation& articulation = scDesc->articulations[bodySim.articulationRemapId];

		if(residualReportingEnabled)
		{
			articulation.internalResidualAccumulator.reset();
			articulation.contactResidualAccumulator.reset();
		}

		//KS - strong possibility that nodeIndex and bodySim can be dropped because articId == globalWarpIndex!

		//Identify which block we are solving...

		PxgArticulationBlockLinkData* PX_RESTRICT data = scDesc->mArticulationLinkBlocks + scDesc->mMaxLinksPerArticulation * blockIdx.x;
		PxgArticulationBlockDofData* PX_RESTRICT dofData = scDesc->mArticulationDofBlocks + scDesc->mMaxDofsPerArticulation * blockIdx.x;
		PxgArticulationBlockData& blockData = scDesc->mArticulationBlocks[blockIdx.x];

		const PxU32 numLinks = blockData.mNumLinks[threadIdx.x];

		// If impulses are flagged as pending for this articulation, average and propagate them before solving.
		if (blockData.mStateDirty[threadIdx.x] & PxgArtiStateDirtyFlag::eHAS_IMPULSES)
		{
			averageLinkImpulsesAndPropagate(scDesc->slabHasChanges, scDesc->impulses, blockData, data, dofData, globalThreadIndex, scDesc->mMaxLinksPerArticulation,
				nbArticulations, nbSlabs, numLinks, threadIdx.x);
		}
		// The dirty state is overwritten (not OR-ed) with eVEL_DIRTY.
		blockData.mStateDirty[threadIdx.x] = PxgArtiStateDirtyFlag::eVEL_DIRTY;

		const bool fixBase = blockData.mFlags[threadIdx.x] & PxArticulationFlag::eFIX_BASE;

		const Cm::UnAlignedSpatialVector rootDeferredZ = loadSpatialVector(blockData.mRootDeferredZ, threadIdx.x);

		// parentDeltaV/parentImp track the state of the link currently acting as "parent" during the traversal.
		// storeParentDeltaV keeps the root delta velocity as it was before the root's static constraints were solved.
		Cm::UnAlignedSpatialVector parentDeltaV(PxVec3(0.f), PxVec3(0.f)), storeParentDeltaV(PxVec3(0.f), PxVec3(0.f));
		Cm::UnAlignedSpatialVector parentImp(PxVec3(0.f), PxVec3(0.f));


		// Only a non-fixed root has a delta velocity and has its static constraints solved here.
		if (!fixBase)
		{
			const PxU32 constraintCounts0 = data[0].mNbStaticJoints[threadIdx.x];

			Dy::SpatialMatrix spatialMatrix;
			loadSpatialMatrix(blockData.mInvSpatialArticulatedInertia, threadIdx.x, spatialMatrix);

			const Cm::UnAlignedSpatialVector motionVelocity0 = loadSpatialVector(data[0].mMotionVelocity, threadIdx.x);

			// Root delta velocity produced by the deferred root impulse.
			parentDeltaV = spatialMatrix * -rootDeferredZ;

			Cm::UnAlignedSpatialVector rootVel = motionVelocity0 + parentDeltaV;
			storeParentDeltaV = parentDeltaV;

			//Solve constraints...
			solveStaticConstraints(scDesc, data[0], sharedDesc, rootVel, parentImp, parentDeltaV, threadIdx.x,
				doFriction, minPen, elapsedTime, 0, constraintCounts0, residualReportingEnabled ? &contactError : NULL);
		}

		PxgArticulationTraversalStackData* PX_RESTRICT stack = scDesc->mArticulationTraversalStackBlocks + scDesc->mMaxLinksPerArticulation * blockIdx.x;

		PxU32 parent = 0;

		// Per-link scratch: deltaVStack/impulseStack are indexed by link id, indices is indexed by stack position.
		storeSpatialVector(stack[parent].deltaVStack, storeParentDeltaV, threadIdx.x);
		storeSpatialVector(stack[parent].impulseStack, parentImp, threadIdx.x);

		PxU32 offset = data[parent].mChildrenOffset[threadIdx.x];
		PxU32 numChildren = data[parent].mNumChildren[threadIdx.x];

		// Seed the stack with the root's children.
		for (PxU32 i = 0; i < numChildren; ++i)
		{
			stack[i].indices[threadIdx.x] = offset + i;
		}

		PxU32 stackCount = numChildren;

		// The link on top of the stack is visited next; 0xffffffff marks "no link".
		PxU32 linkID = ((stackCount != 0) ? stack[stackCount - 1].indices[threadIdx.x] : 0xffffffff);
		ArticulationImplicitDriveDesc implicitDriveDesc(PxZero);	// PT: moved outside of loop because we don't have an empty ctor

		while (stackCount != 0)
		{
			// The forward visit of a link ends with parent = linkID, so parent == linkID means the link on top of the
			// stack has already been solved and is now ready to push its impulse up to its parent.
			const bool isBackProp = (parent == linkID);

			PxgArticulationBlockLinkData& linkData = data[linkID];

			// Offset passed to the propagate functions as PxVec3(c2px, c2py, c2pz).
			const float c2px = linkData.mRw_x[threadIdx.x];
			const float c2py = linkData.mRw_y[threadIdx.x];
			const float c2pz = linkData.mRw_z[threadIdx.x];

			const PxU32 jointOffset = linkData.mJointOffset[threadIdx.x];

			const PxU32 dofCount = linkData.mDofs[threadIdx.x];

			if (isBackProp)
			{
				parent = linkData.mParents[threadIdx.x];

				const Cm::UnAlignedSpatialVector solverSpatialImpulse = loadSpatialVector(linkData.mSolverSpatialImpulse, threadIdx.x);

				const Cm::UnAlignedSpatialVector ZInternalConstraint = loadSpatialVector(linkData.mSolverSpatialInternalConstraintImpulse, threadIdx.x);

				PxSpatialMatrix spatialResponse;
				loadSpatialMatrix(data[parent].mSpatialResponseMatrix, threadIdx.x, spatialResponse);

				// Pop this link.
				stackCount--;

				// At this point parentImp still holds the impulse accumulated on the link being popped.
				Cm::UnAlignedSpatialVector impulse = parentImp;

				// Restore the parent's state that was saved when the parent was visited.
				parentDeltaV = loadSpatialVector(stack[parent].deltaVStack, threadIdx.x);
				parentImp = loadSpatialVector(stack[parent].impulseStack, threadIdx.x);

				//Accumulate the solver impulse applied to this link.
				storeSpatialVector(linkData.mSolverSpatialImpulse, solverSpatialImpulse + impulse - ZInternalConstraint, threadIdx.x);

				//We're finished with this link so we can move to the next one.
				linkID = stackCount > 0 ? stack[stackCount - 1].indices[threadIdx.x] : 0xffffffff;

				const Cm::UnAlignedSpatialVector propagateImp = propagateImpulseW_0(PxVec3(c2px, c2py, c2pz), dofData + jointOffset, impulse, dofCount, threadIdx.x);

				parentImp += propagateImp;


				// Update the parent's delta velocity with the response to its accumulated impulse.
				Cm::UnAlignedSpatialVector deltaV = spatialResponse * -parentImp;

				parentDeltaV += deltaV;

				// Only the impulse is written back here; the parent's deltaVStack entry is left unchanged.
				storeSpatialVector(stack[parent].impulseStack, parentImp, threadIdx.x);
			}
			else
			{
				Cm::UnAlignedSpatialVector parentV = parentDeltaV + loadSpatialVector(data[parent].mMotionVelocity, threadIdx.x);
				const Cm::UnAlignedSpatialVector childDelta = loadSpatialVector(linkData.mDeltaMotion, threadIdx.x);
				const Cm::UnAlignedSpatialVector parentDelta = loadSpatialVector(data[parent].mDeltaMotion, threadIdx.x);


				// Delta velocity of this link obtained from the parent's delta velocity.
				Cm::UnAlignedSpatialVector deltaV = propagateAccelerationW(PxVec3(c2px, c2py, c2pz), dofData + jointOffset,
					parentDeltaV, dofCount, NULL, threadIdx.x);

				Cm::UnAlignedSpatialVector childV = deltaV + loadSpatialVector(linkData.mMotionVelocity, threadIdx.x);

				// Saved before the dof solve below modifies deltaV.
				storeSpatialVector(stack[linkID].deltaVStack, deltaV, threadIdx.x);

				// Impulse accumulated on this link during this visit.
				Cm::UnAlignedSpatialVector impulse = Cm::UnAlignedSpatialVector(PxVec3(0.f), PxVec3(0.f));

				for (PxU32 dof = 0; dof < dofCount; ++dof)
				{
					PxgArticulationBlockDofData& PX_RESTRICT thisDof = dofData[jointOffset + dof];
					const PxU32 motion = thisDof.mMotion[threadIdx.x];
					if (motion != PxArticulationMotion::eLOCKED)
					{
						const PxReal maxJointVel = thisDof.mConstraintData.mMaxJointVelocity[threadIdx.x];
						// PT: preload as much data as we can
						const Cm::UnAlignedSpatialVector row0 = loadSpatialVector(thisDof.mConstraintData.mRow0, threadIdx.x);
						const Cm::UnAlignedSpatialVector row1 = loadSpatialVector(thisDof.mConstraintData.mRow1, threadIdx.x);
						const PxReal maxDriveForce = thisDof.mConstraintData.mConstraintMaxImpulse[threadIdx.x];
						PxReal appliedDriveImpulse = thisDof.mConstraintData.mDriveImpulse[threadIdx.x];
						const PxReal recipResponse = thisDof.mConstraintData.mRecipResponse[threadIdx.x];
						const PxReal response = thisDof.mConstraintData.mResponse[threadIdx.x];
						const PxReal maxFrictionForce = thisDof.mConstraintData.mMaxFrictionForce[threadIdx.x];
						const Cm::UnAlignedSpatialVector deltaVA = loadSpatialVector(thisDof.mConstraintData.mDeltaVA, threadIdx.x);
						const Cm::UnAlignedSpatialVector deltaVB = loadSpatialVector(thisDof.mConstraintData.mDeltaVB, threadIdx.x);

						// perSubset: TGS position iteration with external forces applied every step. In that case the
						// efforts below are scaled by the dt argument, otherwise by scDesc->dt.
						const bool perSubset = isTGS && isExternalForceEveryStep && !isVelIter;
						const PxReal effectiveDt = perSubset ? dt : scDesc->dt;

						const PxReal staticFrictionImpulse = thisDof.mConstraintData.mStaticFrictionEffort[threadIdx.x] * effectiveDt;
						const PxReal dynamicFrictionImpulse = thisDof.mConstraintData.mDynamicFrictionEffort[threadIdx.x] * effectiveDt;
						const PxReal viscousFrictionCoefficient = thisDof.mConstraintData.mViscousFrictionCoefficient[threadIdx.x] * effectiveDt;

						const PxReal maxImpulse = thisDof.mConstraintData.mMaxEffort[threadIdx.x] * effectiveDt;
						const PxReal speedImpulseGradient = thisDof.mConstraintData.mSpeedEffortGradient[threadIdx.x] / effectiveDt;
						const PxReal velocityDependentResistance = thisDof.mConstraintData.mVelocityDependentResistance[threadIdx.x] * effectiveDt;
						const PxReal externalJointImpulse = articulation.jointForce[jointOffset+dof] * effectiveDt;
						const PxReal maxActuatorVelocity = thisDof.mConstraintData.mMaxActuatorVelocity[threadIdx.x];

						// The implicit drive description is only needed when computeDriveImpulse is called below,
						// i.e. not in TGS velocity iterations.
						if (!(isTGS && isVelIter))
							implicitDriveDesc = thisDof.mConstraintData.getImplicitDriveDesc(threadIdx.x);

						// Relative velocity and relative delta motion of child vs. parent along this dof.
						PxReal jointV = row1.innerProduct(childV) - row0.innerProduct(parentV);

						const PxReal jointDeltaP = row1.innerProduct(childDelta) - row0.innerProduct(parentDelta);

						PxReal frictionDeltaF = 0.0f;
						// The new friction model is selected when a static friction or viscous friction value is set.
						bool newFrictionModel = staticFrictionImpulse != 0.0f || viscousFrictionCoefficient !=  0.0f;

						// deprecated friction model
						if (!newFrictionModel)
						{
							// Friction force is accumulated through all position iterations only for PGS
							const PxReal appliedFriction = isTGS ? 0.0f : thisDof.mConstraintData.mAccumulatedFrictionImpulse[threadIdx.x];

							const PxReal frictionForce = PxClamp(-jointV * recipResponse + appliedFriction,
																-maxFrictionForce, maxFrictionForce);
							thisDof.mConstraintData.mAccumulatedFrictionImpulse[threadIdx.x] = frictionForce; // This is not used for TGS

							frictionDeltaF = frictionForce - appliedFriction;

							jointV += frictionDeltaF * response;
						}

						PxReal driveDeltaF = 0.0f;

						// Drive path used when a positive max effort is set: the drive impulse is clamped by clampDriveImpulse.
						if (maxImpulse > 0.0f)
						{
							appliedDriveImpulse = perSubset ? 0.0f: appliedDriveImpulse;

							// TGS velocity iterations reuse the already applied drive impulse instead of recomputing it.
							const PxReal unclampedImpulse = (isTGS && isVelIter)
								? appliedDriveImpulse
								: computeDriveImpulse(appliedDriveImpulse, jointV, jointDeltaP, elapsedTime, implicitDriveDesc);

							// The external joint impulse is added for the clamp and subtracted again from its result.
							const PxReal clampedImpulse = clampDriveImpulse(
								jointV,
								appliedDriveImpulse + externalJointImpulse,
								unclampedImpulse + externalJointImpulse,
								response,
								maxActuatorVelocity,
								maxImpulse,
								speedImpulseGradient,
								velocityDependentResistance
							) - externalJointImpulse;

							driveDeltaF = clampedImpulse - appliedDriveImpulse; // to keep track of accumulated impulse for velIter in TGS with isExternalForceEveryStep
							thisDof.mConstraintData.mDriveImpulse[threadIdx.x] += driveDeltaF;
						}

						// Otherwise the drive impulse is clamped symmetrically to +/- maxDriveForce.
						else
						{
							const PxReal unclampedForce = (isTGS && isVelIter) ? appliedDriveImpulse :
									computeDriveImpulse(appliedDriveImpulse, jointV, jointDeltaP, elapsedTime,
										implicitDriveDesc);

							const PxReal clampedForce = PxClamp(unclampedForce, -maxDriveForce, maxDriveForce);
									driveDeltaF = (clampedForce - appliedDriveImpulse);

							thisDof.mConstraintData.mDriveImpulse[threadIdx.x] = clampedForce;
						}
						jointV += driveDeltaF * response;

						// The new friction model is solved after the drive, using the joint velocity that includes the drive change.
						if (newFrictionModel)
						{
							const PxReal appliedFriction = isTGS && isExternalForceEveryStep && !isVelIter ? 0.0f : thisDof.mConstraintData.mAccumulatedFrictionImpulse[threadIdx.x];
							PxReal totalImpulse = appliedFriction - jointV * recipResponse;
							totalImpulse = computeFrictionImpulse(totalImpulse, staticFrictionImpulse, dynamicFrictionImpulse, viscousFrictionCoefficient, jointV);
							frictionDeltaF = totalImpulse - appliedFriction;
							thisDof.mConstraintData.mAccumulatedFrictionImpulse[threadIdx.x] += frictionDeltaF; // to keep track of accumulated impulse for velIter in TGS with isExternalForceEveryStep
							jointV += frictionDeltaF * response;
						}

						// Position limits are only solved for eLIMITED dofs.
						PxReal posLimitDeltaF = 0.0f;
						if (motion == PxArticulationMotion::eLIMITED)
						{
							const PxReal errorLow = thisDof.mConstraintData.mLimitError_LowX_highY[threadIdx.x].x;
							const PxReal errorHigh = thisDof.mConstraintData.mLimitError_LowX_highY[threadIdx.x].y;
							// lowImp/highImp are passed by reference into the stored per-dof limit impulses.
							PxReal& lowImp = thisDof.mConstraintData.mLowImpulse[threadIdx.x];
							PxReal& highImp = thisDof.mConstraintData.mHighImpulse[threadIdx.x];
							posLimitDeltaF = computeLimitImpulse(
								dt, invDt, isVelIter,
								response, recipResponse, erp,
								errorLow, errorHigh, jointDeltaP,
								lowImp, highImp, jointV);
						}

						// Velocity limit: clamp the joint velocity to +/- maxJointVel and convert the change to an impulse.
						PxReal velLimitDeltaF = 0.0f;
						if (PxAbs(jointV) > maxJointVel)
						{
							const PxReal newJointV = PxClamp(jointV, -maxJointVel, maxJointVel);
							velLimitDeltaF = (newJointV - jointV) * recipResponse;
							jointV = newJointV;
						}

						// Total impulse change on this dof for this iteration.
						const PxReal deltaF = frictionDeltaF + driveDeltaF + posLimitDeltaF + velLimitDeltaF;

						//Accumulate error even if it is zero because the increment of the counter affects the RMS value
						if (residualReportingEnabled)
							error.accumulateErrorLocal(deltaF, recipResponse);

						//if (deltaF != 0.f)
						{
							// the signs look suspicious here
							const Cm::UnAlignedSpatialVector pDelta = deltaVA * -deltaF;
							const Cm::UnAlignedSpatialVector cDelta = deltaVB * -deltaF;

							// Apply the velocity change to both the delta velocities and the full velocities of parent and child.
							parentDeltaV += pDelta;
							deltaV += cDelta;

							parentV += pDelta;
							childV += cDelta;

							//KS - TODO - remove msImpulses and msDeltaV from here!
							parentImp += row0 * deltaF;
							impulse -= row1 * deltaF;
						}
					}
				}

				const PxU32 constraintCounts = linkData.mNbStaticJoints[threadIdx.x];

				numChildren = linkData.mNumChildren[threadIdx.x];	// PT: preload to avoid stall

				//Store the internal constraint impulse applied to this link on this solver iteration.
				storeSpatialVector(linkData.mSolverSpatialInternalConstraintImpulse, impulse, threadIdx.x);

				solveStaticConstraints(scDesc, linkData, sharedDesc, childV, impulse, deltaV, threadIdx.x,
					doFriction, minPen, elapsedTime, linkID, constraintCounts, residualReportingEnabled ? &contactError : NULL);

				// Save the impulses of parent and child so the back-propagation step can reload them.
				storeSpatialVector(stack[parent].impulseStack, parentImp, threadIdx.x);
				storeSpatialVector(stack[linkID].impulseStack, impulse, threadIdx.x);

				{
					parent = linkID;
					//if there are no children under the current link, we don't change the linkID so parent index
					//will be the same as linkID
					if (numChildren > 0)
					{
						offset = linkData.mChildrenOffset[threadIdx.x];
						for (PxU32 i = 0; i < numChildren; ++i)
						{
							stack[stackCount++].indices[threadIdx.x] = offset + i;
						}

						linkID = stack[stackCount - 1].indices[threadIdx.x];
					}

					// The link just solved becomes the parent for the next visit.
					parentDeltaV = deltaV;
					parentImp = impulse;

				}
			}
		}

		// Add the impulse that reached the root to the deferred root impulse.
		storeSpatialVector(blockData.mRootDeferredZ, rootDeferredZ + parentImp, threadIdx.x);

		if (residualReportingEnabled)
		{
			error.accumulateErrorGlobalNoAtomics(articulation.internalResidualAccumulator);
			contactError.accumulateErrorGlobalNoAtomics(articulation.contactResidualAccumulator);
		}
	}
}


// Recomputes, for every spatial tendon of the articulation handled by lane threadIndexInWarp, the accumulated
// tendon length at each leaf attachment and stores it in the matching tendon constraint (mAccumulatedLength).
//
// The attachment tree of each tendon is walked with a 64-bit mask of pending attachments: the highest set
// bit is processed next. For each attachment the distance between its world-space attach point and the one of
// its parent is scaled by the attachment coefficient and added to the length accumulated at the parent.
// Attachments with children keep the running length for their children; attachments without children write
// it to the next constraint slot of the tendon.
//
//   artiBlock              per-articulation block data (number of spatial tendons).
//   artiLinks              link block data (accumulated link poses).
//   artiTendons            spatial tendon block data (tendon offset).
//   artiTendonConstraints  output constraints, maxAttachments entries per tendon.
//   artiAttachments        attachment block data, maxAttachments entries per tendon.
//   maxAttachments         stride between two tendons in the constraint/attachment arrays.
//   threadIndexInWarp      lane used to index all block-format arrays.
static __device__ void updateSolveInternalTendonConstraintsTGS(
	PxgArticulationBlockData& artiBlock,
	PxgArticulationBlockLinkData* PX_RESTRICT artiLinks,
	PxgArticulationBlockSpatialTendonData* PX_RESTRICT artiTendons,
	PxgArticulationInternalTendonConstraintData* PX_RESTRICT artiTendonConstraints,
	PxgArticulationBlockAttachmentData* PX_RESTRICT artiAttachments,
	const PxU32 maxAttachments,
	const PxU32 threadIndexInWarp
	)
{
	const PxU32 tendonCount = artiBlock.mNumSpatialTendons[threadIndexInWarp];

	// Running tendon length at each attachment that has children, indexed by attachment id.
	PxReal lengthAtAttachment[DY_ARTICULATION_TENDON_MAX_SIZE];

	for (PxU32 tendonIdx = 0; tendonIdx < tendonCount; ++tendonIdx)
	{
		const PxReal tendonOffset = artiTendons[tendonIdx].mOffset[threadIndexInWarp];

		PxgArticulationBlockAttachmentData* attachments = artiAttachments + tendonIdx * maxAttachments;
		PxgArticulationInternalTendonConstraintData* constraints = artiTendonConstraints + tendonIdx * maxAttachments;

		// Bit mask of attachments that still have to be visited, seeded with the root's children.
		PxU64 pendingMask = attachments[0].mChildrens[threadIndexInWarp];

		const PxReal rootCoefficient = attachments[0].mCoefficient[threadIndexInWarp];

		PxU32 pendingCount = __popcll(pendingMask);

		PxU32 parentIdx = 0;

		// Next free constraint slot for this tendon.
		PxU32 leafCount = 0;

		lengthAtAttachment[parentIdx] = tendonOffset * rootCoefficient;

		// The highest set bit is the attachment visited first.
		PxU32 childIdx = 63 - __clzll(pendingMask);

		while (pendingCount != 0)
		{
			--pendingCount;

			PxgArticulationBlockAttachmentData& childAttachment = attachments[childIdx];
			PxgArticulationBlockAttachmentData& parentAttachment = attachments[parentIdx];

			const PxU32 childLinkIdx = childAttachment.mLinkIndex[threadIndexInWarp];
			PxgArticulationBlockLinkData& childLink = artiLinks[childLinkIdx];
			const PxU32 parentLinkIdx = parentAttachment.mLinkIndex[threadIndexInWarp];
			PxgArticulationBlockLinkData& parentLink = artiLinks[parentLinkIdx];

			const PxTransform childPose = loadSpatialTransform(childLink.mAccumulatedPose, threadIndexInWarp);
			const PxTransform parentPose = loadSpatialTransform(parentLink.mAccumulatedPose, threadIndexInWarp);

			// World-space attach points: link position plus the rotated relative offset.
			const PxVec3 childAttachPoint = childPose.p + childPose.q.rotate(childAttachment.mRelativeOffset[threadIndexInWarp]);
			const PxVec3 parentAttachPoint = parentPose.p + parentPose.q.rotate(parentAttachment.mRelativeOffset[threadIndexInWarp]);

			const PxVec3 separation = parentAttachPoint - childAttachPoint;
			const PxReal distance = PxSqrt(separation.magnitudeSquared());

			const PxReal lengthAtChild = distance * childAttachment.mCoefficient[threadIndexInWarp] + lengthAtAttachment[parentIdx];

			const PxU64 grandChildren = childAttachment.mChildrens[threadIndexInWarp];

			if (grandChildren != 0)
			{
				// Interior attachment: its children still have to be visited and need the running length.
				pendingCount += __popcll(grandChildren);
				lengthAtAttachment[childIdx] = lengthAtChild;
			}
			else
			{
				// Leaf attachment: publish the accumulated length to the next constraint of this tendon.
				constraints[leafCount++].mAccumulatedLength[threadIndexInWarp] = lengthAtChild;
			}

			if (pendingCount > 0)
			{
				// Remove the attachment just processed and add its children to the pending set.
				pendingMask &= (~(1ull << childIdx));
				pendingMask |= grandChildren;

				// Pick the next attachment (highest pending bit) and look up its parent.
				childIdx = 63 - __clzll(pendingMask);
				parentIdx = attachments[childIdx].mParents[threadIndexInWarp];
			}
		}
	}
}

// Returns the spatial velocity of link linkID: the link's stored motion velocity plus the delta velocity
// obtained by propagating the deferred root impulse (artiBlock.mRootDeferredZ) from the root down to the link.
//
// For a fixed-base articulation the root contributes no delta velocity. The path from the root to linkID is
// read from the link's path-to-root bit field (wordSize 64-bit words per link) and traversed from the lowest
// set bit upwards; link 0 (the root) is skipped because it has no inbound joint.
//
//   artiBlock          per-articulation block data (deferred root impulse, inverse articulated inertia).
//   linkData           link block data.
//   dofData            dof block data.
//   linkBitFieldData   path-to-root bit fields, wordSize words per link.
//   wordSize           number of 64-bit words per link in linkBitFieldData.
//   linkID             link whose velocity is requested.
//   fixBase            true when the articulation root is fixed.
//   jointDofSpeeds     optional output (may be NULL). When non-NULL, receives for each dof of linkID's inbound
//                      joint the stored joint velocity plus the delta joint speed arising from the deferred impulses.
//   threadIndexInWarp  lane used to index all block-format arrays.
static __device__ Cm::UnAlignedSpatialVector pxcFsGetVelocity(
	PxgArticulationBlockData& artiBlock,
	PxgArticulationBlockLinkData* linkData,
	PxgArticulationBlockDofData* dofData,
	PxgArticulationBitFieldStackData* linkBitFieldData,
	const PxU32 wordSize, const PxU32 linkID, const bool fixBase,
	PxReal* jointDofSpeeds,
	const PxU32 threadIndexInWarp)
{
	Cm::UnAlignedSpatialVector deltaV(PxVec3(0.f), PxVec3(0.f));

	if (!fixBase)
	{
		const Cm::UnAlignedSpatialVector rootDeferredZ = loadSpatialVector(artiBlock.mRootDeferredZ, threadIndexInWarp);

		Dy::SpatialMatrix spatialMatrix;

		loadSpatialMatrix(artiBlock.mInvSpatialArticulatedInertia, threadIndexInWarp, spatialMatrix);

		// Root delta velocity produced by the deferred root impulse.
		deltaV = spatialMatrix * -rootDeferredZ;
	}

	// Delta joint speeds of linkID's inbound joint; only filled when jointDofSpeeds is requested.
	PxReal deltaJointDofSpeeds[3] = {0, 0, 0};

	PxgArticulationBlockLinkData& link = linkData[linkID];

	for (PxU32 j = 0, wordOffset = 0; j < wordSize; ++j, wordOffset += 64)
	{
		PxU64 pathToRoot = linkBitFieldData[linkID * wordSize + j].bitField[threadIndexInWarp];

		while (pathToRoot != 0)
		{
			const PxU32 index = articulationLowestSetBit(pathToRoot) + wordOffset;

			if (index != 0)
			{
				PxgArticulationBlockLinkData& cLink = linkData[index];

				const PxU32 jointOffset = cLink.mJointOffset[threadIndexInWarp];
				const PxU32 dofCount = cLink.mDofs[threadIndexInWarp];

				// Use the lane index passed by the caller, like every other per-lane access in this function,
				// rather than reading threadIdx.x directly.
				const float rwx = cLink.mRw_x[threadIndexInWarp];
				const float rwy = cLink.mRw_y[threadIndexInWarp];
				const float rwz = cLink.mRw_z[threadIndexInWarp];

				const PxVec3 childToParent(rwx, rwy, rwz);

				//Compute the deltaqDot on the inbound joint of linkID.
				PxReal* optionalDeltaJointSpeeds = ((linkID == index) && jointDofSpeeds) ? deltaJointDofSpeeds : NULL;

				deltaV = propagateAccelerationW(childToParent, dofData + jointOffset, deltaV, dofCount, optionalDeltaJointSpeeds, threadIndexInWarp);
			}

			//clear the lowest bit
			pathToRoot &= (pathToRoot - 1);
		}
	}

	//Optionally report the updated joint speed after accounting for the delta joint dof speed arising from the deferred impulses.
	if(jointDofSpeeds)
	{
		const PxU32 dofCount = link.mDofs[threadIndexInWarp];
		const PxU32 jointOffset = link.mJointOffset[threadIndexInWarp];
		for(PxU32 i = 0; i < dofCount; i++)
		{
			jointDofSpeeds[i] = dofData[jointOffset + i].mJointVelocities[threadIndexInWarp] + deltaJointDofSpeeds[i];
		}
	}

	Cm::UnAlignedSpatialVector motionVelocity = loadSpatialVector(link.mMotionVelocity, threadIndexInWarp);

	return motionVelocity + deltaV;
}

// Walks from startLink towards the root until stopLink is reached (stopLink itself is not processed).
// For every visited link the current impulse is added to the link's mSolverSpatialImpulse and then
// propagated across the link's inbound joint to its parent. jointImpulseAtStart (may be NULL) is only
// handed to the propagation of startLink's own inbound joint. Returns the impulse arriving at stopLink.
static __device__ Cm::UnAlignedSpatialVector pxcFsPropagateImpulseToAncestor(
	PxgArticulationBlockLinkData* PX_RESTRICT linkData,
	PxgArticulationBlockDofData* PX_RESTRICT dofData,
	const PxU32 startLink, const PxU32 stopLink,
	Cm::UnAlignedSpatialVector Z,
	const PxReal* jointImpulseAtStart,
	const PxU32 threadIndexInWarp)
{
	for (PxU32 link = startLink; link != stopLink; link = linkData[link].mParents[threadIndexInWarp])
	{
		PxgArticulationBlockLinkData& current = linkData[link];
		const PxU32 jointOffset = current.mJointOffset[threadIndexInWarp];
		const PxU32 dofCount = current.mDofs[threadIndexInWarp];

		const PxVec3 child2Parent(
			current.mRw_x[threadIndexInWarp],
			current.mRw_y[threadIndexInWarp],
			current.mRw_z[threadIndexInWarp]);

		// The joint impulse belongs to the inbound joint of the first link of the walk only.
		const PxReal* jointImpulseToApply = (link == startLink) ? jointImpulseAtStart : NULL;

		addSpatialVector(linkData[link].mSolverSpatialImpulse, Z, threadIndexInWarp);

		Z = propagateImpulseW_0(child2Parent, dofData + jointOffset, Z, dofCount, threadIndexInWarp, jointImpulseToApply, 1.0f);
	}

	return Z;
}

// Applies a pair of impulses to two links of the same articulation and accumulates the result into the
// per-link solver impulses and the deferred root impulse.
//
// The negated impulses are first propagated separately from linkID1 and linkID0 up to their deepest common
// ancestor (found from the path-to-root bit fields), then summed and propagated from the common ancestor to
// the root, where the remainder is added to blockData.mRootDeferredZ.
//
// jointImpulse0/jointImpulse1 (each may be NULL) are applied to the inbound joint of linkID0/linkID1. If one
// of those links is itself the common ancestor, its joint impulse is applied when propagating from the
// common ancestor instead.
//
//   linkBitFields / linkBitFieldWordCount  path-to-root bit fields and number of 64-bit words per link.
//   linear0/angular0, linear1/angular1     impulses for linkID0 and linkID1 (negated before propagation).
//   threadIndexInWarp                      lane used to index all block-format arrays.
static __device__ void pxcFsApplyImpulses(PxgArticulationBlockData& blockData,
	PxgArticulationBlockLinkData* PX_RESTRICT linkData,
	PxgArticulationBlockDofData* PX_RESTRICT dofData,
	PxgArticulationBitFieldStackData* PX_RESTRICT linkBitFields, const PxU32 linkBitFieldWordCount,
	const PxU32 linkID0, const PxVec3& linear0, const PxVec3& angular0, const PxReal* PX_RESTRICT jointImpulse0,
	const PxU32 linkID1, const PxVec3& linear1, const PxVec3& angular1, const PxReal* PX_RESTRICT jointImpulse1,
	const PxU32 threadIndexInWarp)
{
	// Find the deepest link shared by both paths to the root: scan the bit-field words from the most
	// significant one and take the highest bit set in both. Stays 0 (root) if no word has a common bit.
	PxU32 commonLink = 0;
	for (PxU32 word = linkBitFieldWordCount; word-- > 0;)
	{
		const PxU64 pathBits0 = linkBitFields[linkID0 * linkBitFieldWordCount + word].bitField[threadIndexInWarp];
		const PxU64 pathBits1 = linkBitFields[linkID1 * linkBitFieldWordCount + word].bitField[threadIndexInWarp];
		const PxU64 sharedBits = pathBits0 & pathBits1;
		if (sharedBits != 0)
		{
			commonLink = articulationHighestSetBit(sharedBits) + word * 64;
			break;
		}
	}

	const Cm::UnAlignedSpatialVector Z0Start(-linear0, -angular0);
	const Cm::UnAlignedSpatialVector Z1Start(-linear1, -angular1);

	//The common link will either be linkID1, or its ancestors.
	//The common link cannot be an index before either linkID1 or linkID0.
	//If linkID1 (or linkID0) is the common link, the corresponding walk below is empty and its joint impulse
	//is applied later, when propagating from the common link to the root.
	const Cm::UnAlignedSpatialVector Z1 = pxcFsPropagateImpulseToAncestor(
		linkData, dofData, linkID1, commonLink, Z1Start, jointImpulse1, threadIndexInWarp);

	const Cm::UnAlignedSpatialVector Z0 = pxcFsPropagateImpulseToAncestor(
		linkData, dofData, linkID0, commonLink, Z0Start, jointImpulse0, threadIndexInWarp);

	const Cm::UnAlignedSpatialVector ZAtCommon = Z0 + Z1;

	//Work out how much joint impulse to apply to the inbound joint of the common link: the sum of the joint
	//impulses of whichever of linkID0/linkID1 coincide with the common link.
	PxReal jointImpulseToApplyAtCommonLink[3] = {0, 0, 0};
	const bool link0IsCommon = (linkID0 == commonLink) && (jointImpulse0 != NULL);
	const bool link1IsCommon = (linkID1 == commonLink) && (jointImpulse1 != NULL);
	if (link0IsCommon || link1IsCommon)
	{
		const PxU32 dofCountAtCommonLink = linkData[commonLink].mDofs[threadIndexInWarp];

		if (link0IsCommon)
		{
			for (PxU32 d = 0; d < dofCountAtCommonLink; d++)
				jointImpulseToApplyAtCommonLink[d] += jointImpulse0[d];
		}

		if (link1IsCommon)
		{
			for (PxU32 d = 0; d < dofCountAtCommonLink; d++)
				jointImpulseToApplyAtCommonLink[d] += jointImpulse1[d];
		}
	}

	//Propagate from the common link to the root (link 0). Only the inbound joint of the common link receives
	//the joint impulse computed above.
	const Cm::UnAlignedSpatialVector ZAtRoot = pxcFsPropagateImpulseToAncestor(
		linkData, dofData, commonLink, 0, ZAtCommon, jointImpulseToApplyAtCommonLink, threadIndexInWarp);

	addSpatialVector(blockData.mRootDeferredZ, ZAtRoot, threadIndexInWarp);
}


// Solves the internal spatial tendon constraints of the articulation handled by lane threadIndexInWarp of
// the block selected by blockIdx.x.
//
// For TGS the accumulated tendon lengths are refreshed first. Then, for every tendon with at least one
// constraint, the constraints are processed from last to first: the relative velocity of child vs. parent
// along the constraint rows is combined with the length error and the limit error into new applied forces,
// and the force change is applied as a pair of opposite impulses to the parent and child links.
//
//   scDesc             articulation core descriptor providing all block-format arrays and strides.
//   isTGS              when true, updateSolveInternalTendonConstraintsTGS is run before solving.
//   threadIndexInWarp  lane used to index all block-format arrays.
static __device__ void solveInternalSpatialConstraints(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	const bool isTGS, const PxU32 threadIndexInWarp)
{
	const PxU32 maxLinks = scDesc->mMaxLinksPerArticulation;
	const PxU32 maxTendons = scDesc->mMaxSpatialTendonsPerArticulation;
	const PxU32 maxAttachments = scDesc->mMaxAttachmentPerArticulation;

	// Number of 64-bit words needed for one path-to-root bit field.
	const PxU32 wordSize = (maxLinks + 63 )/ 64;

	// Locate the data of the articulation block owned by this CUDA block.
	PxgArticulationBlockData& blockData = scDesc->mArticulationBlocks[blockIdx.x];
	PxgArticulationBlockLinkData* links = scDesc->mArticulationLinkBlocks + maxLinks * blockIdx.x;
	PxgArticulationBlockDofData* dofs = scDesc->mArticulationDofBlocks + scDesc->mMaxDofsPerArticulation * blockIdx.x;
	PxgArticulationBitFieldData* pathToRootBits = scDesc->mPathToRootBitFieldBlocks + maxLinks * wordSize * blockIdx.x;

	PxgArticulationBlockSpatialTendonData* tendons = scDesc->mArticulationSpatialTendonBlocks + maxTendons * blockIdx.x;
	PxgArticulationInternalTendonConstraintData* tendonConstraints = scDesc->mArticulationSpatialTendonConstraintBlocks + maxTendons * maxAttachments * blockIdx.x;
	PxgArticulationBlockAttachmentData* attachments = scDesc->mArticulationAttachmentBlocks + maxTendons * maxAttachments * blockIdx.x;

	const bool fixBase = blockData.mFlags[threadIndexInWarp] & PxArticulationFlag::eFIX_BASE;

	if (isTGS)
	{
		//compute the accumulated errors
		updateSolveInternalTendonConstraintsTGS(blockData, links, tendons, tendonConstraints, attachments, maxAttachments, threadIndexInWarp);
	}

	const PxU32 numTendons = blockData.mNumSpatialTendons[threadIndexInWarp];

	for (PxU32 tendonIdx = 0; tendonIdx < numTendons; ++tendonIdx)
	{
		const PxU32 numConstraints = tendons[tendonIdx].mNumConstraints[threadIndexInWarp];

		// Nothing to solve for a tendon without constraints.
		if (numConstraints == 0)
			continue;

		PxgArticulationInternalTendonConstraintData* constraints = tendonConstraints + tendonIdx * maxAttachments;

		//For the internal tendon constraints, the parent link is exactly the same for all the constraints.
		//Therefore, we can precompute the velocity of the parent once per tendon.
		const PxU32 parentID = constraints[numConstraints - 1].mLink0[threadIndexInWarp];
		const Cm::UnAlignedSpatialVector parentVel = pxcFsGetVelocity(blockData, links, dofs, pathToRootBits, wordSize, parentID, fixBase, NULL, threadIndexInWarp);

		// Parent velocity projected on the parent row; updated incrementally as constraints are solved.
		PxReal parentV = loadSpatialVector(constraints[0].mRow0, threadIndexInWarp).innerProduct(parentVel);

		for (PxI32 c = numConstraints - 1; c >= 0; --c)
		{
			PxgArticulationInternalTendonConstraintData& constraint = constraints[c];

			assert(parentID == constraint.mLink0[threadIndexInWarp]);
			const PxU32 childID = constraint.mLink1[threadIndexInWarp];

			const Cm::UnAlignedSpatialVector childVel = pxcFsGetVelocity(blockData, links, dofs, pathToRootBits, wordSize, childID, fixBase, NULL, threadIndexInWarp);

			// Length error with respect to the rest distance.
			const PxReal accumLength = constraint.mAccumulatedLength[threadIndexInWarp];
			const PxReal error = constraint.mRestDistance[threadIndexInWarp] - accumLength;

			// Limit error: non-zero only when the accumulated length lies outside [lowLimit, highLimit].
			const PxReal lowLimit = constraint.mLowLimit[threadIndexInWarp];
			const PxReal highLimit = constraint.mHighLimit[threadIndexInWarp];

			PxReal limitError = 0.f;
			if (accumLength > highLimit)
			{
				limitError = highLimit - accumLength;
			}
			else if (accumLength < lowLimit)
			{
				limitError = lowLimit - accumLength;
			}

			const Cm::UnAlignedSpatialVector row1 = loadSpatialVector(constraint.mRow1, threadIndexInWarp);
			const Cm::UnAlignedSpatialVector row0 = loadSpatialVector(constraint.mRow0, threadIndexInWarp);

			const PxReal jointV = row1.innerProduct(childVel) - parentV;

			const PxReal velMultiplier = constraint.mVelMultiplier[threadIndexInWarp];
			const PxReal biasCoefficient = constraint.mBiasCoefficient[threadIndexInWarp];
			const PxReal appliedForce = constraint.mAppliedForce[threadIndexInWarp];
			const PxReal impulseMultiplier = constraint.mImpulseMultiplier[threadIndexInWarp];

			const PxReal limitBiasCoefficient = constraint.mLimitBiasCoefficient[threadIndexInWarp];
			const PxReal limitAppliedForce = constraint.mLimitAppliedForce[threadIndexInWarp];
			const PxReal limitImpulseMultiplier = constraint.mLimitImpulseMultiplier[threadIndexInWarp];

			// New (unclamped) forces for the length constraint and the limit constraint.
			const PxReal newForce = jointV * velMultiplier + error * biasCoefficient + appliedForce * impulseMultiplier;

			const PxReal newLimitForce = (limitError * limitBiasCoefficient) + limitAppliedForce * limitImpulseMultiplier;

			// Total change of force relative to what was applied before this iteration.
			const PxReal deltaF = (newForce - appliedForce) + (newLimitForce - limitAppliedForce);

			constraint.mAppliedForce[threadIndexInWarp] = newForce;

			constraint.mLimitAppliedForce[threadIndexInWarp] = newLimitForce;

			// Keep the projected parent velocity up to date for the remaining constraints.
			parentV += constraint.mDeltaVA[threadIndexInWarp] * -deltaF;

			if (deltaF != 0.f)
			{
				const Cm::UnAlignedSpatialVector parentImpulse = row0 * (-deltaF);
				const Cm::UnAlignedSpatialVector childImpulse = row1 * deltaF;

				pxcFsApplyImpulses(
					blockData, links, dofs,
					pathToRootBits, wordSize,
					parentID, parentImpulse.top, parentImpulse.bottom, NULL,
					childID, childImpulse.top, childImpulse.bottom, NULL,
					threadIndexInWarp);
			}
		}
	}
}


// Solves the fixed-tendon constraints of the articulation stored in lane threadIndexInWarp of
// articulation block blockIdx.x.
//
// For every fixed tendon of the articulation the function runs three phases:
//  1) Walk the tendon-joint tree starting from the children of tendon joint 0, using the per-joint
//     child bit masks (mChildrens) as a work stack. For each visited joint the parent's scratch delta
//     velocity is propagated to the child link (stored in mScratchDeltaV), the child's mScratchImpulse
//     is zeroed, and the coefficient-weighted joint position and joint velocity are accumulated.
//  2) Visit every joint collected in phase 1 (highest joint index first), update the stored applied
//     forces of its constraint, and push the resulting impulse into the parent link's mScratchImpulse.
//  3) Combine the impulse gathered on the tendon's root link with the accumulated impulse along the
//     first child's mRow0, propagate it up the link chain until link 0 is reached and add it to
//     mRootDeferredZ.
// After all tendons are processed, mScratchImpulse of every link is reset to zero (only if the
// articulation has at least one fixed tendon).
//
// scDesc               articulation core descriptor providing the per-block data arrays.
// isTGS                not read anywhere in this function.
// threadIndexInWarp    lane used to index every per-articulation array.
static __device__ void solveInternalFixedConstraints(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	const bool isTGS, const PxU32 threadIndexInWarp)
{

	const PxU32 maxLinks = scDesc->mMaxLinksPerArticulation;
	// Number of 64-bit words needed to hold one bit per link.
	const PxU32 wordSize = (maxLinks + 63) / 64;

	// All per-block arrays are laid out with a fixed stride per articulation block; blockIdx.x selects the block.
	PxgArticulationBlockLinkData* artiLinks = scDesc->mArticulationLinkBlocks + maxLinks * blockIdx.x;
	PxgArticulationBlockDofData* artiDofs = scDesc->mArticulationDofBlocks + scDesc->mMaxDofsPerArticulation * blockIdx.x;

	PxgArticulationBitFieldData* linkBitFields = scDesc->mPathToRootBitFieldBlocks + maxLinks * wordSize * blockIdx.x;
	const PxU32 maxFixedTendons = scDesc->mMaxFixedTendonsPerArticulation;
	const PxU32 maxTendonJoints = scDesc->mMaxTendonJointPerArticulation;

	// Tendon joints and tendon constraints are stored as maxTendonJoints consecutive entries per tendon.
	PxgArticulationBlockFixedTendonData* artiTendon = scDesc->mArticulationFixedTendonBlocks + maxFixedTendons * blockIdx.x;
	PxgArticulationInternalTendonConstraintData* artiTendonConstraints = scDesc->mArticulationFixedTendonConstraintBlocks + maxFixedTendons * maxTendonJoints * blockIdx.x;
	PxgArticulationBlockTendonJointData* artiTendonJoints = scDesc->mArticulationTendonJointBlocks + maxFixedTendons * maxTendonJoints * blockIdx.x;
	PxgArticulationBlockData& blockData = scDesc->mArticulationBlocks[blockIdx.x];


	const bool fixBase = blockData.mFlags[threadIndexInWarp] & PxArticulationFlag::eFIX_BASE;
	const PxU32 numTendons = blockData.mNumFixedTendons[threadIndexInWarp];

	for (PxU32 i = 0; i < numTendons; ++i)
	{
		PxgArticulationBlockFixedTendonData& tendonData = artiTendon[i];

		const PxReal lowLimit = tendonData.mLowLimit[threadIndexInWarp];
		const PxReal highLimit = tendonData.mHighLimit[threadIndexInWarp];

		PxgArticulationBlockTendonJointData* tendonJointBlock = &artiTendonJoints[i * maxTendonJoints];
		PxgArticulationInternalTendonConstraintData* constraintBlock = &artiTendonConstraints[i * maxTendonJoints];

		// Work stack of tendon-joint indices (one bit per joint), seeded with the children of tendon joint 0.
		PxU64 bitStack = tendonJointBlock[0].mChildrens[threadIndexInWarp];

		PxU32 stackCount = __popcll(bitStack);

		// NOTE(review): 'parent' is only read for the lookup just below; the assignment inside the
		// traversal loop is never read afterwards.
		PxU32 parent = 0;

		// Tendon joint 0 and the link it is attached to act as the root of this tendon.
		PxgArticulationBlockTendonJointData& sTendonJointData = tendonJointBlock[parent];
		const PxU32 sLinkIndex = sTendonJointData.mLinkIndex[threadIndexInWarp];
		PxgArticulationBlockLinkData& sLink = artiLinks[sLinkIndex];
		// NOTE(review): sBody2World is loaded but not used anywhere below.
		const PxTransform sBody2World = loadSpatialTransform(sLink.mAccumulatedPose, threadIndexInWarp);

		Cm::UnAlignedSpatialVector parentVel = pxcFsGetVelocity(blockData, artiLinks, artiDofs, linkBitFields, wordSize, sLinkIndex, fixBase, NULL, threadIndexInWarp);
		Cm::UnAlignedSpatialVector velA = loadSpatialVector(sLink.mMotionVelocity, threadIndexInWarp);

		// Seed the root link's scratch state: zero impulse, and the difference between the velocity
		// returned by pxcFsGetVelocity and the stored motion velocity as its delta velocity.
		Cm::UnAlignedSpatialVector delta = parentVel - velA;
		storeSpatialVector(sLink.mScratchImpulse, Cm::UnAlignedSpatialVector::Zero(), threadIndexInWarp);
		storeSpatialVector(sLink.mScratchDeltaV, delta, threadIndexInWarp);


		// Sum of the impulse magnitude changes (deltaF) applied over all joints of this tendon.
		PxReal rootImp = 0.f;
		// Union of every joint index pushed on the work stack; replayed in the second pass.
		PxU64 totalStack = 0;


		// Coefficient-weighted sums of joint positions and joint velocities over the tendon.
		PxReal error = 0.f;
		PxReal velocity = 0.f;

		// Index of the highest set bit, i.e. the first joint popped from the stack.
		// NOTE(review): if tendon joint 0 has no children (bitStack == 0), __clzll returns 64 and this
		// wraps to 0xFFFFFFFF, which is later used to index tendonJointBlock. Presumably tendons always
		// have at least one child joint - confirm against the tendon setup code.
		const PxU32 firstChild = 63 - __clzll(bitStack);

		totalStack |= bitStack;


		// Pass 1: traverse the tendon-joint tree, always popping the highest set bit.
		while (stackCount != 0)
		{

			PxU32 child = 63 - __clzll(bitStack);
			bitStack &= (~(1ull << (child)));
			stackCount--;

			PxgArticulationBlockTendonJointData& tjData = tendonJointBlock[child];

			const PxU32 cLinkInd = tjData.mLinkIndex[threadIndexInWarp];
			PxgArticulationBlockLinkData& cLink = artiLinks[cLinkInd];
			const PxU32 pLinkInd = cLink.mParents[threadIndexInWarp];
			PxgArticulationBlockLinkData& pLink = artiLinks[pLinkInd];
			const PxU32 jointOffset = cLink.mJointOffset[threadIndexInWarp];
			const PxU32 dofCount = cLink.mDofs[threadIndexInWarp];

			const float rwx = cLink.mRw_x[threadIndexInWarp];
			const float rwy = cLink.mRw_y[threadIndexInWarp];
			const float rwz = cLink.mRw_z[threadIndexInWarp];

			// Offset vector handed to the propagation helpers (named child-to-parent here).
			const PxVec3 c2p(rwx, rwy, rwz);

			const PxU32 constraintId = tjData.mConstraintId[threadIndexInWarp];


			PxgArticulationInternalTendonConstraintData& constraint = constraintBlock[constraintId];

			const PxU32 tjAxis = tjData.mAxis[threadIndexInWarp];
			const PxReal coefficient = tjData.mCoefficient[threadIndexInWarp];

			// Map the tendon joint's axis to the dof slot of the child link's joint.
			const PxU32 dofIndex = cLink.mInvDofIds[tjAxis][threadIndexInWarp];
			PxgArticulationBlockDofData& dofData = artiDofs[jointOffset + dofIndex];

			const PxReal jointPose = dofData.mJointPositions[threadIndexInWarp];

			// The parent link's scratch delta velocity was written either by the root seeding above or by
			// an earlier iteration of this loop.
			Cm::UnAlignedSpatialVector parentDeltaV = loadSpatialVector(pLink.mScratchDeltaV, threadIndexInWarp);
			Cm::UnAlignedSpatialVector parentV = loadSpatialVector(pLink.mMotionVelocity, threadIndexInWarp) + parentDeltaV;

			Cm::UnAlignedSpatialVector cDeltaV = propagateAccelerationW(c2p, artiDofs + jointOffset, parentDeltaV, dofCount, NULL, threadIndexInWarp);

			// Cache the child's delta velocity so that its own children can pick it up.
			storeSpatialVector(cLink.mScratchDeltaV, cDeltaV, threadIndexInWarp);


			Cm::UnAlignedSpatialVector velB = loadSpatialVector(cLink.mMotionVelocity, threadIndexInWarp);
			Cm::UnAlignedSpatialVector childVel = velB + cDeltaV;

			// Clear the child's impulse accumulator; pass 2 adds propagated impulses into it.
			storeSpatialVector(cLink.mScratchImpulse, Cm::UnAlignedSpatialVector(PxVec3(0.f), PxVec3(0.f)), threadIndexInWarp);


			PxU64 children = tjData.mChildrens[threadIndexInWarp];

			//KS - constraint.row0.innerProduct(rootVel) is a known so we can replace it with a dynamically updated scalar value...
			Cm::UnAlignedSpatialVector row0 = loadSpatialVector(constraint.mRow0, threadIndexInWarp);
			Cm::UnAlignedSpatialVector row1 = loadSpatialVector(constraint.mRow1, threadIndexInWarp);
			// Relative velocity of child and parent measured along the constraint rows.
			PxReal jointV = row1.innerProduct(childVel) - row0.innerProduct(parentV);


			error += jointPose * coefficient;
			velocity += jointV * coefficient;

			//Add myself in the list to propagate up changes with my children back up the system
			//assign child to parent
			parent = child;

			if (children)
			{
				stackCount += __popcll(children);
				//add on children to the stack
				bitStack |= children;
				totalStack |= children;
			}
		}

		// Number of tendon joints visited; the accumulated terms are divided evenly between them.
		const PxU32 count = __popcll(totalStack);
		const PxReal scale = count ? 1.f / PxReal(count) : 0.f;

		// Tendon length: weighted sum of joint positions plus the tendon's offset.
		const PxReal length = error + tendonData.mOffset[threadIndexInWarp];


		// Signed amount by which the length lies outside [lowLimit, highLimit]; zero when inside.
		PxReal limitError = 0.f;
		if (length < lowLimit)
			limitError = length - lowLimit;
		else if (length > highLimit)
			limitError = length - highLimit;

		// From here on 'error' is the per-joint share of the deviation from the rest length.
		error = (length - tendonData.mRestLength[threadIndexInWarp])*scale;
		limitError *= scale;
		velocity *= scale;

		//Once we get here, we've got the full stack...

		// Pass 2: visit every collected joint, highest index first, and push impulses towards the parents.
		// NOTE(review): this relies on a link's mScratchImpulse being complete before the link itself is
		// visited, which presumably holds because child joints have higher indices than their parents - confirm.
		while (totalStack != 0)
		{
			PxU32 child = 63 - __clzll(totalStack);
			totalStack &= (~(1ull << (child)));

			PxgArticulationBlockTendonJointData& tjData = tendonJointBlock[child];

			const PxU32 cLinkInd = tjData.mLinkIndex[threadIndexInWarp];
			PxgArticulationBlockLinkData& cLink = artiLinks[cLinkInd];
			const PxU32 pLinkInd = cLink.mParents[threadIndexInWarp];
			PxgArticulationBlockLinkData& pLink = artiLinks[pLinkInd];
			const PxU32 jointOffset = cLink.mJointOffset[threadIndexInWarp];
			const PxU32 dofCount = cLink.mDofs[threadIndexInWarp];

			const float rwx = cLink.mRw_x[threadIndexInWarp];
			const float rwy = cLink.mRw_y[threadIndexInWarp];
			const float rwz = cLink.mRw_z[threadIndexInWarp];

			const PxVec3 c2p(rwx, rwy, rwz);

			const PxU32 constraintId = tjData.mConstraintId[threadIndexInWarp];

			PxgArticulationInternalTendonConstraintData& constraint = constraintBlock[constraintId];

			const PxReal recipCoefficient = tjData.mRecipCoefficient[threadIndexInWarp];
			const PxReal velMultiplier = constraint.mVelMultiplier[threadIndexInWarp];
			const PxReal biasCoefficient = constraint.mBiasCoefficient[threadIndexInWarp];
			const PxReal limitBiasCoefficient = constraint.mLimitBiasCoefficient[threadIndexInWarp];
			const PxReal impulseMultiplier = constraint.mImpulseMultiplier[threadIndexInWarp];
			const PxReal appliedForce = constraint.mAppliedForce[threadIndexInWarp];
			const PxReal limitImpulseMultiplier = constraint.mLimitImpulseMultiplier[threadIndexInWarp];
			const PxReal limitAppliedForce = constraint.mLimitAppliedForce[threadIndexInWarp];

			// New accumulated force of the rest-length term (no clamping is applied here).
			const PxReal unclampedForce = ((velocity * velMultiplier + error * biasCoefficient)*recipCoefficient)
				+ appliedForce * impulseMultiplier;


			// New accumulated force of the limit term.
			const PxReal unclampedForce2 = (limitError * limitBiasCoefficient * recipCoefficient)
				+ limitAppliedForce * limitImpulseMultiplier;

			// Change relative to the previously stored forces; this is what gets applied now.
			const PxReal deltaF = ((unclampedForce - appliedForce) + (unclampedForce2 - limitAppliedForce));

			constraint.mAppliedForce[threadIndexInWarp] = unclampedForce;
			constraint.mLimitAppliedForce[threadIndexInWarp] = unclampedForce2;


			rootImp += deltaF;

			// Impulse already pushed onto this link by joints processed earlier in this pass.
			const Cm::UnAlignedSpatialVector cImpulse = loadSpatialVector(cLink.mScratchImpulse, threadIndexInWarp);

			const Cm::UnAlignedSpatialVector impulse = loadSpatialVector(constraint.mRow1, threadIndexInWarp) * -deltaF + cImpulse;

			//Store (impulse - YInt)
			//but YInt = constraint.mRow1 * -deltaF
			//so (impulse - YInt) = cImpulse.
			addSpatialVector(cLink.mSolverSpatialImpulse, cImpulse, threadIndexInWarp);

			Cm::UnAlignedSpatialVector propagatedImpulse = propagateImpulseW_0(c2p, artiDofs + jointOffset, impulse, dofCount, threadIndexInWarp);

			// Hand the propagated impulse to the parent link's accumulator.
			addSpatialVector(pLink.mScratchImpulse, propagatedImpulse, threadIndexInWarp);


		}


		// The root-side row is taken from the constraint of the first joint popped in pass 1.
		const PxU32 firstConstraint = tendonJointBlock[firstChild].mConstraintId[threadIndexInWarp];
		const PxgArticulationInternalTendonConstraintData& constraint = constraintBlock[firstConstraint];

		const Cm::UnAlignedSpatialVector propagatedImpulse = loadSpatialVector(sLink.mScratchImpulse, threadIndexInWarp);

		// Total impulse on the tendon's root link: what the children propagated plus the summed deltaF along row0.
		Cm::UnAlignedSpatialVector Z = propagatedImpulse + loadSpatialVector(constraint.mRow0, threadIndexInWarp) * rootImp;


		// Pass 3: carry Z from the tendon's root link up the parent chain; the loop stops once link 0 is reached.
		for (PxU32 linkID = sLinkIndex; linkID; linkID = artiLinks[linkID].mParents[threadIndexInWarp])
		{
			PxgArticulationBlockLinkData& link = artiLinks[linkID];
			const PxU32 jointOffset = link.mJointOffset[threadIndexInWarp];
			const PxU32 dofCount = link.mDofs[threadIndexInWarp];

			const float rwx = link.mRw_x[threadIndexInWarp];
			const float rwy = link.mRw_y[threadIndexInWarp];
			const float rwz = link.mRw_z[threadIndexInWarp];

			const PxVec3 c2p(rwx, rwy, rwz);

			Z = propagateImpulseW_0(c2p, artiDofs + jointOffset, Z, dofCount, threadIndexInWarp);
		}

		// Accumulate the result into the articulation's deferred root impulse.
		addSpatialVector(blockData.mRootDeferredZ, Z, threadIndexInWarp);
	}

	// Leave the scratch impulse of every link zeroed once tendon solving has used it.
	if (numTendons > 0)
	{
		const PxU32 nbLinks = blockData.mNumLinks[threadIndexInWarp];
		for (PxU32 i = 0; i < nbLinks; ++i)
		{
			storeSpatialVector(artiLinks[i].mScratchImpulse, Cm::UnAlignedSpatialVector::Zero(), threadIndexInWarp);
		}
	}
}

// Solves every mimic-joint constraint of the articulation stored in lane threadIndexInWarp of
// articulation block blockIdx.x.
//
// For each mimic joint the current positions and speeds of the two coupled dofs are gathered,
// computeMimicJointImpulses() converts them into one joint impulse per dof, and both impulses are
// applied through pxcFsApplyImpulses() as joint-space impulses (the spatial link impulses passed
// alongside are zero).
//
// scDesc               articulation core descriptor providing the per-block data arrays.
// biasCoefficient, dt, recipDt, isVelocityIteration
//                      forwarded unchanged to computeMimicJointImpulses().
// isTGS                not read by this function.
// threadIndexInWarp    lane used to index every per-articulation array.
static __device__ void solveInternalMimicJointConstraints(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc, const PxReal biasCoefficient, const PxReal dt, const PxReal recipDt,
	const bool isVelocityIteration, const bool isTGS, const PxU32 threadIndexInWarp)
{
	const PxU32 lane = threadIndexInWarp;

	const PxU32 linkCapacity = scDesc->mMaxLinksPerArticulation;
	// One bit per link, packed into 64-bit words.
	const PxU32 bitFieldWords = (linkCapacity + 63) / 64;
	const PxU32 mimicJointCapacity = scDesc->mMaxMimicJointsPerArticulation;

	// Per-block views of the articulation data; blockIdx.x selects the articulation block.
	PxgArticulationBlockData& articulation = scDesc->mArticulationBlocks[blockIdx.x];
	PxgArticulationBlockLinkData* links = scDesc->mArticulationLinkBlocks + linkCapacity * blockIdx.x;
	PxgArticulationBlockDofData* dofs = scDesc->mArticulationDofBlocks + scDesc->mMaxDofsPerArticulation * blockIdx.x;
	PxgArticulationBitFieldData* pathToRootBits = scDesc->mPathToRootBitFieldBlocks + linkCapacity * bitFieldWords * blockIdx.x;
	PxgArticulationBlockMimicJointData* mimicJoints = scDesc->mArticulationMimicJointBlocks + mimicJointCapacity * blockIdx.x;

	const bool isBaseFixed = articulation.mFlags[lane] & PxArticulationFlag::eFIX_BASE;
	const PxU32 mimicJointCount = articulation.mNumMimicJoints[lane];

	for (PxU32 jointIdx = 0; jointIdx < mimicJointCount; ++jointIdx)
	{
		const PxgArticulationBlockMimicJointData& joint = mimicJoints[jointIdx];

		// The two (link, dof) pairs coupled by this mimic joint.
		const PxU32 linkIdA = joint.mLinkA[lane];
		const PxU32 linkIdB = joint.mLinkB[lane];
		const PxU32 dofIdA = joint.mInternalData.mDofA[lane];
		const PxU32 dofIdB = joint.mInternalData.mDofB[lane];

		// Constraint parameters handed to the impulse computation.
		const PxReal recipEffectiveInertia = joint.mInternalData.recipEffectiveInertia[lane];
		const PxReal ratio = joint.mGearRatio[lane];
		const PxReal positionOffset = joint.mOffset[lane];
		const PxReal frequency = joint.mNaturalFrequency[lane];
		const PxReal damping = joint.mDampingRatio[lane];

		// First dof slot of each link's joint inside the dof array.
		const PxU32 firstDofA = links[linkIdA].mJointOffset[lane];
		const PxU32 firstDofB = links[linkIdB].mJointOffset[lane];

		// Joint positions are read straight from the dof data, whichever solver is in use.
		const PxReal positionA = dofs[firstDofA + dofIdA].mJointPositions[lane];
		const PxReal positionB = dofs[firstDofB + dofIdB].mJointPositions[lane];

		// Joint speeds: pxcFsGetVelocity fills the per-dof speed array of each link;
		// only the coupled dof of each link is used afterwards.
		PxReal dofSpeedsA[3] = {0, 0, 0};
		PxReal dofSpeedsB[3] = {0, 0, 0};
		pxcFsGetVelocity(articulation, links, dofs, pathToRootBits, bitFieldWords, linkIdA, isBaseFixed, dofSpeedsA, lane);
		pxcFsGetVelocity(articulation, links, dofs, pathToRootBits, bitFieldWords, linkIdB, isBaseFixed, dofSpeedsB, lane);
		const PxReal speedA = dofSpeedsA[dofIdA];
		const PxReal speedB = dofSpeedsB[dofIdB];

		// Compute one scalar impulse per coupled dof.
		PxReal impulseOnDofA = 0.0f;
		PxReal impulseOnDofB = 0.0f;
		computeMimicJointImpulses(
			biasCoefficient, dt, recipDt,
			positionA, positionB, speedA, speedB,
			ratio, positionOffset, frequency, damping, recipEffectiveInertia,
			isVelocityIteration,
			impulseOnDofA, impulseOnDofB);

		// Scatter the scalars into per-joint impulse arrays; every other dof receives zero.
		PxReal dofImpulsesA[3] = {0, 0, 0};
		PxReal dofImpulsesB[3] = {0, 0, 0};
		dofImpulsesA[dofIdA] = impulseOnDofA;
		dofImpulsesB[dofIdB] = impulseOnDofB;

		// No spatial impulse is applied to the links, only the joint-space impulses.
		PxVec3 noSpatialImpulse(0,0,0);
		pxcFsApplyImpulses(
			articulation, links, dofs,
			pathToRootBits, bitFieldWords,
			linkIdA, noSpatialImpulse, noSpatialImpulse, dofImpulsesA,
			linkIdB, noSpatialImpulse, noSpatialImpulse, dofImpulsesB,
			lane);
	}
}


// Kernel: solves the internal spatial-tendon, fixed-tendon and mimic-joint constraints.
// Each thread handles one articulation: blockIdx.x selects the articulation block and threadIdx.x
// the lane inside it. Threads beyond nbArticulations, and articulations without tendons or
// mimic joints, do nothing.
extern "C" __global__
__launch_bounds__(WARP_SIZE, 12)
void artiSolveInternalTendonAndMimicJointConstraints1T(
const PxgArticulationCoreDesc* PX_RESTRICT scDesc, const PxReal biasCoefficient, const PxReal dt, const PxReal recipDt, const bool velocityIteration, const bool isTGS)
{
	const PxU32 lane = threadIdx.x;

	const PxU32 articulationCount = scDesc->nbArticulations;

	// Flat articulation index of this thread across the whole grid.
	const PxU32 articulationIndex = blockIdx.x * blockDim.x + threadIdx.x;
	if (articulationIndex >= articulationCount)
		return;

	PxgArticulationBlockData& articulation = scDesc->mArticulationBlocks[blockIdx.x];

	// Nothing to solve unless the articulation owns at least one tendon or mimic joint.
	const bool hasInternalConstraints =
		articulation.mNumSpatialTendons[lane] || articulation.mNumFixedTendons[lane] || articulation.mNumMimicJoints[lane];
	if (!hasInternalConstraints)
		return;

	// Impulses still pending for this articulation are averaged and propagated before solving.
	const PxU32 dirtyBits = articulation.mStateDirty[lane];
	if (dirtyBits & PxgArtiStateDirtyFlag::eHAS_IMPULSES)
	{
		const PxU32 linkCount = articulation.mNumLinks[lane];
		PxgArticulationBlockLinkData* links = scDesc->mArticulationLinkBlocks + scDesc->mMaxLinksPerArticulation * blockIdx.x;
		PxgArticulationBlockDofData* dofs = scDesc->mArticulationDofBlocks + scDesc->mMaxDofsPerArticulation * blockIdx.x;
		averageLinkImpulsesAndPropagate(
			scDesc->slabHasChanges, scDesc->impulses, articulation, links, dofs,
			articulationIndex, scDesc->mMaxLinksPerArticulation,
			articulationCount, scDesc->nbSlabs, linkCount, lane);
	}

	// The dirty state is overwritten (not OR-ed) with the velocity-dirty flag.
	articulation.mStateDirty[lane] = PxgArtiStateDirtyFlag::eVEL_DIRTY;

	// Solve order: spatial tendons, then fixed tendons, then mimic joints.
	solveInternalSpatialConstraints(scDesc, isTGS, lane);
	solveInternalFixedConstraints(scDesc, isTGS, lane);
	solveInternalMimicJointConstraints(scDesc, biasCoefficient, dt, recipDt, velocityIteration, isTGS, lane);
}


// Kernel entry point for the internal-constraint solve using IterativeSolveData.
// Launch note carried over from the original comment: two warps per block.
// Residual reporting is a compile-time template argument of the shared implementation, so the
// runtime flag picks one of two instantiations; all other arguments are forwarded unchanged.
extern "C" __global__
void artiSolveInternalConstraints1T(PxgArticulationCoreDesc* PX_RESTRICT scDesc, const PxReal dt,
	const PxReal invDt, const bool velocityIteration, const PxReal elapsedTime, const PxReal biasCoefficient,
	const PxU32* const PX_RESTRICT staticContactUniqueIds,
	const PxU32* const PX_RESTRICT staticJointUniqueIds,
	const PxgSolverSharedDesc<IterativeSolveData>* const PX_RESTRICT sharedDesc,
	bool doFriction,
	bool residualReportingEnabled,
	bool isExternalForcesEveryTgsIterationEnabled)
{
	// This kernel also resets articulation reference counts to zero after all usage.
	if (!residualReportingEnabled)
	{
		artiSolveInternalConstraints1T<IterativeSolveData, false, false>(
			scDesc, dt, invDt, elapsedTime, velocityIteration,
			staticContactUniqueIds, staticJointUniqueIds, sharedDesc,
			biasCoefficient, doFriction, isExternalForcesEveryTgsIterationEnabled);
	}
	else
	{
		artiSolveInternalConstraints1T<IterativeSolveData, false, true>(
			scDesc, dt, invDt, elapsedTime, velocityIteration,
			staticContactUniqueIds, staticJointUniqueIds, sharedDesc,
			biasCoefficient, doFriction, isExternalForcesEveryTgsIterationEnabled);
	}
}

// Kernel entry point for the internal-constraint solve using IterativeSolveDataTGS.
// The bias coefficient handed to the shared implementation is capped at 0.7; the runtime
// residual-reporting flag selects between two template instantiations.
extern "C" __global__
void artiSolveInternalConstraintsTGS1T(PxgArticulationCoreDesc* PX_RESTRICT scDesc, const PxReal dt,
	const PxReal invDt, const bool velocityIteration, const PxReal elapsedTime, const PxReal biasCoefficient,
	const PxU32* const PX_RESTRICT staticContactUniqueIds,
	const PxU32* const PX_RESTRICT staticJointUniqueIds,
	const PxgSolverSharedDesc<IterativeSolveDataTGS>* const PX_RESTRICT sharedDesc,
	bool doFriction, bool residualReportingEnabled,
	bool isExternalForceEveryStep)
{
	// This kernel also resets articulation reference counts to zero after all usage.

	// Never pass a bias coefficient larger than 0.7 to the solver.
	const PxReal cappedBias = PxMin(0.7f, biasCoefficient);

	if (!residualReportingEnabled)
	{
		artiSolveInternalConstraints1T<IterativeSolveDataTGS, true, false>(
			scDesc, dt, invDt, elapsedTime, velocityIteration,
			staticContactUniqueIds, staticJointUniqueIds, sharedDesc,
			cappedBias, doFriction, isExternalForceEveryStep);
	}
	else
	{
		artiSolveInternalConstraints1T<IterativeSolveDataTGS, true, true>(
			scDesc, dt, invDt, elapsedTime, velocityIteration,
			staticContactUniqueIds, staticJointUniqueIds, sharedDesc,
			cappedBias, doFriction, isExternalForceEveryStep);
	}
}


//each block has 16 warps, each warp has 32 threads, 32 blocks
//
// First pass of a two-pass summation: every thread block reduces the static and self contact/joint
// counts of the articulations it owns and writes eight per-block totals to the temp*BlockSum arrays
// (indexed by blockIdx.x).
//
// Two kinds of totals are produced for each of the four count arrays:
//  - "UniqueIndices": the plain sum of the counts.
//  - "Header": the sum over warps (and iterations) of the per-warp maximum count.
//    NOTE(review): presumably this is the number of batch headers, one batch covering up to a warp
//    of articulations - confirm against the consumer of these sums.
//
// staticContactCount / staticJointCount   per-articulation static contact / joint counts.
// selfContactCounts / selfJointCounts     per-articulation self contact / joint counts.
// nbArticulations                         number of valid entries in the four count arrays.
// temp*BlockSum                           outputs, one entry per thread block.
//
// The code is written for a grid of block_size (32) blocks and assumes blockDim.x equals
// PxgArticulationCoreKernelBlockDim::COMPUTE_STATIC_CONTACT_CONSTRAINT_COUNT: the work split is
// computed from that constant while work indices use blockDim.x.
static __device__ void artiSumInternalContactAndJointBatches1(const PxU32* const PX_RESTRICT staticContactCount, const PxU32* const PX_RESTRICT staticJointCount,
	const PxU32* const PX_RESTRICT selfContactCounts, const PxU32* const PX_RESTRICT selfJointCounts,
	const PxU32 nbArticulations, PxU32* tempStaticContactUniqueIndicesBlockSum, PxU32* tempStaticJointUniqueIndicesBlockSum,
	PxU32* tempStaticContactHeaderBlockSum, PxU32* tempStaticJointHeaderBlockSum,
	PxU32* tempSelfContactUniqueIndicesBlockSum, PxU32* tempSelfJointUniqueIndicesBlockSum,
	PxU32* tempSelfContactHeaderBlockSum, PxU32* tempSelfJointHeaderBlockSum)
{

	const PxU32 numThreadsPerBlock = PxgArticulationCoreKernelBlockDim::COMPUTE_STATIC_CONTACT_CONSTRAINT_COUNT;

	const PxU32 warpPerBlock = numThreadsPerBlock / WARP_SIZE;

	const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE - 1);
	const PxU32 warpIndex = threadIdx.x / WARP_SIZE;

	// Number of thread blocks the work is divided between.
	const PxU32 block_size = 32;

	// Number of block-sized chunks needed to cover all articulations...
	const PxU32 totalBlockRequired = (nbArticulations + (numThreadsPerBlock - 1)) / numThreadsPerBlock;

	// ...and how many of those chunks each of the block_size blocks has to process.
	const PxU32 numIterationPerBlock = (totalBlockRequired + (block_size - 1)) / block_size;


	// Per-warp partial results of the current iteration (static contacts/joints).
	__shared__ PxU32 shContactUniqueIndicesWarpSum[warpPerBlock];
	__shared__ PxU32 shJointUniqueIndicesWarpSum[warpPerBlock];
	__shared__ PxU32 shContactHeaderWarpSum[warpPerBlock];
	__shared__ PxU32 shJointHeaderWarpSum[warpPerBlock];

	// Block-wide running totals over all iterations (static contacts/joints).
	__shared__ PxU32 sContactUniqueIndicesAccum;
	__shared__ PxU32 sJointUniqueIndicesAccum;
	__shared__ PxU32 sContactHeaderAccum;
	__shared__ PxU32 sJointHeaderAccum;


	// Per-warp partial results of the current iteration (self contacts/joints).
	__shared__ PxU32 shSelfContactUniqueIndicesWarpSum[warpPerBlock];
	__shared__ PxU32 shSelfJointUniqueIndicesWarpSum[warpPerBlock];
	__shared__ PxU32 shSelfContactHeaderWarpSum[warpPerBlock];
	__shared__ PxU32 shSelfJointHeaderWarpSum[warpPerBlock];

	// Block-wide running totals over all iterations (self contacts/joints).
	__shared__ PxU32 sSelfContactUniqueIndicesAccum;
	__shared__ PxU32 sSelfJointUniqueIndicesAccum;
	__shared__ PxU32 sSelfContactHeaderAccum;
	__shared__ PxU32 sSelfJointHeaderAccum;


	// A single thread clears the running totals; the barrier below publishes them to the block.
	if (threadIdx.x == (WARP_SIZE - 1))
	{
		sContactUniqueIndicesAccum = 0;
		sJointUniqueIndicesAccum = 0;
		sContactHeaderAccum = 0;
		sJointHeaderAccum = 0;
		sSelfContactUniqueIndicesAccum = 0;
		sSelfJointUniqueIndicesAccum = 0;
		sSelfContactHeaderAccum = 0;
		sSelfJointHeaderAccum = 0;
	}

	__syncthreads();

	for (PxU32 i = 0; i < numIterationPerBlock; ++i)
	{
		// Each block owns a contiguous range of numIterationPerBlock * blockDim.x articulations and
		// walks it one blockDim.x-sized chunk per iteration.
		const PxU32 workIndex = i * blockDim.x + threadIdx.x + numIterationPerBlock * blockIdx.x * blockDim.x;

		// Threads past the end keep zero counts so they can still take part in the warp collectives.
		PxU32 contactCount = 0;
		PxU32 jointCount = 0;
		PxU32 selfContactCount = 0;
		PxU32 selfJointCount = 0;

		if (workIndex < nbArticulations)
		{
			contactCount = staticContactCount[workIndex];
			jointCount = staticJointCount[workIndex];

			selfContactCount = selfContactCounts[workIndex];
			selfJointCount = selfJointCounts[workIndex];
		}


		// Separate copies because each count is reduced twice: once with add, once with max.
		PxU32 maxContact = contactCount;
		PxU32 maxJoint = jointCount;
		PxU32 maxSelfContact = selfContactCount;
		PxU32 maxSelfJoint = selfJointCount;

		// Warp-level reductions; all 32 lanes participate (FULL_MASK).
		contactCount = warpReduction<AddOpPxU32, PxU32>(FULL_MASK, contactCount);
		jointCount = warpReduction<AddOpPxU32, PxU32>(FULL_MASK, jointCount);
		maxContact = warpReduction<MaxOpPxU32, PxU32>(FULL_MASK, maxContact);
		maxJoint = warpReduction<MaxOpPxU32, PxU32>(FULL_MASK, maxJoint);

		selfContactCount = warpReduction<AddOpPxU32, PxU32>(FULL_MASK, selfContactCount);
		selfJointCount = warpReduction<AddOpPxU32, PxU32>(FULL_MASK, selfJointCount);
		maxSelfContact = warpReduction<MaxOpPxU32, PxU32>(FULL_MASK, maxSelfContact);
		maxSelfJoint = warpReduction<MaxOpPxU32, PxU32>(FULL_MASK, maxSelfJoint);

		// The last lane of each warp publishes the warp's results.
		// NOTE(review): this relies on warpReduction leaving the final value in lane WARP_SIZE - 1 - confirm.
		if (threadIndexInWarp == (WARP_SIZE - 1))
		{
			shContactUniqueIndicesWarpSum[warpIndex] = contactCount;
			shJointUniqueIndicesWarpSum[warpIndex] = jointCount;
			shContactHeaderWarpSum[warpIndex] = maxContact;
			shJointHeaderWarpSum[warpIndex] = maxJoint;

			shSelfContactUniqueIndicesWarpSum[warpIndex] = selfContactCount;
			shSelfJointUniqueIndicesWarpSum[warpIndex] = selfJointCount;
			shSelfContactHeaderWarpSum[warpIndex] = maxSelfContact;
			shSelfJointHeaderWarpSum[warpIndex] = maxSelfJoint;
		}


		// Make the per-warp results visible to the threads that combine them.
		__syncthreads();

		// Participation mask for the second-level reduction: lanes whose index is below warpPerBlock.
		// The ballot itself is executed by every thread of the block.
		unsigned mask_idx = __ballot_sync(FULL_MASK, threadIndexInWarp < warpPerBlock);

		// Only the first warpPerBlock threads of the block combine the per-warp results.
		if (threadIdx.x < warpPerBlock)
		{
			PxU32 contactUniqueIndicesWarpSum = shContactUniqueIndicesWarpSum[threadIndexInWarp];
			PxU32 jointUniqueIndicesWarpSum = shJointUniqueIndicesWarpSum[threadIndexInWarp];
			PxU32 contactHeaderWarpSum = shContactHeaderWarpSum[threadIndexInWarp];
			PxU32 jointHeaderWarpSum = shJointHeaderWarpSum[threadIndexInWarp];

			PxU32 selfContactUniqueIndicesWarpSum = shSelfContactUniqueIndicesWarpSum[threadIndexInWarp];
			PxU32 selfJointUniqueIndicesWarpSum = shSelfJointUniqueIndicesWarpSum[threadIndexInWarp];
			PxU32 selfContactHeaderWarpSum = shSelfContactHeaderWarpSum[threadIndexInWarp];
			PxU32 selfJointHeaderWarpSum = shSelfJointHeaderWarpSum[threadIndexInWarp];

			// Note that the per-warp maxima ("Header" values) are added across warps, not max-reduced.
			contactUniqueIndicesWarpSum = warpReduction<AddOpPxU32, PxU32>(mask_idx, contactUniqueIndicesWarpSum);
			jointUniqueIndicesWarpSum = warpReduction<AddOpPxU32, PxU32>(mask_idx, jointUniqueIndicesWarpSum);
			contactHeaderWarpSum = warpReduction<AddOpPxU32, PxU32>(mask_idx, contactHeaderWarpSum);
			jointHeaderWarpSum = warpReduction<AddOpPxU32, PxU32>(mask_idx, jointHeaderWarpSum);

			selfContactUniqueIndicesWarpSum = warpReduction<AddOpPxU32, PxU32>(mask_idx, selfContactUniqueIndicesWarpSum);
			selfJointUniqueIndicesWarpSum = warpReduction<AddOpPxU32, PxU32>(mask_idx, selfJointUniqueIndicesWarpSum);
			selfContactHeaderWarpSum = warpReduction<AddOpPxU32, PxU32>(mask_idx, selfContactHeaderWarpSum);
			selfJointHeaderWarpSum = warpReduction<AddOpPxU32, PxU32>(mask_idx, selfJointHeaderWarpSum);


			// The last participating thread folds this iteration's totals into the running sums.
			if (threadIdx.x == (warpPerBlock - 1))
			{
				sContactUniqueIndicesAccum += contactUniqueIndicesWarpSum;
				sJointUniqueIndicesAccum += jointUniqueIndicesWarpSum;
				sContactHeaderAccum += contactHeaderWarpSum;
				sJointHeaderAccum += jointHeaderWarpSum;

				sSelfContactUniqueIndicesAccum += selfContactUniqueIndicesWarpSum;
				sSelfJointUniqueIndicesAccum += selfJointUniqueIndicesWarpSum;
				sSelfContactHeaderAccum += selfContactHeaderWarpSum;
				sSelfJointHeaderAccum += selfJointHeaderWarpSum;
			}
		}

		// Keep the next iteration from overwriting the per-warp arrays while they are still being read.
		__syncthreads();

	}

	// The thread that maintained the running sums writes this block's totals.
	if (threadIdx.x == (warpPerBlock - 1))
	{
		tempStaticContactUniqueIndicesBlockSum[blockIdx.x] = sContactUniqueIndicesAccum;
		tempStaticJointUniqueIndicesBlockSum[blockIdx.x] = sJointUniqueIndicesAccum;
		tempStaticContactHeaderBlockSum[blockIdx.x] = sContactHeaderAccum;
		tempStaticJointHeaderBlockSum[blockIdx.x] = sJointHeaderAccum;

		tempSelfContactUniqueIndicesBlockSum[blockIdx.x] = sSelfContactUniqueIndicesAccum;
		tempSelfJointUniqueIndicesBlockSum[blockIdx.x] = sSelfJointUniqueIndicesAccum;
		tempSelfContactHeaderBlockSum[blockIdx.x] = sSelfContactHeaderAccum;
		tempSelfJointHeaderBlockSum[blockIdx.x] = sSelfJointHeaderAccum;
	}
}


static __device__ void artiSumInternalContactAndJointBatches2(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	PxgSolverCoreDesc* PX_RESTRICT solverCoreDesc,
	const PxU32* const PX_RESTRICT staticContactCount,
	const PxU32* const PX_RESTRICT staticJointCount,
	PxgConstraintBatchHeader* PX_RESTRICT batchHeaders,
	const PxU32* const PX_RESTRICT contactStaticUniqueIds,
	const PxU32* const PX_RESTRICT jointStaticUniqueIndices,
	PartitionNodeData* const PX_RESTRICT pNodeData,
	const PxU32 nbArticulations,
	PxU32* tempStaticContactUniqueIndicesBlock,
	PxU32* tempStaticJointUniqueIndicesBlock,
	PxU32* tempStaticContactHeaderBlock,
	PxU32* tempStaticJointHeaderBlock,
	PxU32 articulationStaticContactBatchOffset,
	PxU32 articulationStaticJointBatchOffset,
	PxU32 articulationBatchOffset,
	PxU32 contactUniqueIndexOffset,
	PxU32 jointUniqueIndexOffset,
	PxU32* outContactUniqueIds,
	PxU32* outJointUniqueIndices,
	PxU32& outNumArtiStaticBatches,
	PxgConstraintPrepareDesc* constraintPrepDesc,
	const PxU32 numRigidBatches,
	const PxU32 numArtiContactBatches,
	const PxU32 numArtiJointBatches,
	const PxU32 numRigidContacts,
	const PxU32 numRigidJoints,
	const PxU32 numRigidStaticContacts,
	const PxU32 numRigidStaticJoints)
{

	const PxU32 numThreadsPerBlock = PxgArticulationCoreKernelBlockDim::COMPUTE_STATIC_CONTACT_CONSTRAINT_COUNT;

	const PxU32 warpPerBlock = numThreadsPerBlock / WARP_SIZE;

	const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE - 1);

	const PxU32 warpIndex = threadIdx.x / WARP_SIZE;

	const PxU32 block_size = 32; // 32 blocks

	__shared__ PxU32 shContactUniqueIndicesWarpSum[warpPerBlock];
	__shared__ PxU32 shJointUniqueIndicesWarpSum[warpPerBlock];
	__shared__ PxU32 shContactHeaderWarpSum[warpPerBlock];
	__shared__ PxU32 shJointHeaderWarpSum[warpPerBlock];

	__shared__ PxU32 sContactUniqueIndicesBlockHistogram[block_size];
	__shared__ PxU32 sJointUniqueIndicesBlockHistogram[block_size];
	__shared__ PxU32 sContactHeaderBlockHistogram[block_size];
	__shared__ PxU32 sJointHeaderBlockHistogram[block_size];

	__shared__ PxU32 sContactUniqueIndicesAccum;
	__shared__ PxU32 sJointUniqueIndicesAccum;
	__shared__ PxU32 sContactHeaderAccum;
	__shared__ PxU32 sJointHeaderAccum;

	PxU32* artiJointConstraintBatchIndices = constraintPrepDesc->artiJointConstraintBatchIndices;
	PxU32* artiContactConstraintBatchIndices = constraintPrepDesc->artiContactConstraintBatchIndices;


	if (threadIdx.x == (WARP_SIZE - 1))
	{
		sContactUniqueIndicesAccum = 0;
		sJointUniqueIndicesAccum = 0;
		sContactHeaderAccum = 0;
		sJointHeaderAccum = 0;
	}


	//accumulate num pairs per block and compute exclusive run sum
	//unsigned mask_idx = __ballot_sync(FULL_MASK, threadIndexInWarp < block_size);
	if (warpIndex == 0/* && threadIndexInWarp < block_size*/)
	{
		const PxU32 oriContactUniqueIndiceOffset = tempStaticContactUniqueIndicesBlock[threadIndexInWarp];
		const PxU32 oriJointUniqueIndiceOffset = tempStaticJointUniqueIndicesBlock[threadIndexInWarp];
		const PxU32 oriContactHeaderOffset = tempStaticContactHeaderBlock[threadIndexInWarp];
		const PxU32 oriJointHeaderOffset = tempStaticJointHeaderBlock[threadIndexInWarp];

		const PxU32 contactUniqueIndiceOffset = warpScan<AddOpPxU32, PxU32>(FULL_MASK, oriContactUniqueIndiceOffset);
		const PxU32 jointUniqueIndiceOffset = warpScan<AddOpPxU32, PxU32>(FULL_MASK, oriJointUniqueIndiceOffset) ;
		const PxU32 contactHeaderOffset = warpScan<AddOpPxU32, PxU32>(FULL_MASK, oriContactHeaderOffset);
		const PxU32 jointHeaderOffset = warpScan<AddOpPxU32, PxU32>(FULL_MASK, oriJointHeaderOffset);
		//store exclusive run sum
		sContactUniqueIndicesBlockHistogram[threadIndexInWarp] = contactUniqueIndiceOffset - oriContactUniqueIndiceOffset;
		sJointUniqueIndicesBlockHistogram[threadIndexInWarp] = jointUniqueIndiceOffset - oriJointUniqueIndiceOffset;
		sContactHeaderBlockHistogram[threadIndexInWarp] = contactHeaderOffset - oriContactHeaderOffset;
		sJointHeaderBlockHistogram[threadIndexInWarp] = jointHeaderOffset - oriJointHeaderOffset;

		if (blockIdx.x == 0 && threadIdx.x == (WARP_SIZE - 1))
		{
			//Output total number of articulation static blocks
			const PxU32 totalNumArtiStaticBatches = contactHeaderOffset + jointHeaderOffset;
			outNumArtiStaticBatches = totalNumArtiStaticBatches;

			constraintPrepDesc->numArtiStaticContactBatches = contactHeaderOffset;
			constraintPrepDesc->numArtiStatic1dConstraintBatches = jointHeaderOffset;

			PxgIslandContext& island = solverCoreDesc->islandContextPool[0];
			island.mStaticArtiBatchCount = totalNumArtiStaticBatches;
		}

	}

	__syncthreads();

	//We now have the exclusive runsum for this block. Next step is to recompute the local
	//offsets within the block and output data...

	const PxU32 totalBlockRequired = (nbArticulations + (blockDim.x - 1)) / blockDim.x;

	const PxU32 numIterationPerBlock = (totalBlockRequired + (block_size - 1)) / block_size;

	for (PxU32 a = 0; a < numIterationPerBlock; ++a)
	{
		const PxU32 workIndex = a * blockDim.x + threadIdx.x + numIterationPerBlock * blockIdx.x * blockDim.x;

		PxU32 contactCount = 0;
		PxU32 jointCount = 0;

		PxgArticulationBlockLinkData* data = scDesc->mArticulationLinkBlocks + scDesc->mMaxLinksPerArticulation * (workIndex / WARP_SIZE);

		if (workIndex < nbArticulations)
		{
			contactCount = staticContactCount[workIndex];
			jointCount = staticJointCount[workIndex];
		}

		//we need to use contactCount and jointCount later
		PxU32 sumContact = contactCount;
		PxU32 sumJoint = jointCount;
		PxU32 maxContact = contactCount;
		PxU32 maxJoint = jointCount;

		sumContact = warpReduction<AddOpPxU32, PxU32>(FULL_MASK, sumContact);
		sumJoint = warpReduction<AddOpPxU32, PxU32>(FULL_MASK, sumJoint);
		maxContact = warpReduction<MaxOpPxU32, PxU32>(FULL_MASK, maxContact);
		maxJoint = warpReduction<MaxOpPxU32, PxU32>(FULL_MASK, maxJoint);


		if (threadIndexInWarp == 31)
		{
			shContactUniqueIndicesWarpSum[warpIndex] = sumContact;
			shJointUniqueIndicesWarpSum[warpIndex] = sumJoint;
			shContactHeaderWarpSum[warpIndex] = maxContact;
			shJointHeaderWarpSum[warpIndex] = maxJoint;
		}

		__syncthreads();

		PxU32 contactWarpOffset = 0;
		PxU32 jointWarpOffset = 0;
		PxU32 contactBlockWarpOffset = 0;
		PxU32 jointBlockWarpOffset = 0;

		PxU32 contactUniqueIndicesWarpSum = 0;
		PxU32 jointUniqueIndicesWarpSum = 0;
		PxU32 contactHeaderWarpSum = 0;
		PxU32 jointHeaderWarpSum = 0;

		unsigned mask_idx = __ballot_sync(FULL_MASK, threadIndexInWarp < warpPerBlock);

		//warpPerBlock should be less than 32, each warp will do the runsum
		if (threadIndexInWarp < warpPerBlock)
		{

			const PxU32 oriContactUniqueIndicesWarpSum = shContactUniqueIndicesWarpSum[threadIndexInWarp];
			const PxU32 oriJointUniqueIndicesWarpSum = shJointUniqueIndicesWarpSum[threadIndexInWarp];
			const PxU32 oriContactHeaderWarpSum = shContactHeaderWarpSum[threadIndexInWarp];
			const PxU32 oriJointHeaderWarpSum = shJointHeaderWarpSum[threadIndexInWarp];

			contactUniqueIndicesWarpSum = warpScan<AddOpPxU32, PxU32>(mask_idx, oriContactUniqueIndicesWarpSum);
			jointUniqueIndicesWarpSum = warpScan<AddOpPxU32, PxU32>(mask_idx, oriJointUniqueIndicesWarpSum);
			contactHeaderWarpSum = warpScan<AddOpPxU32, PxU32>(mask_idx, oriContactHeaderWarpSum);
			jointHeaderWarpSum = warpScan<AddOpPxU32, PxU32>(mask_idx, oriJointHeaderWarpSum);


			//exclusive runsum
			contactWarpOffset = contactUniqueIndicesWarpSum - oriContactUniqueIndicesWarpSum;
			jointWarpOffset = jointUniqueIndicesWarpSum - oriJointUniqueIndicesWarpSum;
			contactBlockWarpOffset = contactHeaderWarpSum - oriContactHeaderWarpSum;
			jointBlockWarpOffset = jointHeaderWarpSum - oriJointHeaderWarpSum;


		}

		//make sure each thread in a warp has the correct warp offset
		contactWarpOffset = __shfl_sync(FULL_MASK, contactWarpOffset, warpIndex);
		jointWarpOffset = __shfl_sync(FULL_MASK, jointWarpOffset, warpIndex);
		contactBlockWarpOffset = __shfl_sync(FULL_MASK, contactBlockWarpOffset, warpIndex);
		jointBlockWarpOffset = __shfl_sync(FULL_MASK, jointBlockWarpOffset, warpIndex);

		//OK. We finally have enough information to figure out where to write the blocks related to this
		//articulation!

		//Where the contact uniqueIds should go. This is the start of where this warp should write.
		//The uids will be interleaved depending on the number of constraints in a contact block
		//contactUniqueIndexOffset : articulation static contact start offset
		//sContactUniqueIndicesAccum :: accumulation from the previous iterations
		PxU32 contactOffset = contactUniqueIndexOffset + contactWarpOffset + sContactUniqueIndicesBlockHistogram[blockIdx.x] + sContactUniqueIndicesAccum;
		//Where the joint unique Ids should go. See above for explanation
		PxU32 jointOffset = jointUniqueIndexOffset + jointWarpOffset + sJointUniqueIndicesBlockHistogram[blockIdx.x] + sJointUniqueIndicesAccum;
		//Where the blocks should go. Shared between all threads in a block
		PxU32 contactBlockOffset = contactBlockWarpOffset + sContactHeaderBlockHistogram[blockIdx.x] + sContactHeaderAccum ;
		PxU32 jointBlockOffset = jointBlockWarpOffset + sJointHeaderBlockHistogram[blockIdx.x] + sJointHeaderAccum;

		PxU32 blockOffset = contactBlockOffset + jointBlockOffset + articulationBatchOffset;


		//we have to sync in here so all the threads in a warp has finished reading sContactUniqueIndicesAccum, sJointUniqueIndicesAccum,
		//sContactHeaderAccum, sJointHeaderAccum before we overwrite those values for another iterations
		__syncthreads();

		if (threadIdx.x == (warpPerBlock - 1))
		{
			sContactUniqueIndicesAccum += contactUniqueIndicesWarpSum;
			sJointUniqueIndicesAccum += jointUniqueIndicesWarpSum;
			sContactHeaderAccum += contactHeaderWarpSum;
			sJointHeaderAccum += jointHeaderWarpSum;
		}

		__syncthreads();


		for (PxU32 i = 0; i < maxContact; ++i)
		{
			PxU32 mask = __ballot_sync(FULL_MASK, contactCount > i);

			const PxU32 stride = __popc(mask);
			const PxU32 offset = warpScanExclusive(mask, threadIndexInWarp);

			if (contactCount > i)
			{
				PxU32 contactUniqueId = contactStaticUniqueIds[workIndex + nbArticulations * i];
				outContactUniqueIds[contactOffset + offset] = contactUniqueId;

				const PartitionNodeData& nodeData = pNodeData[contactUniqueId];
				PxNodeIndex igNodeIndexA = nodeData.mNodeIndex0;
				PxNodeIndex igNodeIndexB = nodeData.mNodeIndex1;

				if (igNodeIndexA.isArticulation())
				{
					const PxU32 artiLinkID = igNodeIndexA.articulationLinkId();

					if (data[artiLinkID].mNbStaticContacts[threadIndexInWarp] == 0)
					{
						data[artiLinkID].mStaticContactStartIndex[threadIndexInWarp] = blockOffset;
					}
					data[artiLinkID].mNbStaticContacts[threadIndexInWarp]++;
				}

				if (igNodeIndexB.isArticulation())
				{
					const PxU32 artiLinkID = igNodeIndexB.articulationLinkId();


					if (data[artiLinkID].mNbStaticContacts[threadIndexInWarp] == 0)
					{
						data[artiLinkID].mStaticContactStartIndex[threadIndexInWarp] = blockOffset;
					}
					data[artiLinkID].mNbStaticContacts[threadIndexInWarp]++;
				}
			}

			if (threadIndexInWarp == 0)
			{
				const PxU32 batchIndex = contactBlockOffset + i + articulationStaticContactBatchOffset;
				PxgConstraintBatchHeader header;
				header.mDescStride = stride;
				header.constraintType = PxgSolverConstraintDesc::eARTICULATION_CONTACT;
				header.mConstraintBatchIndex = batchIndex;
				header.mStartPartitionIndex = contactOffset - numRigidContacts - numRigidStaticContacts;
				header.mask = mask;
				batchHeaders[blockOffset] = header;
				artiContactConstraintBatchIndices[contactBlockOffset + numArtiContactBatches + i] = blockOffset - numRigidBatches;
			}

			contactOffset += stride;
			blockOffset++;
		}

		for (PxU32 i = 0; i < maxJoint; ++i)
		{
			PxU32 mask = __ballot_sync(FULL_MASK, jointCount > i);

			const PxU32 stride = __popc(mask);
			const PxU32 offset = warpScanExclusive(mask, threadIndexInWarp);

			if (jointCount > i)
			{
				PxU32 jointUniqueId = jointStaticUniqueIndices[workIndex + nbArticulations * i];
				outJointUniqueIndices[jointOffset + offset] = jointUniqueId;

				const PartitionNodeData& nodeData = pNodeData[jointUniqueId];
				PxNodeIndex igNodeIndexA = nodeData.mNodeIndex0;
				PxNodeIndex igNodeIndexB = nodeData.mNodeIndex1;

				if (igNodeIndexA.isArticulation())
				{
					const PxU32 artiLinkID = igNodeIndexA.articulationLinkId();
					if (data[artiLinkID].mNbStaticJoints[threadIndexInWarp] == 0)
					{
						data[artiLinkID].mStaticJointStartIndex[threadIndexInWarp] = blockOffset;
					}
					data[artiLinkID].mNbStaticJoints[threadIndexInWarp]++;
				}

				if (igNodeIndexB.isArticulation())
				{
					const PxU32 artiLinkID = igNodeIndexB.articulationLinkId();
					if (data[artiLinkID].mNbStaticJoints[threadIndexInWarp] == 0)
					{
						data[artiLinkID].mStaticJointStartIndex[threadIndexInWarp] = blockOffset;
					}
					data[artiLinkID].mNbStaticJoints[threadIndexInWarp]++;
				}
			}

			if (threadIndexInWarp == 0)
			{
				PxgConstraintBatchHeader header;
				header.mDescStride = stride;
				const PxU32 batchIndex = jointBlockOffset + i + articulationStaticJointBatchOffset;
				header.constraintType = PxgSolverConstraintDesc::eARTICULATION_CONSTRAINT_1D;
				header.mConstraintBatchIndex = batchIndex;
				header.mStartPartitionIndex = jointOffset - numRigidJoints - numRigidStaticJoints;
				header.mask = mask;
				batchHeaders[blockOffset] = header;
				artiJointConstraintBatchIndices[jointBlockOffset + numArtiJointBatches + i] = blockOffset - numRigidBatches;
			}

			jointOffset += stride;
			blockOffset++;
		}
	}
}


// Kernel entry point that forwards to artiSumInternalContactAndJointBatches1.
//
// It passes, in this order:
//   - the per-articulation static contact / static constraint count arrays and the
//     per-articulation self contact / self constraint count arrays from prePrepDesc,
//   - nbArticulations,
//   - the four mTemp*Block buffers for static contacts/constraints from scDesc,
//   - the four mTempSelf*Block buffers for self contacts/constraints from scDesc.
//
// The mTempSelf*Block buffers are read back by artiSumSelfContactAndJointBatches, which
// treats them as one entry per thread block.
// NOTE(review): presumably this pass is what fills those per-block entries - the callee
// is not visible here, confirm against its definition.
//
// __launch_bounds__ caps the block size at COMPUTE_STATIC_CONTACT_CONSTRAINT_COUNT threads
// and requests at least one resident block per SM.
extern "C" __global__
__launch_bounds__(PxgArticulationCoreKernelBlockDim::COMPUTE_STATIC_CONTACT_CONSTRAINT_COUNT, 1)
void artiSumInternalContactAndJointBatches1Launch(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	PxgPrePrepDesc* prePrepDesc,
	const PxU32 nbArticulations)
{
	artiSumInternalContactAndJointBatches1(
		prePrepDesc->mArtiStaticContactCounts,
		prePrepDesc->mArtiStaticConstraintCounts,
		prePrepDesc->mArtiSelfContactCounts,
		prePrepDesc->mArtiSelfConstraintCounts,
		nbArticulations, scDesc->mTempContactUniqueIndicesBlock, scDesc->mTempConstraintUniqueIndicesBlock,
		scDesc->mTempContactHeaderBlock, scDesc->mTempConstraintHeaderBlock,
		scDesc->mTempSelfContactUniqueIndicesBlock, scDesc->mTempSelfConstraintUniqueIndicesBlock,
		scDesc->mTempSelfContactHeaderBlock, scDesc->mTempSelfConstraintHeaderBlock);
}


// Kernel entry point for the second pass over articulation static contacts and joints.
//
// Collects the start offsets that the static batches are written at and forwards
// everything to artiSumInternalContactAndJointBatches2:
//   - batch headers start after the rigid batches and the articulation batches,
//   - unique indices start after the rigid, rigid-static and articulation entries.
//
// Descriptor members that are not cached in a local below are deliberately passed
// as member expressions, exactly as the callee receives them.
extern "C" __global__
__launch_bounds__(PxgArticulationCoreKernelBlockDim::COMPUTE_STATIC_CONTACT_CONSTRAINT_COUNT, 1)
void artiSumInternalContactAndJointBatches2Launch(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	PxgSolverCoreDesc* PX_RESTRICT solverCoreDesc,
	PxgPrePrepDesc* PX_RESTRICT prePrepDesc,
	PxgConstraintPrepareDesc* PX_RESTRICT constraintPrepDesc,
	const PxU32 nbArticulations
	)
{
	// Totals for the rigid contacts / joints that precede the articulation entries.
	const PxU32 rigidContactTotal = prePrepDesc->numTotalContacts;
	const PxU32 rigidJointTotal = prePrepDesc->numTotalConstraints;

	// First batch header slot used by the static articulation batches.
	const PxU32 firstStaticBatchHeader = prePrepDesc->numBatches + prePrepDesc->numArtiBatches;

	// First unique-index slot used by the static articulation contacts / joints.
	const PxU32 firstContactUniqueIndex =
		rigidContactTotal + prePrepDesc->numTotalStaticContacts + prePrepDesc->numTotalArtiContacts;
	const PxU32 firstJointUniqueIndex =
		rigidJointTotal + prePrepDesc->numTotalStaticConstraints + prePrepDesc->numTotalArtiConstraints;

	// Bases added to the per-type batch index stored in each header.
	const PxU32 staticContactBatchBase = prePrepDesc->artiStaticContactBatchOffset;
	const PxU32 staticJointBatchBase = prePrepDesc->artiStaticConstraintBatchOffset;

	artiSumInternalContactAndJointBatches2(
		scDesc,
		solverCoreDesc,
		// per-articulation static contact / joint counts
		prePrepDesc->mArtiStaticContactCounts,
		prePrepDesc->mArtiStaticConstraintCounts,
		// destination for the batch headers
		prePrepDesc->mBatchHeaders,
		// static contact / joint unique ids, laid out with a stride of nbArticulations
		prePrepDesc->mArtiStaticContactIndices,
		prePrepDesc->mArtiStaticConstraintIndices,
		prePrepDesc->mPartitionNodeData,
		nbArticulations,
		// temp per-block buffers
		scDesc->mTempContactUniqueIndicesBlock,
		scDesc->mTempConstraintUniqueIndicesBlock,
		scDesc->mTempContactHeaderBlock,
		scDesc->mTempConstraintHeaderBlock,
		// start offsets computed above
		staticContactBatchBase,
		staticJointBatchBase,
		firstStaticBatchHeader,
		firstContactUniqueIndex,
		firstJointUniqueIndex,
		// destinations for the compacted unique ids
		prePrepDesc->mContactUniqueIndices,
		prePrepDesc->mConstraintUniqueIndices,
		prePrepDesc->numArtiStaticBatches,
		constraintPrepDesc,
		// counts used to rebase the indices written by the callee
		constraintPrepDesc->numBatches,
		constraintPrepDesc->numArtiContactBatches,
		constraintPrepDesc->numArti1dConstraintBatches,
		rigidContactTotal,
		rigidJointTotal,
		prePrepDesc->numTotalStaticContacts,
		prePrepDesc->numTotalStaticConstraints
	);
}

// Second stage of batching articulation self-contacts and self-joints.
//
// Input:
//   - scDesc->mTempSelf*Block: 32 entries each, read here as one total per thread block
//     (unique-index counts and batch-header counts, for contacts and for joints).
//     NOTE(review): presumably written by artiSumInternalContactAndJointBatches1Launch - confirm.
//   - prePrepDesc->mArtiSelfContactCounts / mArtiSelfConstraintCounts: per-articulation counts.
//   - prePrepDesc->mArtiSelfContactIndices / mArtiSelfConstraintIndices: unique ids, where the
//     i-th entry of articulation w is stored at [w + nbArticulations * i].
//
// Output:
//   - prePrepDesc->mContactUniqueIndices / mConstraintUniqueIndices: unique ids compacted per batch.
//   - prePrepDesc->mBatchHeaders: one PxgConstraintBatchHeader per batch; a batch holds the i-th
//     contact (or joint) of every articulation in a warp that has more than i of them.
//   - constraintPrepDesc->artiContactConstraintBatchIndices / artiJointConstraintBatchIndices.
//   - scDesc->mArticulationBlocks[warp].mTotalSelfConstraintCount / mSelfConstraintOffset.
//   - Block 0 also writes prePrepDesc->numArtiSelfBatches,
//     constraintPrepDesc->numArtiSelfContactBatches / numArtiSelf1dConstraintBatches and
//     islandContextPool[0].mSelfArtiBatchCount.
//
// Launch assumptions visible in the indexing below:
//   - The block histograms have block_size (32) entries and are indexed with blockIdx.x,
//     so gridDim.x must not exceed 32.
//   - The per-warp shared arrays are sized from COMPUTE_STATIC_CONTACT_CONSTRAINT_COUNT, so
//     blockDim.x must not exceed that constant - TODO confirm the launch uses exactly that size.
extern "C" __global__
__launch_bounds__(PxgArticulationCoreKernelBlockDim::COMPUTE_STATIC_CONTACT_CONSTRAINT_COUNT, 1)
void artiSumSelfContactAndJointBatches(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	PxgSolverCoreDesc* PX_RESTRICT solverCoreDesc,
	PxgPrePrepDesc* PX_RESTRICT prePrepDesc,
	PxgConstraintPrepareDesc* PX_RESTRICT constraintPrepDesc,
	const PxU32 nbArticulations
)
{
	//The first stage of this ran with the static contact code.
	//We output self-contacts *after* static contacts in the batches.
	//We don't need to sum up the number of batches per link. We can instead
	//process just in a single iteration, making this code simpler.

	PxU32* tempSelfContactUniqueIndicesBlock = scDesc->mTempSelfContactUniqueIndicesBlock;
	PxU32* tempSelfJointUniqueIndicesBlock = scDesc->mTempSelfConstraintUniqueIndicesBlock;
	PxU32* tempSelfContactHeaderBlock = scDesc->mTempSelfContactHeaderBlock;
	PxU32* tempSelfJointHeaderBlock = scDesc->mTempSelfConstraintHeaderBlock;

	const PxU32 numRigidContacts = prePrepDesc->numTotalContacts;
	const PxU32 numRigidJoints = prePrepDesc->numTotalConstraints;

	const PxU32 numRigidStaticContacts = prePrepDesc->numTotalStaticContacts;
	const PxU32 numRigidStaticJoints = prePrepDesc->numTotalStaticConstraints;

	//Self batches/unique ids are appended after the rigid, articulation and articulation-static ones,
	//so the start offsets below add up all of those totals.
	const PxU32 articulationBatchOffset = prePrepDesc->numBatches + prePrepDesc->numArtiBatches + prePrepDesc->numArtiStaticBatches;
	const PxU32 contactUniqueIndexOffset = numRigidContacts + prePrepDesc->numTotalStaticContacts + prePrepDesc->numTotalArtiContacts + prePrepDesc->numTotalStaticArtiContacts;
	const PxU32 jointUniqueIndexOffset = numRigidJoints + prePrepDesc->numTotalStaticConstraints + prePrepDesc->numTotalArtiConstraints + prePrepDesc->numTotalStaticArtiConstraints;


	const PxU32 numThreadsPerBlock = PxgArticulationCoreKernelBlockDim::COMPUTE_STATIC_CONTACT_CONSTRAINT_COUNT;

	const PxU32 warpPerBlock = numThreadsPerBlock / WARP_SIZE;

	const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE - 1);

	const PxU32 warpIndex = threadIdx.x / WARP_SIZE;

	const PxU32 block_size = 32; // number of per-block entries in the temp buffers (i.e. 32 thread blocks)

	//Per-iteration scratch: one slot per warp of this block
	__shared__ PxU32 shContactUniqueIndicesWarpSum[warpPerBlock];
	__shared__ PxU32 shJointUniqueIndicesWarpSum[warpPerBlock];
	__shared__ PxU32 shContactHeaderWarpSum[warpPerBlock];
	__shared__ PxU32 shJointHeaderWarpSum[warpPerBlock];

	//Exclusive prefix sums over the per-block totals; entry [blockIdx.x] is this block's start offset
	__shared__ PxU32 sContactUniqueIndicesBlockHistogram[block_size];
	__shared__ PxU32 sJointUniqueIndicesBlockHistogram[block_size];
	__shared__ PxU32 sContactHeaderBlockHistogram[block_size];
	__shared__ PxU32 sJointHeaderBlockHistogram[block_size];

	//Running totals of what this block has already emitted in earlier iterations of the loop below
	__shared__ PxU32 sContactUniqueIndicesAccum;
	__shared__ PxU32 sJointUniqueIndicesAccum;
	__shared__ PxU32 sContactHeaderAccum;
	__shared__ PxU32 sJointHeaderAccum;

	PxU32* artiJointConstraintBatchIndices = constraintPrepDesc->artiJointConstraintBatchIndices;
	PxU32* artiContactConstraintBatchIndices = constraintPrepDesc->artiContactConstraintBatchIndices;


	//A single thread clears the accumulators; the __syncthreads() below publishes the zeros
	if (threadIdx.x == (WARP_SIZE - 1))
	{
		sContactUniqueIndicesAccum = 0;
		sJointUniqueIndicesAccum = 0;
		sContactHeaderAccum = 0;
		sJointHeaderAccum = 0;
	}


	//accumulate num pairs per block and compute exclusive run sum
	//unsigned mask_idx = __ballot_sync(FULL_MASK, threadIndexInWarp < block_size);
	//Only warp 0 does this; each of its 32 lanes handles the entry of one thread block.
	if (warpIndex == 0/* && threadIndexInWarp < block_size*/)
	{
		const PxU32 oriContactUniqueIndiceOffset = tempSelfContactUniqueIndicesBlock[threadIndexInWarp];
		const PxU32 oriJointUniqueIndiceOffset = tempSelfJointUniqueIndicesBlock[threadIndexInWarp];
		const PxU32 oriContactHeaderOffset = tempSelfContactHeaderBlock[threadIndexInWarp];
		const PxU32 oriJointHeaderOffset = tempSelfJointHeaderBlock[threadIndexInWarp];

		//warpScan is used as an inclusive scan here: subtracting the lane's own input below
		//turns the result into the exclusive run sum
		const PxU32 contactUniqueIndiceOffset = warpScan<AddOpPxU32, PxU32>(FULL_MASK, oriContactUniqueIndiceOffset);
		const PxU32 jointUniqueIndiceOffset = warpScan<AddOpPxU32, PxU32>(FULL_MASK, oriJointUniqueIndiceOffset);
		const PxU32 contactHeaderOffset = warpScan<AddOpPxU32, PxU32>(FULL_MASK, oriContactHeaderOffset);
		const PxU32 jointHeaderOffset = warpScan<AddOpPxU32, PxU32>(FULL_MASK, oriJointHeaderOffset);
		//store exclusive run sum
		sContactUniqueIndicesBlockHistogram[threadIndexInWarp] = contactUniqueIndiceOffset - oriContactUniqueIndiceOffset;
		sJointUniqueIndicesBlockHistogram[threadIndexInWarp] = jointUniqueIndiceOffset - oriJointUniqueIndiceOffset;
		sContactHeaderBlockHistogram[threadIndexInWarp] = contactHeaderOffset - oriContactHeaderOffset;
		sJointHeaderBlockHistogram[threadIndexInWarp] = jointHeaderOffset - oriJointHeaderOffset;

		//The last lane's inclusive value is the sum over all 32 block entries; block 0 publishes it
		if (blockIdx.x == 0 && threadIdx.x == (WARP_SIZE - 1))
		{
			//Output total number of articulation self batches (contact batches + joint batches)
			const PxU32 totalNumArtiSelfBatches = contactHeaderOffset + jointHeaderOffset;
			prePrepDesc->numArtiSelfBatches = totalNumArtiSelfBatches;

			constraintPrepDesc->numArtiSelfContactBatches = contactHeaderOffset;
			constraintPrepDesc->numArtiSelf1dConstraintBatches = jointHeaderOffset;

			PxgIslandContext& island = solverCoreDesc->islandContextPool[0];
			island.mSelfArtiBatchCount = totalNumArtiSelfBatches;
		}

	}

	__syncthreads();

	//We now have the exclusive runsum for this block. Next step is to recompute the local
	//offsets within the block and output data...

	//Number of blockDim.x-sized chunks needed to cover all articulations (rounded up)...
	const PxU32 totalBlockRequired = (nbArticulations + (blockDim.x - 1)) / blockDim.x;

	//...spread over block_size thread blocks (rounded up): chunks each thread block must process
	const PxU32 numIterationPerBlock = (totalBlockRequired + (block_size - 1)) / block_size;

	//Self batch indices continue after the articulation-static contact/joint batches
	const PxU32 numArtiStaticContacts = constraintPrepDesc->numArtiStaticContactBatches;
	const PxU32 numArtiStaticJoints = constraintPrepDesc->numArtiStatic1dConstraintBatches;
	const PxU32 artiSelfContactBatchOffset = prePrepDesc->artiStaticContactBatchOffset + numArtiStaticContacts;
	const PxU32 artiSelfJointBatchOffset = prePrepDesc->artiStaticConstraintBatchOffset + numArtiStaticJoints;

	PxU32* selfContactCount = prePrepDesc->mArtiSelfContactCounts;
	PxU32* selfJointCount = prePrepDesc->mArtiSelfConstraintCounts;

	//The trip count is identical for every thread of the block, so the __syncthreads() calls
	//inside the loop are reached by all threads.
	for (PxU32 a = 0; a < numIterationPerBlock; ++a)
	{
		//Each thread block owns a contiguous range of numIterationPerBlock * blockDim.x articulations
		//and walks it one blockDim.x-sized chunk per iteration.
		const PxU32 workIndex = a * blockDim.x + threadIdx.x + numIterationPerBlock * blockIdx.x * blockDim.x;

		//Threads past the end keep zero counts but still take part in every warp/block collective below
		PxU32 contactCount = 0;
		PxU32 jointCount = 0;
		if (workIndex < nbArticulations)
		{
			contactCount = selfContactCount[workIndex];
			jointCount = selfJointCount[workIndex];
		}

		//we need to use contactCount and jointCount later
		PxU32 sumContact = contactCount;
		PxU32 sumJoint = jointCount;
		PxU32 maxContact = contactCount;
		PxU32 maxJoint = jointCount;

		//Per-warp totals (number of unique ids) and per-warp maxima (number of batches the warp emits).
		//NOTE(review): maxContact/maxJoint are used below as loop bounds around full-mask ballots and are
		//read by both lane 0 and lane 31, so warpReduction is assumed to return the result in every lane - confirm.
		sumContact = warpReduction<AddOpPxU32, PxU32>(FULL_MASK, sumContact);
		sumJoint = warpReduction<AddOpPxU32, PxU32>(FULL_MASK, sumJoint);
		maxContact = warpReduction<MaxOpPxU32, PxU32>(FULL_MASK, maxContact);
		maxJoint = warpReduction<MaxOpPxU32, PxU32>(FULL_MASK, maxJoint);


		//One lane per warp publishes the warp's results to shared memory
		if (threadIndexInWarp == 31)
		{
			shContactUniqueIndicesWarpSum[warpIndex] = sumContact;
			shJointUniqueIndicesWarpSum[warpIndex] = sumJoint;
			shContactHeaderWarpSum[warpIndex] = maxContact;
			shJointHeaderWarpSum[warpIndex] = maxJoint;
		}

		__syncthreads();

		PxU32 contactWarpOffset = 0;
		PxU32 jointWarpOffset = 0;
		PxU32 contactBlockWarpOffset = 0;
		PxU32 jointBlockWarpOffset = 0;

		PxU32 contactUniqueIndicesWarpSum = 0;
		PxU32 jointUniqueIndicesWarpSum = 0;
		PxU32 contactHeaderWarpSum = 0;
		PxU32 jointHeaderWarpSum = 0;

		//warpPerBlock must not exceed 32 (one lane per warp). Every warp redundantly scans the
		//per-warp values, with lane k handling the entry of warp k.

		PxU32 mask_idx = __ballot_sync(FULL_MASK, threadIndexInWarp < warpPerBlock);
		if (threadIndexInWarp < warpPerBlock)
		{

			const PxU32 oriContactUniqueIndicesWarpSum = shContactUniqueIndicesWarpSum[threadIndexInWarp];
			const PxU32 oriJointUniqueIndicesWarpSum = shJointUniqueIndicesWarpSum[threadIndexInWarp];
			const PxU32 oriContactHeaderWarpSum = shContactHeaderWarpSum[threadIndexInWarp];
			const PxU32 oriJointHeaderWarpSum = shJointHeaderWarpSum[threadIndexInWarp];

			//Only the lanes in mask_idx take part in these scans
			contactUniqueIndicesWarpSum = warpScan<AddOpPxU32, PxU32>(mask_idx, oriContactUniqueIndicesWarpSum);
			jointUniqueIndicesWarpSum = warpScan<AddOpPxU32, PxU32>(mask_idx, oriJointUniqueIndicesWarpSum);
			contactHeaderWarpSum = warpScan<AddOpPxU32, PxU32>(mask_idx, oriContactHeaderWarpSum);
			jointHeaderWarpSum = warpScan<AddOpPxU32, PxU32>(mask_idx, oriJointHeaderWarpSum);


			//exclusive runsum
			contactWarpOffset = contactUniqueIndicesWarpSum - oriContactUniqueIndicesWarpSum;
			jointWarpOffset = jointUniqueIndicesWarpSum - oriJointUniqueIndicesWarpSum;
			contactBlockWarpOffset = contactHeaderWarpSum - oriContactHeaderWarpSum;
			jointBlockWarpOffset = jointHeaderWarpSum - oriJointHeaderWarpSum;


		}

		//make sure each thread in a warp has the correct warp offset:
		//lane warpIndex holds the exclusive offset that belongs to this warp, so broadcast from it
		contactWarpOffset = __shfl_sync(FULL_MASK, contactWarpOffset, warpIndex);
		jointWarpOffset = __shfl_sync(FULL_MASK, jointWarpOffset, warpIndex);
		contactBlockWarpOffset = __shfl_sync(FULL_MASK, contactBlockWarpOffset, warpIndex);
		jointBlockWarpOffset = __shfl_sync(FULL_MASK, jointBlockWarpOffset, warpIndex);


		//Where the contact uniqueIds should go. This is the start of where this warp should write.
		//The uids will be interleaved depending on the number of constraints in a contact block
		//contactUniqueIndexOffset : start offset of the articulation self contacts (after all rigid,
		//                           rigid-static, articulation and articulation-static contacts)
		//sContactUniqueIndicesBlockHistogram[blockIdx.x] : start offset of this thread block
		//sContactUniqueIndicesAccum : accumulation from the previous iterations
		PxU32 contactOffset = contactUniqueIndexOffset + contactWarpOffset + sContactUniqueIndicesBlockHistogram[blockIdx.x] + sContactUniqueIndicesAccum;
		//Where the joint unique Ids should go. See above for explanation
		PxU32 jointOffset = jointUniqueIndexOffset + jointWarpOffset + sJointUniqueIndicesBlockHistogram[blockIdx.x] + sJointUniqueIndicesAccum;
		//Where the blocks should go. Shared between all threads in a block
		PxU32 contactBlockOffset = contactBlockWarpOffset + sContactHeaderBlockHistogram[blockIdx.x] + sContactHeaderAccum;
		PxU32 jointBlockOffset = jointBlockWarpOffset + sJointHeaderBlockHistogram[blockIdx.x] + sJointHeaderAccum;

		//Index of this warp's first header in batchHeaders; contact batches and joint batches
		//share that array, so both running offsets are added
		PxU32 blockOffset = contactBlockOffset + jointBlockOffset + articulationBatchOffset;

		PxgConstraintBatchHeader* batchHeaders = prePrepDesc->mBatchHeaders;

		const PxU32 numRigidBatches = constraintPrepDesc->numBatches;
		const PxU32 numArtiContactBatches = constraintPrepDesc->numArtiContactBatches;
		const PxU32 numArtiJointBatches = constraintPrepDesc->numArti1dConstraintBatches;


		//we have to sync in here so all the threads in a warp has finished reading sContactUniqueIndicesAccum, sJointUniqueIndicesAccum,
		//sContactHeaderAccum, sJointHeaderAccum before we overwrite those values for another iterations
		__syncthreads();

		//This thread is lane (warpPerBlock - 1) of warp 0: its inclusive scan values are the totals
		//over all warps of the block for this iteration
		if (threadIdx.x == (warpPerBlock - 1))
		{
			sContactUniqueIndicesAccum += contactUniqueIndicesWarpSum;
			sJointUniqueIndicesAccum += jointUniqueIndicesWarpSum;
			sContactHeaderAccum += contactHeaderWarpSum;
			sJointHeaderAccum += jointHeaderWarpSum;
		}

		__syncthreads();

		PxU32* contactSelfUniqueIds = prePrepDesc->mArtiSelfContactIndices;
		PxU32* jointSelfUniqueIds = prePrepDesc->mArtiSelfConstraintIndices;

		PxU32* outContactUniqueIds = prePrepDesc->mContactUniqueIndices;
		PxU32* outJointUniqueIds = prePrepDesc->mConstraintUniqueIndices;

		//One PxgArticulationBlockData per group of WARP_SIZE articulations. The reference is only
		//dereferenced under the workIndex < nbArticulations check below.
		PxgArticulationBlockData& data = scDesc->mArticulationBlocks[workIndex / WARP_SIZE];

		//Lane 0 records how many self batches this warp emits and where the first one lives
		if ((workIndex < nbArticulations) && (threadIndexInWarp == 0))
		{
			data.mTotalSelfConstraintCount = maxContact + maxJoint;
			data.mSelfConstraintOffset = blockOffset;
		}


		//Now we loop, outputting the self contacts/joints to the header buffer...
		//Batch i gathers the i-th self contact of every articulation in the warp that has one.
		for (PxU32 i = 0; i < maxContact; ++i)
		{
			//Lanes whose articulation still has an i-th contact
			PxU32 mask = __ballot_sync(FULL_MASK, contactCount > i);

			const PxU32 stride = __popc(mask);
			//NOTE(review): used as this lane's slot among the lanes set in mask - confirm warpScanExclusive semantics
			const PxU32 offset = warpScanExclusive(mask, threadIndexInWarp);

			if (contactCount > i)
			{
				//Source layout: i-th contact of articulation workIndex, stride nbArticulations
				PxU32 contactUniqueId = contactSelfUniqueIds[workIndex + nbArticulations * i];
				outContactUniqueIds[contactOffset + offset] = contactUniqueId;
			}

			//Lane 0 writes the header for the whole warp
			if (threadIndexInWarp == 0)
			{
				const PxU32 batchIndex = contactBlockOffset + i + artiSelfContactBatchOffset;
				PxgConstraintBatchHeader header;
				header.mDescStride = stride;
				header.constraintType = PxgSolverConstraintDesc::eARTICULATION_CONTACT;
				header.mConstraintBatchIndex = batchIndex;
				//Start index is stored relative to the end of the rigid and rigid-static contacts
				header.mStartPartitionIndex = contactOffset - numRigidContacts - numRigidStaticContacts;
				header.mask = mask;
				batchHeaders[blockOffset] = header;
				//Slot comes after the articulation and articulation-static contact batches;
				//the stored header index is relative to the end of the rigid batches
				artiContactConstraintBatchIndices[contactBlockOffset + numArtiContactBatches + i + numArtiStaticContacts] = blockOffset - numRigidBatches;
			}

			contactOffset += stride;
			blockOffset++;
		}

		//Same scheme for the self joints; their headers follow this warp's contact headers
		for (PxU32 i = 0; i < maxJoint; ++i)
		{
			PxU32 mask = __ballot_sync(FULL_MASK, jointCount > i);

			const PxU32 stride = __popc(mask);
			const PxU32 offset = warpScanExclusive(mask, threadIndexInWarp);

			if (jointCount > i)
			{
				PxU32 jointUniqueId = jointSelfUniqueIds[workIndex + nbArticulations * i];
				outJointUniqueIds[jointOffset + offset] = jointUniqueId;
			}

			if (threadIndexInWarp == 0)
			{
				PxgConstraintBatchHeader header;
				header.mDescStride = stride;
				const PxU32 batchIndex = jointBlockOffset + i + artiSelfJointBatchOffset;
				header.constraintType = PxgSolverConstraintDesc::eARTICULATION_CONSTRAINT_1D;
				header.mConstraintBatchIndex = batchIndex;
				//Start index is stored relative to the end of the rigid and rigid-static joints
				header.mStartPartitionIndex = jointOffset - numRigidJoints - numRigidStaticJoints;
				header.mask = mask;
				batchHeaders[blockOffset] = header;
				artiJointConstraintBatchIndices[jointBlockOffset + numArtiJointBatches + i + numArtiStaticJoints] = blockOffset - numRigidBatches;
			}

			jointOffset += stride;
			blockOffset++;
		}
	}

}

// Solves one internal constraint batch using the IterativeSolveData buffers.
//
// Dispatches on batch.constraintType: contact batches go to solveExtContactsBlock,
// everything else is expected to be a 1D constraint batch and goes to solveExt1DBlock.
// link0, link1, elapsedTime and minPen are not read by this overload; the parameter list
// matches the IterativeSolveDataTGS overload.
static __device__ void solveConstraints(PxgArticulationCoreDesc* PX_RESTRICT scDesc, const IterativeSolveData& msIterativeData,
	const PxgBlockConstraintBatch& batch, Cm::UnAlignedSpatialVector& vel0, Cm::UnAlignedSpatialVector& vel1,
	const PxgArticulationBlockLinkData& link0, const PxgArticulationBlockLinkData& link1, Cm::UnAlignedSpatialVector& impulse0,
	Cm::UnAlignedSpatialVector& impulse1, const bool doFriction, const PxReal elapsedTime, const PxReal minPen, const PxU32 offset, PxgErrorAccumulator* errorAccumulator)
{
	// For internal constraints, mass-splitting is not used; thus, reference counts are 1 (default).
	const bool isContactBatch = (batch.constraintType == PxgSolverConstraintDesc::eARTICULATION_CONTACT);

	if (!isContactBatch)
	{
		// The only other type expected here is a 1D constraint batch.
		assert(batch.constraintType == PxgSolverConstraintDesc::eARTICULATION_CONSTRAINT_1D);

		// Last argument of solveExt1DBlock: true when the contact error accumulator's counter is non-negative.
		const bool accumulatorCounterNonNegative = scDesc->mContactErrorAccumulator.mCounter >= 0;

		solveExt1DBlock(
			batch, vel0, vel1, offset,
			msIterativeData.blockJointConstraintHeaders,
			msIterativeData.blockJointConstraintRowsCon,
			msIterativeData.blockJointConstraintRowsMod,
			&msIterativeData.artiResponse[batch.mArticulationResponseIndex],
			impulse0, impulse1,
			accumulatorCounterNonNegative);
		return;
	}

	solveExtContactsBlock(
		batch, vel0, vel1, doFriction,
		msIterativeData.blockContactHeaders,
		msIterativeData.blockFrictionHeaders,
		msIterativeData.blockContactPoints,
		msIterativeData.blockFrictions,
		msIterativeData.artiResponse,
		impulse0, impulse1,
		offset, errorAccumulator);
}

// Solves one internal constraint batch using the IterativeSolveDataTGS buffers.
//
// Loads the per-link delta motion for the calling thread (indexed by threadIdx.x), then
// dispatches on batch.constraintType: contact batches go to solveExtContactBlockTGS, any
// other batch goes to solveExt1DBlockTGS, which additionally needs the per-link delta
// rotations. doFriction is not read by this overload.
static __device__ void solveConstraints(PxgArticulationCoreDesc* PX_RESTRICT scDesc, const IterativeSolveDataTGS& msIterativeData,
	const PxgBlockConstraintBatch& batch, Cm::UnAlignedSpatialVector& vel0, Cm::UnAlignedSpatialVector& vel1,
	const PxgArticulationBlockLinkData& link0, const PxgArticulationBlockLinkData& link1, Cm::UnAlignedSpatialVector& impulse0,
	Cm::UnAlignedSpatialVector& impulse1, const bool doFriction, const PxReal elapsedTime, const PxReal minPen, const PxU32 offset, PxgErrorAccumulator* errorAccumulator)
{
	// Both solver paths consume the delta motion of the two links.
	const Cm::UnAlignedSpatialVector delta0 = loadSpatialVector(link0.mDeltaMotion, threadIdx.x);
	const Cm::UnAlignedSpatialVector delta1 = loadSpatialVector(link1.mDeltaMotion, threadIdx.x);

	// For internal constraints, mass-splitting is not used; thus, reference counts are 1 (default).
	const bool isContactBatch = (batch.constraintType == PxgSolverConstraintDesc::eARTICULATION_CONTACT);

	if (!isContactBatch)
	{
		// Delta rotations are only loaded on the 1D constraint path.
		const PxQuat deltaQ0 = loadQuat(link0.mDeltaQ, threadIdx.x);
		const PxQuat deltaQ1 = loadQuat(link1.mDeltaQ, threadIdx.x);

		// Last argument of solveExt1DBlockTGS: true when the contact error accumulator's counter is non-negative.
		const bool accumulatorCounterNonNegative = scDesc->mContactErrorAccumulator.mCounter >= 0;

		solveExt1DBlockTGS(
			batch, vel0, vel1, delta0, delta1, offset,
			msIterativeData.blockJointConstraintHeaders,
			msIterativeData.blockJointConstraintRowsCon,
			msIterativeData.artiResponse,
			deltaQ0, deltaQ1, elapsedTime,
			impulse0, impulse1,
			accumulatorCounterNonNegative);
		return;
	}

	solveExtContactBlockTGS(
		batch, vel0, vel1, delta0, delta1, offset,
		msIterativeData.blockContactHeaders,
		msIterativeData.blockFrictionHeaders,
		msIterativeData.blockContactPoints,
		msIterativeData.blockFrictions,
		msIterativeData.artiResponse,
		elapsedTime, minPen,
		impulse0, impulse1,
		errorAccumulator);
}


// Helper for the self-constraint solve: pushes a spatial velocity change through every link whose
// bit is set in a per-lane link bit field.
//
// Bits are visited from the lowest link index to the highest (articulationLowestSetBit, then the
// lowest bit is cleared), i.e. in the opposite order of selfConstraintPropagateImpulseUp.
//
// pathBitField      wordSize 64-bit words per lane; bit b of word w flags link (w * 64 + b).
// deltaV            velocity change entering the first visited link.
// rootMayBePresent  true:  link 0 (root) may be flagged and is skipped because it has no parent and dofs.
//                   false: link 0 must not be flagged (asserted); every flagged link is processed.
// lane              index used to address the per-lane block data.
//
// Returns the velocity change after the last visited link.
static __device__ Cm::UnAlignedSpatialVector selfConstraintPropagateDeltaVDown(
	const PxgArticulationBitFieldStackData* pathBitField, const PxU32 wordSize,
	const PxgArticulationBlockLinkData* artiLinks, PxgArticulationBlockDofData* artiDofs,
	Cm::UnAlignedSpatialVector deltaV, const bool rootMayBePresent, const PxU32 lane)
{
	for (PxU32 w = 0, wordOffset = 0; w < wordSize; ++w, wordOffset += 64)
	{
		PxU64 word = pathBitField[w].bitField[lane];
		while (word != 0)
		{
			const PxU32 index = articulationLowestSetBit(word) + wordOffset;

			//root should be lowered when the common path was removed from this bit field.
			assert(rootMayBePresent || index != 0);

			if (index != 0 || !rootMayBePresent) // need to skip root because it has no parent and dofs.
			{
				const PxgArticulationBlockLinkData& link = artiLinks[index];

				deltaV = propagateAccelerationWNoJVelUpdate(link, &artiDofs[link.mJointOffset[lane]], link.mDofs[lane],
					deltaV, lane);
			}

			//clear the lowest bit
			word &= (word - 1);
		}
	}

	return deltaV;
}

// Helper for the self-constraint solve: pushes a spatial impulse through every link whose bit is
// set in a per-lane link bit field.
//
// Bits are visited from the highest link index to the lowest (articulationHighestSetBit, then that
// bit is cleared); each visited link is handled by propagateImpulseW_0 with the link's mRw vector.
//
// rootMayBePresent  true:  traversal stops when link 0 (root) is reached; the root is part of the
//                          path but has no dofs.
//                   false: link 0 must not be flagged (asserted).
//
// Returns the impulse after the last visited link.
static __device__ Cm::UnAlignedSpatialVector selfConstraintPropagateImpulseUp(
	const PxgArticulationBitFieldStackData* pathBitField, const PxU32 wordSize,
	const PxgArticulationBlockLinkData* artiLinks, PxgArticulationBlockDofData* artiDofs,
	Cm::UnAlignedSpatialVector impulse, const bool rootMayBePresent, const PxU32 lane)
{
	for (PxU32 w = wordSize; w-- > 0;)
	{
		const PxU32 wordOffset = w * 64;

		PxU64 word = pathBitField[w].bitField[lane];
		while (word != 0)
		{
			const PxU32 bit = articulationHighestSetBit(word);
			const PxU32 linkIndex = bit + wordOffset;

			// Link 0 is bit 0 of word 0, which is the last bit this traversal can reach,
			// so nothing is left to visit once it shows up.
			if (rootMayBePresent && linkIndex == 0)
				return impulse;

			assert(linkIndex != 0); //root should be lowered when the common path was removed.

			//clear the highest bit
			word &= (~(1ull << bit));

			const PxgArticulationBlockLinkData& link = artiLinks[linkIndex];

			PxgArticulationBlockDofData* PX_RESTRICT dofData = &artiDofs[link.mJointOffset[lane]];

			const float rwx = link.mRw_x[lane];
			const float rwy = link.mRw_y[lane];
			const float rwz = link.mRw_z[lane];

			const PxU32 dofCount = link.mDofs[lane];

			impulse = propagateImpulseW_0(PxVec3(rwx, rwy, rwz), dofData, impulse, dofCount, lane);
		}
	}

	return impulse;
}

// Shared implementation of the "propagate rigid impulses and solve self-constraints" kernels.
//
// Work mapping: blockIdx.x selects the articulation block, threadIdx.x selects the lane inside that
// block's per-lane arrays, and threads with (blockIdx.x * blockDim.x + threadIdx.x) >= nbArticulations
// do nothing.
//
// Steps performed by each active thread:
//  1. If the articulation's dirty state has eHAS_IMPULSES set, count the active slabs and call
//     averageLinkImpulsesAndPropagate with scale = 1 / count.
//  2. Reset the slab reference count of the articulation and set its dirty state to eVEL_DIRTY.
//  3. If the block has self-constraints, loop over its constraint batches. For every batch whose mask
//     contains this lane:
//       a. derive the velocity change at the root from mRootDeferredZ and the root inverse inertia,
//       b. push it along the path shared by both links, then along the remainder of each link's path,
//       c. solve the batch via solveConstraints(),
//       d. push the resulting impulses back along both paths and the shared path and add the result
//          to the deferred root impulse.
//     The deferred root impulse is stored back to mRootDeferredZ once all batches are done.
//
// velocityIteration  selects minPen: 0 for velocity iterations, -PX_MAX_F32 otherwise.
// elapsedTime        forwarded unchanged to solveConstraints().
// doFriction         forwarded unchanged to solveConstraints().
//
// NOTE(review): the local PxgErrorAccumulator `error` is only handed to solveConstraints(); nothing in
// this function writes it back to scDesc->mContactErrorAccumulator - confirm that this is intended.
template <typename IterativeData>
static __device__ void artiPropagateRigidImpulsesAndSolveSelfConstraints1T(PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	const PxgSolverCoreDesc* const PX_RESTRICT solverDesc, const bool velocityIteration,
	const PxReal elapsedTime, const PxgSolverSharedDesc<IterativeData>* const PX_RESTRICT sharedDesc, bool doFriction)
{
	const PxU32 nbSlabs = scDesc->nbSlabs;
	const PxU32 nbArticulations = scDesc->nbArticulations;

	const PxU32 blockStride = blockDim.x;// / WARP_SIZE;
	const PxU32 globalThreadIndex = blockIdx.x * blockStride + threadIdx.x;

	if (globalThreadIndex >= nbArticulations)
		return;

	const PxReal minPen = velocityIteration ? 0.f : -PX_MAX_F32;

	//Identify which block we are solving...
	PxgArticulationBlockData& articulation = scDesc->mArticulationBlocks[blockIdx.x];

	const PxU32 maxLinks = scDesc->mMaxLinksPerArticulation;
	const PxU32 numLinks = articulation.mNumLinks[threadIdx.x];

	PxgArticulationBlockLinkData* artiLinks = scDesc->mArticulationLinkBlocks + maxLinks * blockIdx.x;
	PxgArticulationBlockDofData* artiDofs =
		scDesc->mArticulationDofBlocks + scDesc->mMaxDofsPerArticulation * blockIdx.x;

	const PxU32 articulationReferenceCountOffset =
		solverDesc->islandContextPool->mBodyCount + solverDesc->islandContextPool->mBodyStartIndex;
	const PxU32 numTotalBodies = articulationReferenceCountOffset + nbArticulations;

	// When there are impulses to propagate (must be impulses from rigid body contacts and joints), make sure to
	// propagate them here. This is irrelevant to whether there are self-constraints or not.
	const PxU32 dirtyState = articulation.mStateDirty[threadIdx.x];
	if (dirtyState & PxgArtiStateDirtyFlag::eHAS_IMPULSES)
	{
		// Counting the number of active slabs used in contacts and joints.
		// The split mass used in contacts and joints is tied back here.
		const PxU32 referenceCount = countActiveSlabs(articulationReferenceCountOffset + globalThreadIndex, nbSlabs, numTotalBodies,
		                                              sharedDesc->iterativeData.solverEncodedReferenceCount);

		const PxReal scale = 1.0f / static_cast<PxReal>(referenceCount);

		averageLinkImpulsesAndPropagate(scDesc->slabHasChanges, scDesc->impulses, articulation, artiLinks, artiDofs,
										globalThreadIndex, maxLinks, nbArticulations, nbSlabs, numLinks,
										threadIdx.x, scale);
	}

	// Leave velocities at the root for now. We will figure out how to do this more lazily next...
	// (a PxcFsFlushVelocity call for dirty articulations used to be sketched here and is intentionally not made)

	// Resetting articulation reference counts after all usage.
	resetSlabCount(articulationReferenceCountOffset + globalThreadIndex, nbSlabs, numTotalBodies,
	               sharedDesc->iterativeData.solverEncodedReferenceCount);

	articulation.mStateDirty[threadIdx.x] = PxgArtiStateDirtyFlag::eVEL_DIRTY;

	// Solve self-constraints.
	const PxU32 nbSelfConstraints = articulation.mTotalSelfConstraintCount;
	if (!nbSelfConstraints)
		return; // nothing to solve: skip loading the root state altogether

	const PxU32 startIndex = articulation.mSelfConstraintOffset;
	const PxU32 wordSize = (maxLinks + 63) / 64;
	PxgArticulationBitFieldData* linkBitFields = scDesc->mPathToRootBitFieldBlocks + maxLinks * wordSize * blockIdx.x;

	// Scratch bit fields (wordSize words per block) holding the link sets of the batch being solved.
	PxgArticulationBitFieldStackData* pathToRootBitFieldA = scDesc->mTempSharedBitFieldBlocks + wordSize * blockIdx.x;
	PxgArticulationBitFieldStackData* pathToRootBitFieldB = scDesc->mTempRootBitFieldBlocks + wordSize * blockIdx.x;
	PxgArticulationBitFieldStackData* commonBitField = scDesc->mTempPathToRootBitFieldBlocks + wordSize * blockIdx.x;

	Cm::UnAlignedSpatialVector rootDeferredZ = loadSpatialVector(articulation.mRootDeferredZ, threadIdx.x);
	Dy::SpatialMatrix rootInvArticulatedInertia;
	loadSpatialMatrix(articulation.mInvSpatialArticulatedInertia, threadIdx.x, rootInvArticulatedInertia);

	const IterativeData& iterativeData = sharedDesc->iterativeData;

	PxgErrorAccumulator error;
	const bool accumulateError = scDesc->mContactErrorAccumulator.mCounter >= 0;

	for (PxU32 constraintIdx = 0; constraintIdx < nbSelfConstraints; ++constraintIdx)
	{
		const PxgBlockConstraintBatch& batch = iterativeData.blockConstraintBatch[startIndex + constraintIdx];

		const PxU32 mask = batch.mask;

		// Unsigned literal: lane 31 must not shift a signed 1 into the sign bit.
		if (!(mask & (1u << threadIdx.x)))
			continue;

		// rootDeferredZ changes after every solved batch, so this has to be recomputed per batch.
		const Cm::UnAlignedSpatialVector commonDelta = rootInvArticulatedInertia * -rootDeferredZ;

		const PxU32 offset = warpScanExclusive(mask, threadIdx.x);

		const PxNodeIndex igNodeIndexA = batch.bodyANodeIndex[offset];
		const PxNodeIndex igNodeIndexB = batch.bodyBNodeIndex[offset];

		//Get velocities for these links...

		assert(igNodeIndexA.index() == igNodeIndexB.index());
		const PxU32 linkIDA = igNodeIndexA.articulationLinkId();
		const PxU32 linkIDB = igNodeIndexB.articulationLinkId();

		PxgArticulationBlockLinkData& linkA = artiLinks[linkIDA];
		PxgArticulationBlockLinkData& linkB = artiLinks[linkIDB];

		// Copy both links' path-to-root bit fields and record the links that appear in both.
		for (PxU32 w = 0; w < wordSize; ++w)
		{
			const PxU64 wordA = linkBitFields[linkIDA * wordSize + w].bitField[threadIdx.x];
			const PxU64 wordB = linkBitFields[linkIDB * wordSize + w].bitField[threadIdx.x];
			pathToRootBitFieldA[w].bitField[threadIdx.x] = wordA;
			pathToRootBitFieldB[w].bitField[threadIdx.x] = wordB;
			commonBitField[w].bitField[threadIdx.x] = wordA & wordB;
		}

		Cm::UnAlignedSpatialVector velA = loadSpatialVector(linkA.mMotionVelocity, threadIdx.x);
		Cm::UnAlignedSpatialVector velB = loadSpatialVector(linkB.mMotionVelocity, threadIdx.x);

		// Propagate along the shared part of both paths once (the root may be flagged here and is skipped).
		const Cm::UnAlignedSpatialVector deltaVCommon = selfConstraintPropagateDeltaVDown(
			commonBitField, wordSize, artiLinks, artiDofs, commonDelta, true, threadIdx.x);

		//Now prop to linkA and linkB...
		// Strip the shared links so that only the link-specific remainder of each path is left.
		for (PxU32 w = 0; w < wordSize; ++w)
		{
			pathToRootBitFieldA[w].bitField[threadIdx.x] &= (~commonBitField[w].bitField[threadIdx.x]);
			pathToRootBitFieldB[w].bitField[threadIdx.x] &= (~commonBitField[w].bitField[threadIdx.x]);
		}

		const Cm::UnAlignedSpatialVector deltaVA = selfConstraintPropagateDeltaVDown(
			pathToRootBitFieldA, wordSize, artiLinks, artiDofs, deltaVCommon, false, threadIdx.x);
		const Cm::UnAlignedSpatialVector deltaVB = selfConstraintPropagateDeltaVDown(
			pathToRootBitFieldB, wordSize, artiLinks, artiDofs, deltaVCommon, false, threadIdx.x);

		//Now we have updated velocities...
		velA += deltaVA;
		velB += deltaVB;

		//Now do the solve

		Cm::UnAlignedSpatialVector impulse0(PxVec3(0.f), PxVec3(0.f));
		Cm::UnAlignedSpatialVector impulse1(PxVec3(0.f), PxVec3(0.f));

		// Only the lanes named in the batch mask reach this point.
		__syncwarp(mask);

		solveConstraints(scDesc, iterativeData, batch, velA, velB, linkA, linkB, impulse0, impulse1,
			doFriction, elapsedTime, minPen, offset, accumulateError ? &error : NULL);

		//Prop up
		// Link-specific remainders first (B, then A), then the combined impulse along the shared path.
		impulse1 = selfConstraintPropagateImpulseUp(
			pathToRootBitFieldB, wordSize, artiLinks, artiDofs, impulse1, false, threadIdx.x);
		impulse0 = selfConstraintPropagateImpulseUp(
			pathToRootBitFieldA, wordSize, artiLinks, artiDofs, impulse0, false, threadIdx.x);

		impulse0 += impulse1;

		// need to skip root again because it is part of the common path but it has no dofs.
		impulse0 = selfConstraintPropagateImpulseUp(
			commonBitField, wordSize, artiLinks, artiDofs, impulse0, true, threadIdx.x);

		rootDeferredZ += impulse0;
	}

	storeSpatialVector(articulation.mRootDeferredZ, rootDeferredZ, threadIdx.x);
}


// Kernel entry point for the IterativeSolveData (non-TGS) variant.
// Forwards to the templated device implementation of the same name. The kernel's own elapsedTime
// argument is not forwarded; the implementation always receives 0.
// Launch note carried over from the original source: two warps per block.
// NOTE(review): the implementation indexes per-lane block data directly with threadIdx.x - confirm
// the launch configuration against that.
extern "C" __global__
void artiPropagateRigidImpulsesAndSolveSelfConstraints1T(PxgArticulationCoreDesc* PX_RESTRICT scDesc, const PxgSolverCoreDesc* const PX_RESTRICT solverDesc,
	const bool velocityIteration, const PxReal elapsedTime,	const PxgSolverSharedDesc<IterativeSolveData>* const PX_RESTRICT sharedDesc, bool doFriction)
{
	const PxReal forwardedElapsedTime = 0.0f;

	artiPropagateRigidImpulsesAndSolveSelfConstraints1T<IterativeSolveData>(
		scDesc, solverDesc, velocityIteration, forwardedElapsedTime, sharedDesc, doFriction);
}


// Kernel entry point for the IterativeSolveDataTGS variant.
// Forwards every argument, including elapsedTime, unchanged to the templated device implementation.
extern "C" __global__
void artiPropagateRigidImpulsesAndSolveSelfConstraintsTGS1T(PxgArticulationCoreDesc* PX_RESTRICT scDesc, const PxgSolverCoreDesc* const PX_RESTRICT solverDesc,
	const bool velocityIteration, const PxReal elapsedTime, const PxgSolverSharedDesc<IterativeSolveDataTGS>* const PX_RESTRICT sharedDesc, bool doFriction)
{
	typedef IterativeSolveDataTGS SolveData;

	artiPropagateRigidImpulsesAndSolveSelfConstraints1T<SolveData>(
		scDesc, solverDesc, velocityIteration, elapsedTime, sharedDesc, doFriction);
}