// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#include "contactConstraintBlockPrep.cuh"
#include "PxgArticulation.h"
#include "PxgSolverBody.h"
#include "PxgConstraint.h"
#include "PxgFrictionPatch.h"
#include "PxgConstraintPrep.h"
#include "PxgSolverConstraintDesc.h"
#include "PxgArticulationCoreDesc.h"

#include "PxgSolverCoreDesc.h"
#include "PxgSolverFlags.h"
#include "PxgSolverConstraintBlock1D.h"

#include "DyCpuGpu1dConstraint.h"
#include "DyCpuGpuArticulation.h"
#include "DyConstraintPrep.h"

#include "jointConstraintBlockPrep.cuh"

#include <assert.h>


using namespace physx;

// Intentionally empty host-side function exported with C linkage.
// NOTE(review): presumably referenced from host code so that this translation unit (and the
// device code in it) is kept by the linker - confirm against the caller.
extern "C" __host__ void initSolverKernels13() {}

// Fills 'b' with the solver-side description of one body taking part in an extended
// (articulation-aware) contact.
//
// nodeIndex        island node of the body; selects the articulation-link or the rigid-body path
// solverBodyIndex  index into 'articulations' (articulation path) or into 'solverBodyData',
//                  'txIDatas' and 'bodyFrames' (rigid-body path)
// articulations    articulation array, read only on the articulation path
// solverBodyData   per-body solver data, read only on the rigid-body path
// txIDatas         per-body sqrtInvInertia, read only for dynamic (non-static, non-kinematic) rigid bodies
// b                output body description
// bodyFrames       per-body body-to-world transforms, read only on the rigid-body path
static __device__ void createPxgSolverExtBody(const PxNodeIndex& nodeIndex, const PxU32 solverBodyIndex, const PxgArticulation* const PX_RESTRICT articulations,
	const physx::PxgSolverBodyData* const PX_RESTRICT solverBodyData,
	const PxgSolverTxIData* const PX_RESTRICT txIDatas, physx::PxgSolverExtBody2& b,
	const PxAlignedTransform* const PX_RESTRICT bodyFrames)
{
	// These two fields are filled the same way for links and for rigid bodies.
	b.bodyIndex = solverBodyIndex;
	b.islandNodeIndex = nodeIndex;

	if (nodeIndex.isArticulation())
	{
		// Articulation link: everything is read from the per-link arrays of the articulation.
		const PxU32 link = nodeIndex.articulationLinkId();
		const PxgArticulation& articulation = articulations[solverBodyIndex];

		b.linkIndex = link;
		b.isKinematic = false;
		b.body2World = articulation.linkBody2Worlds[link];
		b.mSpatialResponse = articulation.spatialResponseMatrixW[link];
		b.velocity = articulation.motionVelocities[link]; //KS - TODO - verify this is set up
		b.penBiasClamp = articulation.links[link].initialAngVelXYZ_penBiasClamp.w;
		b.offsetSlop = articulation.links[link].offsetSlop;
		b.cfm = articulation.cfms[link];
		b.maxImpulse = PX_MAX_F32; //KS - TODO - hook up!
		return;
	}

	// Rigid body (static, kinematic or dynamic).
	const PxgSolverBodyData& bodyData = solverBodyData[solverBodyIndex];
	const bool staticBody = nodeIndex.isStaticBody();
	const bool kinematicBody = bodyData.flags & PxRigidBodyFlag::eKINEMATIC;
	const PxAlignedTransform frame = bodyFrames[solverBodyIndex];

	// Values used for a static body: zero velocity, zero inverse mass, zero offset slop and
	// the lowest possible penetration-bias clamp.
	Cm::UnAlignedSpatialVector bodyVelocity(PxVec3(0.f), PxVec3(0.f));
	PxReal biasClamp = -PX_MAX_F32;
	PxReal inverseMass = 0.f;
	PxReal slop = 0.f;

	if (!staticBody)
	{
		// The .w lanes carry the penetration-bias clamp and the inverse mass respectively.
		const float4 angVel_biasClamp = bodyData.initialAngVelXYZ_penBiasClamp;
		const float4 linVel_invMass = bodyData.initialLinVelXYZ_invMassW;

		bodyVelocity.top = PxVec3(angVel_biasClamp.x, angVel_biasClamp.y, angVel_biasClamp.z);
		bodyVelocity.bottom = PxVec3(linVel_invMass.x, linVel_invMass.y, linVel_invMass.z);
		biasClamp = angVel_biasClamp.w;
		inverseMass = linVel_invMass.w;
		slop = bodyData.offsetSlop;
	}

	if (!staticBody && !kinematicBody)
	{
		// Dynamic body: build the response from sqrtInvInertia and the inverse mass.
		b.mSpatialResponse.initialize(txIDatas[solverBodyIndex].sqrtInvInertia, inverseMass);
	}
	else
	{
		// Static and kinematic bodies get an all-zero response matrix.
		memset(b.mSpatialResponse.column, 0, sizeof(b.mSpatialResponse));
	}

	b.linkIndex = PxSolverConstraintDesc::RIGID_BODY;
	b.isKinematic = kinematicBody;
	b.body2World.p = PxVec3(frame.p.x, frame.p.y, frame.p.z);
	b.body2World.q = frame.q;
	b.velocity = bodyVelocity;
	b.penBiasClamp = biasClamp;
	b.offsetSlop = slop;
	b.cfm = 0.f;
	b.maxImpulse = bodyData.maxImpulse;
}

// Converts an impulse direction (angular part in 'top', linear part in 'bottom') into the
// vector used to project onto body 'b'.
// Articulation links and kinematic bodies get the impulse back unchanged; for every other
// body the angular part is passed through b.mSpatialResponse.multiplyInertia().
static __device__ Cm::UnAlignedSpatialVector createImpulseResponseVector(const physx::PxgSolverExtBody2& b, const Cm::UnAlignedSpatialVector& impulse)
{
	const bool keepImpulseUnscaled = b.islandNodeIndex.isArticulation() || b.isKinematic;

	//We do not alter the impulse vector in this case - no need because the velocity of the actor is in world space (unscaled)!
	if (keepImpulseUnscaled)
		return impulse;

	//For rigid bodies, the angular velocity is scaled by sqrtInertia. This allows us to use the same vector to
	//project from sqrtInertia space to velocity space and from momentum (inertia) space to sqrtInertia space.
	const PxVec3 scaledAngular = b.mSpatialResponse.multiplyInertia(impulse.top);
	return Cm::UnAlignedSpatialVector(scaledAngular, impulse.bottom);
}

// Returns the dot product of the full spatial velocity of 'b' with 'responseVector'.
static __device__ PxReal projectVelocity(const PxgSolverExtBody2& b, const Cm::UnAlignedSpatialVector& responseVector)
{
	const Cm::UnAlignedSpatialVector& bodyVelocity = b.velocity;
	const PxReal projected = bodyVelocity.dot(responseVector);
	return projected;
}

// Returns the dot product of the 'top' (angular) part of the velocity of 'b' with 'responseVector'.
static __device__ PxReal projectAngular(const PxgSolverExtBody2& b, const PxVec3& responseVector)
{
	const PxVec3& angularVelocity = b.velocity.top;
	const PxReal projected = angularVelocity.dot(responseVector);
	return projected;
}

// Applies the spatial response matrix of 'b' to 'vector' and returns the product.
static __device__ Cm::UnAlignedSpatialVector getImpulseResponse(const PxgSolverExtBody2& b, const Cm::UnAlignedSpatialVector& vector)
{
	const Cm::UnAlignedSpatialVector deltaV = b.mSpatialResponse * vector;
	return deltaV;
}


// Sets to zero every component of 'v' whose absolute value is smaller than 'slop'.
static __device__ void zeroExtComponentsBelowSlop(PxVec3& v, const PxReal slop)
{
	v.x = PxAbs(v.x) < slop ? 0.f : v.x;
	v.y = PxAbs(v.y) < slop ? 0.f : v.y;
	v.z = PxAbs(v.z) < slop ? 0.f : v.z;
}

// Stores the velocity responses of body 0 (deltaV0) and body 1 (deltaV1) for one constraint row
// into the 'threadIndex' slot of 'response'. The 'bottom' part goes to the *Lin fields and the
// 'top' part to the *Ang fields.
static __device__ void writeExtArticulationBlockResponse(PxgArticulationBlockResponse& response, const PxU32 threadIndex,
	const Cm::UnAlignedSpatialVector& deltaV0, const Cm::UnAlignedSpatialVector& deltaV1)
{
	response.deltaRALin_x[threadIndex] = deltaV0.bottom.x;
	response.deltaRALin_y[threadIndex] = deltaV0.bottom.y;
	response.deltaRALin_z[threadIndex] = deltaV0.bottom.z;
	response.deltaRAAng_x[threadIndex] = deltaV0.top.x;
	response.deltaRAAng_y[threadIndex] = deltaV0.top.y;
	response.deltaRAAng_z[threadIndex] = deltaV0.top.z;

	response.deltaRBLin_x[threadIndex] = deltaV1.bottom.x;
	response.deltaRBLin_y[threadIndex] = deltaV1.bottom.y;
	response.deltaRBLin_z[threadIndex] = deltaV1.bottom.z;
	response.deltaRBAng_x[threadIndex] = deltaV1.top.x;
	response.deltaRBAng_y[threadIndex] = deltaV1.top.y;
	response.deltaRBAng_z[threadIndex] = deltaV1.top.z;
}

// Prepares one friction row (one anchor, one tangent direction) of an extended contact.
//
// friction          output friction row; only the 'threadIndex' slot is written
// artiResponse      output velocity responses for this row; only the 'threadIndex' slot is written
// b0, b1            the two bodies
// axis              tangent direction of this row
// ra, rb            anchor offsets from the body origins, already rotated by the body orientations
// error             positional offset between the two anchors, (ra + p0) - (rb + p1)
// contactTargetVel  target velocity of the contact associated with this anchor
// invMassScale      (lin0, ang0, lin1, ang1) scale factors applied to the impulse directions
// invDt             factor that turns the positional error along 'axis' into the stored bias
// solverOffsetSlop  components of ra x axis / rb x axis below this magnitude are zeroed
static __device__ void setupExtFrictionAxisBlock(PxgBlockSolverContactFriction& friction, PxgArticulationBlockResponse& artiResponse,
	const PxgSolverExtBody2& b0, const PxgSolverExtBody2& b1,
	const PxVec3& axis, const PxVec3& ra, const PxVec3& rb, const PxVec3& error, const PxVec3& contactTargetVel,
	const float4& invMassScale, const PxReal invDt, const PxReal solverOffsetSlop, const PxU32 threadIndex)
{
	PxVec3 raXn = ra.cross(axis);
	PxVec3 rbXn = rb.cross(axis);

	zeroExtComponentsBelowSlop(raXn, solverOffsetSlop);
	zeroExtComponentsBelowSlop(rbXn, solverOffsetSlop);

	const Cm::UnAlignedSpatialVector response0 = createImpulseResponseVector(b0, Cm::UnAlignedSpatialVector(raXn, axis));
	const Cm::UnAlignedSpatialVector response1 = createImpulseResponseVector(b1, Cm::UnAlignedSpatialVector(-rbXn, -axis));

	const Cm::UnAlignedSpatialVector deltaV0 = getImpulseResponse(b0, Cm::UnAlignedSpatialVector(axis, raXn).scale(invMassScale.x, invMassScale.y));
	const Cm::UnAlignedSpatialVector deltaV1 = getImpulseResponse(b1, Cm::UnAlignedSpatialVector(-axis, -rbXn).scale(invMassScale.z, invMassScale.w));

	const float resp0 = deltaV0.dot(response0);
	const float resp1 = deltaV1.dot(response1);

	float targetVel = contactTargetVel.dot(axis);

	// Only non-articulation bodies have their current velocity folded into the target velocity here.
	if (!b0.islandNodeIndex.isArticulation())
		targetVel -= projectVelocity(b0, Cm::UnAlignedSpatialVector(raXn, axis));
	if (!b1.islandNodeIndex.isArticulation())
		targetVel += projectVelocity(b1, Cm::UnAlignedSpatialVector(rbXn, axis));

	// Storing resp0 and resp1, separately.
	const float bias = axis.dot(error) * invDt;
	friction.raXn_bias[threadIndex] = make_float4(response0.top.x, response0.top.y, response0.top.z, bias);
	friction.rbXn_targetVelW[threadIndex] = make_float4(-response1.top.x, -response1.top.y, -response1.top.z, targetVel);
	friction.appliedForce[threadIndex] = 0.f;
	friction.resp0[threadIndex] = resp0;
	friction.resp1[threadIndex] = resp1;

	writeExtArticulationBlockResponse(artiResponse, threadIndex, deltaV0, deltaV1);
}

//for PGS solver(invDtF32 and invTotalDt is the same)
//
// Prepares the block-format contact and friction constraints of one extended contact patch
// (a patch where the bodies are described by PxgSolverExtBody2). Every per-constraint array
// that is read or written is indexed by 'threadIndex'.
//
// contactData               per-patch input: inverse-mass scales, normal + restitution, damping
// contacts                  contact points of the patch (contactCount entries)
// contactCount              number of contacts; when 0 only the header counts/offset are written
// frictionPatch, fAnchor    friction anchor count, per-anchor contact ids and anchor positions
// b0, b1                    the two bodies
// invDtF32                  inverse time step used for the penetration and friction bias
// dt                        time step, used only by the compliant-contact path (restitution < 0)
// bounceThresholdF32        relative normal velocity below which restitution is applied
// forceWritebackBufferOffset value stored in the header's forceWritebackOffset
// perPointFriction          when true, each anchor uses the target velocity of its own contact
//                           (frictionPatch.contactID); otherwise contact 0 is used
// params                    output headers, contact rows and friction rows
// articulationResponses     output velocity responses: one entry per contact row, followed by
//                           two entries per friction anchor
// data                      material data: solver flags, rest distance, static/dynamic friction
// ccdMaxSeparationThreshold only relevant when CCD is enabled, which is currently hard-wired off
// solverOffsetSlop          threshold below which components of r x n are zeroed
// invTotalDt, biasCoefficient, torsionalFrictionData, totalDt are not used by this overload.
static __device__ void setupFinalizeExtSolverConstraintsBlock(PxgBlockContactData& contactData,
	const PxgBlockContactPoint* PX_RESTRICT contacts,
	const PxU32 contactCount,
	const PxgBlockFrictionPatch& frictionPatch,
	const PxgBlockFrictionAnchorPatch& fAnchor,
	const PxgSolverExtBody2& b0,
	const PxgSolverExtBody2& b1,
	const PxReal invDtF32,
	const PxReal dt,
	const PxReal invTotalDt,
	const PxReal bounceThresholdF32,
	const PxReal biasCoefficient,
	const PxU32 threadIndex,
	PxU32 forceWritebackBufferOffset,
	const bool perPointFriction,
	PxgContactBlockParams& params,
	PxgArticulationBlockResponse* PX_RESTRICT articulationResponses,
	const PxgMaterialContactData& data,
	const PxReal ccdMaxSeparationThreshold,
	const PxReal solverOffsetSlop,
	const float2 torsionalFrictionData,
	const PxReal totalDt)
{
	using namespace physx;
	// NOTE II: the friction patches are sparse (some of them have no contact patches, and
	// therefore did not get written back to the cache) but the patch addresses are dense,
	// corresponding to valid patches

	// Nothing to prepare: publish empty counts and an invalid force write-back offset before
	// loading any of the per-patch data.
	if (contactCount == 0)
	{
		params.blockFrictionHeader->numFrictionConstr[threadIndex] = 0;
		params.blockContactHeader->numNormalConstr[threadIndex] = 0;
		params.blockContactHeader->forceWritebackOffset[threadIndex] = 0xffffffff;
		return;
	}

	PxU8 flags = data.mSolverFlags; // encoding according to PxgSolverContactFlags

	const float restDistance = data.restDistance;

	const PxVec3& bodyFrame0p = b0.body2World.p;
	const PxVec3& bodyFrame1p = b1.body2World.p;

	//TODO - fix up!!!!
	const bool isCCD = false;// (data0.islandNodeIndex & 1) || data1.islandNodeIndex & 1;
	const PxReal ccdMaxSeparation = isCCD ? ccdMaxSeparationThreshold : PX_MAX_F32;

	const float maxPenBias = fmaxf(b0.penBiasClamp, b1.penBiasClamp);

	const float invDt = invDtF32;
	const float bounceThreshold = bounceThresholdF32;
	const float invDtp8 = invDt * 0.8f;

	const float4 lin0X_ang0Y_lin1Z_ang1W = contactData.mInvMassScale[threadIndex].lin0X_ang0Y_lin1Z_ang1W;

	const PxVec3& linVel0 = b0.velocity.bottom;
	const PxVec3& linVel1 = b1.velocity.bottom;
	const PxVec3& angVel0 = b0.velocity.top;
	const PxVec3& angVel1 = b1.velocity.top;

	const PxReal cfm = PxMax(b0.cfm, b1.cfm);
	const PxReal maxImpulse = PxMin(b0.maxImpulse, b1.maxImpulse);

	// Friction rows come in pairs (two tangent directions per anchor).
	const PxU32 aCount = frictionPatch.anchorCount[threadIndex];
	const PxU32 anchorCount = aCount * 2;

	params.blockContactHeader->forceWritebackOffset[threadIndex] = forceWritebackBufferOffset;

	params.blockContactHeader->flags[threadIndex] = flags;

	//KS - despite the variable name, this only stores the invMassScale terms for extended contacts
	params.blockContactHeader->invMass0_1_angDom0_1[threadIndex] = make_float4(lin0X_ang0Y_lin1Z_ang1W.x, lin0X_ang0Y_lin1Z_ang1W.z, lin0X_ang0Y_lin1Z_ang1W.y, lin0X_ang0Y_lin1Z_ang1W.w);

	const float4 normal4_rW = contactData.normal_restitutionW[threadIndex];
	const PxVec3 normal(normal4_rW.x, normal4_rW.y, normal4_rW.z);
	const PxReal restitution = normal4_rW.w;
	const PxReal damping = contactData.damping[threadIndex];
	const PxReal norVel0 = normal.dot(linVel0);
	const PxReal norVel1 = normal.dot(linVel1);
	const PxReal linNorVel = norVel0 - norVel1;

	params.blockContactHeader->restitution[threadIndex] = restitution;
	params.blockContactHeader->cfm[threadIndex] = cfm;

	// Advanced by one entry per constraint row written (contacts first, then friction rows).
	PxgArticulationBlockResponse* PX_RESTRICT artiResponse = articulationResponses;

	PxReal normalForce = 0.f;

	for (PxU32 j = 0; j < contactCount; j++)
	{
		const PxgBlockContactPoint& contact = contacts[j];

		PxgBlockSolverContactPoint* solverContact = &params.blockContactPoints[j];

		const float4 point4_separationW = contact.point_separationW[threadIndex];
		const PxVec3 point(point4_separationW.x, point4_separationW.y, point4_separationW.z);
		const float separation = point4_separationW.w;

		const float4 targetVel4_maxImpulseW = contact.targetVel_maxImpulseW[threadIndex];
		const PxVec3 targetVel(targetVel4_maxImpulseW.x, targetVel4_maxImpulseW.y, targetVel4_maxImpulseW.z);

		const float cTargetVel = normal.dot(targetVel);

		const PxVec3 ra = point - bodyFrame0p;
		const PxVec3 rb = point - bodyFrame1p;

		PxVec3 raXn = ra.cross(normal);
		PxVec3 rbXn = rb.cross(normal);

		// The angular normal velocity is evaluated before the slop clamping below.
		const float angNorVel = raXn.dot(angVel0) - rbXn.dot(angVel1);

		const PxReal slop = solverOffsetSlop * PxMax(linNorVel == 0.f ? 1.f : angNorVel / linNorVel, 1.f);

		zeroExtComponentsBelowSlop(raXn, slop);
		zeroExtComponentsBelowSlop(rbXn, slop);

		const Cm::UnAlignedSpatialVector response0 = createImpulseResponseVector(b0, Cm::UnAlignedSpatialVector(raXn, normal));
		const Cm::UnAlignedSpatialVector response1 = createImpulseResponseVector(b1, Cm::UnAlignedSpatialVector(-rbXn, -normal));

		const Cm::UnAlignedSpatialVector deltaV0 = getImpulseResponse(b0, Cm::UnAlignedSpatialVector(normal, raXn).scale(lin0X_ang0Y_lin1Z_ang1W.x, lin0X_ang0Y_lin1Z_ang1W.y));
		const Cm::UnAlignedSpatialVector deltaV1 = getImpulseResponse(b1, Cm::UnAlignedSpatialVector(-normal, -rbXn).scale(lin0X_ang0Y_lin1Z_ang1W.z, lin0X_ang0Y_lin1Z_ang1W.w));

		const float resp0 = deltaV0.dot(response0);
		const float resp1 = deltaV1.dot(response1);

		const float unitResponse = resp0 + resp1;

		const float vrel1 = norVel0 + projectAngular(b0, raXn);
		const float vrel2 = norVel1 + projectAngular(b1, rbXn);
		const float vrel = vrel1 - vrel2;

		const float penetration = separation - restDistance;
		const bool isSeparated = (penetration >= 0.0f);
		const float penetrationInvDt = penetration * invDt;
		const float penetrationInvDtPt8 = fmaxf(maxPenBias, penetration*invDtp8);

		const bool collidingWithVrel = ((-vrel) > penetrationInvDt);
		const bool isGreater2 = (restitution > 0.f) && (bounceThreshold > vrel) && collidingWithVrel;

		const bool ccdSeparationCondition = ccdMaxSeparation >= penetration;

		float targetVelocity = cTargetVel + (isGreater2 ? (-vrel)*restitution : 0.f);

		// Target velocity before the body velocities are folded in; used for the normal force estimate below.
		const PxReal tvel = targetVelocity;

		if (!b0.islandNodeIndex.isArticulation())
			targetVelocity -= vrel1;
		if (!b1.islandNodeIndex.isArticulation())
			targetVelocity += vrel2;

		const float recipResponse = (unitResponse > 0.0f) ? (1.f / (unitResponse + cfm)) : 0.f;
		PxReal biasedErr, unbiasedErr, impulseMultiplier;
		PxReal velMultiplier = recipResponse;

		// To compute velMultiplier, impulseMultiplier, unbiasedErr, and biasedErr every sub-timestep or iteration,
		// additional data is stored: coeff0, coeff1.
		// Using coeff0 and coeff1 produces the same results as the previous implementation when mass-splitting does
		// not occur. See also "computeContactCoefficients".

		PxReal coeff0, coeff1;
		if (restitution < 0.f) // compliant contact case
		{
			coeff0 = computeCompliantContactCoefficients(
			    dt, flags, restitution, damping, unitResponse, recipResponse, penetration, targetVelocity,
			    isSeparated, collidingWithVrel, velMultiplier, impulseMultiplier, unbiasedErr, biasedErr);

			const PxReal nrdt = dt * restitution;
			coeff1 = -nrdt * penetration;
		}
		else
		{
			float scaledBias = isSeparated ? penetrationInvDt : penetrationInvDtPt8;
			if (ccdSeparationCondition && isGreater2)
				scaledBias = 0.f;

			unbiasedErr = targetVelocity;
			biasedErr = unbiasedErr - scaledBias;

			if (!isGreater2)
				unbiasedErr -= PxMax(0.f, scaledBias);

			coeff0 = biasedErr; // When recovering full biasedErr, multiply coeff0 by velMultiplier. final biasedErr = coeff0 * velMultiplier
			coeff1 = unbiasedErr; // When recovering full unbiasedErr, multiply coeff1 by velMultiplier. final unbiasedErr = coeff1 * velMultiplier
		}

		solverContact->raXn_targetVelocity[threadIndex] = make_float4(response0.top.x, response0.top.y, response0.top.z, targetVelocity);
		solverContact->rbXn_maxImpulse[threadIndex] = make_float4(-response1.top.x, -response1.top.y, -response1.top.z, PxMin(maxImpulse, targetVel4_maxImpulseW.w));

		solverContact->resp0[threadIndex] = resp0;
		solverContact->resp1[threadIndex] = resp1;

		solverContact->coeff0[threadIndex] = coeff0;
		solverContact->coeff1[threadIndex] = coeff1;

		solverContact->appliedForce[threadIndex] = 0.f;

		// Note: 'deltaF' is used here only for an approximate query related to minNormalForce.
		// Mass-splitting is not applied in this context.
		// To apply mass-splitting when computing 'minNormalForce', perform the calculations in solveExtContactsBlock.
		const PxReal deltaF = fmaxf((tvel - (vrel + fmaxf(penetrationInvDt, 0.f))) * velMultiplier, 0.f);
		normalForce += deltaF;

		writeExtArticulationBlockResponse(*artiResponse, threadIndex, deltaV0, deltaV1);
		artiResponse++;
	}

	// A quarter of the average estimated normal force; contactCount is non-zero here.
	params.blockContactHeader->minNormalForce[threadIndex] = (normalForce / contactCount)*0.25f;

	// With exactly two anchors the friction coefficients are halved.
	const PxReal frictionScale = (aCount == 2) ? 0.5f : 1.f;

	const PxReal staticFriction = data.staticFriction*frictionScale;
	const PxReal dynamicFriction = data.dynamicFriction*frictionScale;

	const bool haveFriction = (anchorCount != 0);
	params.blockContactHeader->numNormalConstr[threadIndex] = contactCount;
	params.blockFrictionHeader->numFrictionConstr[threadIndex] = anchorCount;

	params.blockContactHeader->normal_staticFriction[threadIndex] = make_float4(normal.x, normal.y, normal.z, staticFriction);
	params.blockFrictionHeader->dynamicFriction[threadIndex] = dynamicFriction;

	if (haveFriction)
	{
		const PxVec3 relLinVel = linVel0 - linVel1;

		const float orthoThreshold = 0.70710678f;
		const float p1 = 0.0001f;

		// fallback: normal.cross((1,0,0)) or normal.cross((0,0,1))
		const PxVec3 t0Fallback1(0.f, -normal.z, normal.y);
		const PxVec3 t0Fallback2(-normal.y, normal.x, 0.f);
		const PxVec3 t0Fallback = orthoThreshold > PxAbs(normal.x) ? t0Fallback1 : t0Fallback2;

		// First tangent: relative linear velocity with its normal component removed, unless
		// that is too short, in which case the fallback is used.
		PxVec3 t0 = relLinVel - normal *(normal.dot(relLinVel));
		t0 = t0.magnitudeSquared() > p1 ? t0 : t0Fallback;
		t0.normalize();

		const PxVec3 t1 = normal.cross(t0);

		params.blockFrictionHeader->broken[threadIndex] = 0;

		params.blockFrictionHeader->frictionNormals[0][threadIndex] = make_float4(t0.x, t0.y, t0.z, 0.f);
		params.blockFrictionHeader->frictionNormals[1][threadIndex] = make_float4(t1.x, t1.y, t1.z, 0.f);

		for (PxU32 j = 0; j < aCount; j++)
		{
			const float4 body0Anchor4 = fAnchor.body0Anchors[j][threadIndex];
			const float4 body1Anchor4 = fAnchor.body1Anchors[j][threadIndex];

			const PxVec3 body0Anchor(body0Anchor4.x, body0Anchor4.y, body0Anchor4.z);
			const PxVec3 body1Anchor(body1Anchor4.x, body1Anchor4.y, body1Anchor4.z);

			const PxVec3 ra = b0.body2World.q.rotate(body0Anchor);
			const PxVec3 rb = b1.body2World.q.rotate(body1Anchor);

			const PxVec3 error = (ra + bodyFrame0p) - (rb + bodyFrame1p);

			const PxU32 index = perPointFriction ? frictionPatch.contactID[j][threadIndex] : 0;

			const float4 targetVel4 = contacts[index].targetVel_maxImpulseW[threadIndex];
			const PxVec3 tvel(targetVel4.x, targetVel4.y, targetVel4.z);

			// Row 2*j uses t0, row 2*j+1 uses t1; each consumes one articulation response entry.
			setupExtFrictionAxisBlock(params.blockFrictions[2 * j], *artiResponse, b0, b1, t0, ra, rb, error, tvel,
				lin0X_ang0Y_lin1Z_ang1W, invDt, solverOffsetSlop, threadIndex);
			artiResponse++;

			setupExtFrictionAxisBlock(params.blockFrictions[2 * j + 1], *artiResponse, b0, b1, t1, ra, rb, error, tvel,
				lin0X_ang0Y_lin1Z_ang1W, invDt, solverOffsetSlop, threadIndex);
			artiResponse++;
		}
	}
}


//for TGS solver(invDt is the step invDt and invTotalDt is the total invDt)
//
// Fills the TGS block-format contact header, friction header, normal constraints, friction
// constraints and articulation impulse responses for one lane (threadIndex) of a constraint batch
// whose bodies are described by PxgSolverExtBody2.
//
// Output layout (all indexed by threadIndex inside each block structure):
//  * params.blockContactPoints[j]      - one normal constraint per contact, j in [0, contactCount)
//  * params.blockFrictions[2*a], [2*a+1] - the two tangential friction rows of anchor a
//  * params.blockFrictions[2]          - torsional friction row (only when exactly one anchor exists and
//                                        torsionalFrictionData.x or .y is non-zero)
//  * articulationResponses             - one entry per constraint row written above, in the same order
//                                        (normal rows first, then friction rows, then the torsional row)
//
// \param contactData                per-batch contact data (normal/restitution, damping, inverse mass scales)
// \param contacts                   contact points of this patch; contacts[j] is read for j < contactCount
// \param contactCount               number of contacts; when 0 only the counts/offset/mass-scale header fields are reset
// \param frictionPatch              friction patch providing anchorCount and (for per-point friction) contactID
// \param fAnchor                    body-space friction anchors for both bodies
// \param b0, b1                     the two bodies of the pair
// \param invDt                      inverse of the step dt
// \param stepDt                     step dt
// \param invTotalDt                 inverse of the total dt
// \param bounceThresholdF32         relative normal velocity below which restitution may be applied
// \param biasCoefficient            bias coefficient; clamped to at most 0.8 before use
// \param threadIndex                lane index used to address every block-format array
// \param forceWritebackBufferOffset value stored in the contact header's forceWritebackOffset
// \param perPointFriction           if true, the friction target velocity is taken from the contact referenced by
//                                   frictionPatch.contactID, otherwise from contacts[0]
// \param params                     destination pointers for headers, contact points and frictions
// \param articulationResponses      destination for the per-row impulse responses
// \param data                       material/solver data (flags, rest distance, friction coefficients)
// \param ccdMaxSeparationThreshold  not used by this function
// \param solverOffsetSlop           threshold below which angular (r x n) components are zeroed
// \param torsionalFrictionData      .x and .y enable and scale torsional friction (see the torsional section)
// \param totalDt                    total dt
static __device__ void setupFinalizeExtSolverConstraintsBlock(PxgBlockContactData& contactData,
	const PxgBlockContactPoint* PX_RESTRICT contacts,
	const PxU32 contactCount,
	const PxgBlockFrictionPatch& frictionPatch,
	const PxgBlockFrictionAnchorPatch& fAnchor,
	const PxgSolverExtBody2& b0,
	const PxgSolverExtBody2& b1,
	const PxReal invDt,
	const PxReal stepDt,
	const PxReal invTotalDt,
	const PxReal bounceThresholdF32,
	const PxReal biasCoefficient,
	const PxU32 threadIndex,
	PxU32 forceWritebackBufferOffset,
	const bool perPointFriction,
	PxgTGSContactBlockParams& params,
	PxgArticulationBlockResponse* PX_RESTRICT articulationResponses,
	const PxgMaterialContactData& data,
	const PxReal ccdMaxSeparationThreshold,
	const PxReal solverOffsetSlop,
	const float2 torsionalFrictionData,
	const PxReal totalDt)
{
	using namespace physx;
	// NOTE II: the friction patches are sparse (some of them have no contact patches, and
	// therefore did not get written back to the cache) but the patch addresses are dense,
	// corresponding to valid patches

	PxU8 flags = data.mSolverFlags;

	const float restDistance = data.restDistance;

	const PxVec3& bodyFrame0p = b0.body2World.p;
	const PxVec3& bodyFrame1p = b1.body2World.p;

	const PxReal cfm = PxMax(b0.cfm, b1.cfm);

	const float maxPenBias = fmaxf(b0.penBiasClamp, b1.penBiasClamp);

	const float p8 = PxMin(0.8f, biasCoefficient);
	const float bounceThreshold = bounceThresholdF32;

	const float invDtp8 = invDt * p8;

	const float4 lin0X_ang0Y_lin1Z_ang1W = contactData.mInvMassScale[threadIndex].lin0X_ang0Y_lin1Z_ang1W;

	const PxVec3& linVel0 = b0.velocity.bottom;
	const PxVec3& linVel1 = b1.velocity.bottom;
	const PxVec3& angVel0 = b0.velocity.top;
	const PxVec3& angVel1 = b1.velocity.top;

	const PxU32 aCount = frictionPatch.anchorCount[threadIndex];

	// Two tangential friction rows per anchor.
	PxU32 anchorCount = aCount * 2;

	//If we have just 1 anchor and torsional settings are enabled, we have torsional friction.
	const bool hasTorsional = (torsionalFrictionData.x != 0.f || torsionalFrictionData.y != 0.f) && aCount == 1;

	if (hasTorsional)
		anchorCount++;

	if (contactCount == 0)
	{
		// Nothing to prepare: publish empty counts so that no rows are read for this lane.
		params.blockFrictionHeader->numFrictionConstr[threadIndex] = 0;
		params.blockContactHeader->numNormalConstr[threadIndex] = 0;
		params.blockContactHeader->forceWritebackOffset[threadIndex] = 0xffffffff;
		params.blockContactHeader->invMass0_1_angDom0_1[threadIndex] = make_float4(0.0f);
		return;
	}

	PxReal maxImpulse = PxMin(b0.maxImpulse, b1.maxImpulse);

	params.blockContactHeader->forceWritebackOffset[threadIndex] = forceWritebackBufferOffset;

	params.blockContactHeader->flags[threadIndex] = flags;

	params.blockContactHeader->maxPenBias[threadIndex] = maxPenBias;

	//KS - despite the variable name, this only stores the invMassScale terms for extended contacts
	params.blockContactHeader->invMass0_1_angDom0_1[threadIndex] = make_float4(lin0X_ang0Y_lin1Z_ang1W.x, lin0X_ang0Y_lin1Z_ang1W.z, lin0X_ang0Y_lin1Z_ang1W.y, lin0X_ang0Y_lin1Z_ang1W.w);

	const float4 normal4_rW = contactData.normal_restitutionW[threadIndex];
	const PxVec3 normal(normal4_rW.x, normal4_rW.y, normal4_rW.z);
	const PxReal restitution = normal4_rW.w;
	const PxReal norVel0 = normal.dot(linVel0);
	const PxReal norVel1 = normal.dot(linVel1);
	const PxReal linNorVel = norVel0 - norVel1;
	const PxReal damping = contactData.damping[threadIndex];

	params.blockFrictionHeader->biasCoefficient[threadIndex] = invDt;

	params.blockContactHeader->restitutionXdt[threadIndex] = restitution * stepDt;
	params.blockContactHeader->cfm[threadIndex] = cfm;
	params.blockContactHeader->p8[threadIndex] = p8;

	// Cursor over the output responses; advanced once per constraint row written.
	PxgArticulationBlockResponse* PX_RESTRICT artiResponse = articulationResponses;

	// Sum of the estimated per-contact normal impulses, used for minNormalForce below.
	PxReal normalForce = 0.f;

	// Most negative penetration over all contacts (never above 0), used by torsional friction.
	PxReal minPen = 0.f;

	// ---------------------------------------------------------------------------------------
	// Normal constraints
	// ---------------------------------------------------------------------------------------
	for (PxU32 j = 0; j < contactCount; j++)
	{
		const PxgBlockContactPoint& contact = contacts[j];

		PxgTGSBlockSolverContactPoint* solverContact = &params.blockContactPoints[j];

		const float4 point4_separationW = contact.point_separationW[threadIndex];
		const PxVec3 point(point4_separationW.x, point4_separationW.y, point4_separationW.z);
		const float separation = point4_separationW.w;

		const float4 targetVel4_maxImpulseW = contact.targetVel_maxImpulseW[threadIndex];
		const PxVec3 targetVel(targetVel4_maxImpulseW.x, targetVel4_maxImpulseW.y, targetVel4_maxImpulseW.z);

		const float cTargetVel = normal.dot(targetVel);

		const PxVec3 ra = point - bodyFrame0p;
		const PxVec3 rb = point - bodyFrame1p;

		PxVec3 raXn = ra.cross(normal);
		PxVec3 rbXn = rb.cross(normal);

		float angNorVel = raXn.dot(angVel0) - rbXn.dot(angVel1);

		// The slop is scaled up by the angular/linear normal velocity ratio, but never below solverOffsetSlop.
		const PxReal slop = solverOffsetSlop * PxMax(linNorVel == 0.f ? 1.f : angNorVel / linNorVel, 1.f);

		// Zero angular components whose magnitude is below the slop.
		raXn.x = PxAbs(raXn.x) < slop ? 0.f : raXn.x;
		raXn.y = PxAbs(raXn.y) < slop ? 0.f : raXn.y;
		raXn.z = PxAbs(raXn.z) < slop ? 0.f : raXn.z;

		rbXn.x = PxAbs(rbXn.x) < slop ? 0.f : rbXn.x;
		rbXn.y = PxAbs(rbXn.y) < slop ? 0.f : rbXn.y;
		rbXn.z = PxAbs(rbXn.z) < slop ? 0.f : rbXn.z;

		const Cm::UnAlignedSpatialVector response0 = createImpulseResponseVector(b0, Cm::UnAlignedSpatialVector(raXn, normal));
		const Cm::UnAlignedSpatialVector response1 = createImpulseResponseVector(b1, Cm::UnAlignedSpatialVector(-rbXn, -normal));

		const Cm::UnAlignedSpatialVector deltaV0 = getImpulseResponse(b0, Cm::UnAlignedSpatialVector(normal, raXn).scale(lin0X_ang0Y_lin1Z_ang1W.x, lin0X_ang0Y_lin1Z_ang1W.y));
		const Cm::UnAlignedSpatialVector deltaV1 = getImpulseResponse(b1, Cm::UnAlignedSpatialVector(-normal, -rbXn).scale(lin0X_ang0Y_lin1Z_ang1W.z, lin0X_ang0Y_lin1Z_ang1W.w));

		const float resp0 = deltaV0.dot(response0);
		const float resp1 = deltaV1.dot(response1);

		const float unitResponse = resp0 + resp1;

		const float vrel1 = norVel0 + projectAngular(b0, raXn);
		const float vrel2 = norVel1 + projectAngular(b1, rbXn);
		const float vrel = vrel1 - vrel2;

		const float penetration = separation - restDistance;
		const bool isSeparated = (penetration > 0.0f);
		const float penetrationInvDt = penetration * invTotalDt;

		float scaledBias, velMultiplier;

		const float recipResponse = (unitResponse > 0.f) ? (1.f / (unitResponse + cfm)) : 0.f;

		// To compute velMultiplier and scaledBias every sub-timestep or iteration, additional data is stored:
		// coeff0, coeff1. Using coeff0 and coeff1 produces the same results as the previous implementation when
		// mass-splitting does not occur.

		PxReal compliantContactCoeff = 0.f;

		if (restitution < 0.f)
		{
			// Negative restitution selects the compliant contact path.
			const bool collidingWithVrel = ((-vrel * totalDt) > penetration); // Note: using totalDt here instead of penetrationInvDt because the latter has a fudge factor if there are velocity iterations
			compliantContactCoeff = computeCompliantContactCoefficientsTGS(stepDt, flags, restitution, damping, unitResponse, recipResponse,
				isSeparated, collidingWithVrel, velMultiplier, scaledBias);
		}
		else
		{
			velMultiplier = recipResponse;
			scaledBias = isSeparated ? -invDt : -invDtp8;
		}

		solverContact->resp0[threadIndex] = resp0;
		solverContact->resp1[threadIndex] = resp1;

		minPen = PxMin(penetration, minPen);

		// Restitution is applied only for a non-negligible restitution coefficient, an approach speed beyond the
		// bounce threshold, and a closing speed larger than what is needed to just reach the surface.
		const bool isGreater2 = ((restitution > 1e-5f) && (bounceThreshold > vrel)) && ((-vrel) > penetrationInvDt);

		float totalError = penetration;

		// The parentheses are required: '?:' binds more loosely than '+', so without them the condition would be
		// (cTargetVel + isGreater2) and the contact's target velocity would be discarded.
		float targetVelocity = cTargetVel + (isGreater2 ? (-vrel) * restitution : 0.f);//FAdd(cTargetVel, FSel(isGreater2, FMul(FNeg(sumVRel), restitution), zero));

		// Estimate of the impulse this contact would apply; accumulated for minNormalForce.
		const PxReal deltaF = fmaxf((targetVelocity + (-penetrationInvDt - vrel)) * velMultiplier, 0.f);
		normalForce += deltaF;

		if (b0.isKinematic)
			targetVelocity -= vrel1;
		if (b1.isKinematic)
			targetVelocity += vrel2;

		if (isGreater2)
		{
			const PxReal ratio = totalDt + penetration/vrel;
			totalError += ratio * targetVelocity;
		}

		solverContact->raXn_extraCoeff[threadIndex] = make_float4(response0.top.x, response0.top.y, response0.top.z, compliantContactCoeff);
		solverContact->rbXn_targetVelW[threadIndex] = make_float4(-response1.top.x, -response1.top.y, -response1.top.z, targetVelocity);
		solverContact->appliedForce[threadIndex] = 0.f;
		solverContact->maxImpulse[threadIndex] = PxMin(maxImpulse, targetVel4_maxImpulseW.w);
		solverContact->separation[threadIndex] = totalError;
		solverContact->biasCoefficient[threadIndex] = scaledBias;

		artiResponse->deltaRALin_x[threadIndex] = deltaV0.bottom.x;
		artiResponse->deltaRALin_y[threadIndex] = deltaV0.bottom.y;
		artiResponse->deltaRALin_z[threadIndex] = deltaV0.bottom.z;
		artiResponse->deltaRAAng_x[threadIndex] = deltaV0.top.x;
		artiResponse->deltaRAAng_y[threadIndex] = deltaV0.top.y;
		artiResponse->deltaRAAng_z[threadIndex] = deltaV0.top.z;
		artiResponse->deltaRBLin_x[threadIndex] = deltaV1.bottom.x;
		artiResponse->deltaRBLin_y[threadIndex] = deltaV1.bottom.y;
		artiResponse->deltaRBLin_z[threadIndex] = deltaV1.bottom.z;
		artiResponse->deltaRBAng_x[threadIndex] = deltaV1.top.x;
		artiResponse->deltaRBAng_y[threadIndex] = deltaV1.top.y;
		artiResponse->deltaRBAng_z[threadIndex] = deltaV1.top.z;
		artiResponse++;
	}

	// A quarter of the average estimated normal impulse.
	params.blockContactHeader->minNormalForce[threadIndex] = (normalForce / contactCount)*0.25f;

	// With two anchors each one carries half of the friction coefficient.
	PxReal frictionScale = (aCount == 2) ? 0.5f : 1.f;

	const PxReal staticFriction = data.staticFriction*frictionScale;
	const PxReal dynamicFriction = data.dynamicFriction*frictionScale;

	const bool haveFriction = (anchorCount != 0);
	params.blockContactHeader->numNormalConstr[threadIndex] = contactCount;
	params.blockFrictionHeader->numFrictionConstr[threadIndex] = anchorCount;

	params.blockContactHeader->normal_staticFriction[threadIndex] = make_float4(normal.x, normal.y, normal.z, staticFriction);
	params.blockFrictionHeader->dynamicFriction[threadIndex] = dynamicFriction;

	// ---------------------------------------------------------------------------------------
	// Friction constraints
	// ---------------------------------------------------------------------------------------
	if (haveFriction)
	{
		const PxVec3 vrel = linVel0 - linVel1;

		const float orthoThreshold = 0.70710678f;
		const float p1 = 0.0001f;
		// fallback: normal.cross((1,0,0)) or normal.cross((0,0,1))
		const float normalX = normal.x;
		const float normalY = normal.y;
		const float normalZ = normal.z;

		PxVec3 t0Fallback1(0.f, -normalZ, normalY);
		PxVec3 t0Fallback2(-normalY, normalX, 0.f);
		PxVec3 t0Fallback = orthoThreshold > PxAbs(normalX) ? t0Fallback1 : t0Fallback2;

		// First tangent: the relative linear velocity projected onto the contact plane, unless that
		// projection is too small, in which case the fallback axis is used.
		PxVec3 t0 = vrel - normal * (normal.dot(vrel));
		t0 = t0.magnitudeSquared() > p1 ? t0 : t0Fallback;
		t0.normalize();

		const PxVec3 t1 = normal.cross(t0);

		params.blockFrictionHeader->broken[threadIndex] = 0;

		params.blockFrictionHeader->frictionNormals[0][threadIndex] = make_float4(t0.x, t0.y, t0.z, 0.f);
		params.blockFrictionHeader->frictionNormals[1][threadIndex] = make_float4(t1.x, t1.y, t1.z, 0.f);

		const PxVec3 relTr = bodyFrame0p - bodyFrame1p;

		for (PxU32 j = 0; j < aCount; j++)
		{
			PxgTGSBlockSolverContactFriction* PX_RESTRICT f0 = &params.blockFrictions[2 * j];
			PxgTGSBlockSolverContactFriction* PX_RESTRICT f1 = &params.blockFrictions[2 * j + 1];

			const float4 body0Anchor4 = fAnchor.body0Anchors[j][threadIndex];
			const float4 body1Anchor4 = fAnchor.body1Anchors[j][threadIndex];

			const PxVec3 body0Anchor(body0Anchor4.x, body0Anchor4.y, body0Anchor4.z);
			const PxVec3 body1Anchor(body1Anchor4.x, body1Anchor4.y, body1Anchor4.z);

			// Anchors are stored in body space; rotate them by each body's orientation.
			const PxVec3 ra = b0.body2World.q.rotate(body0Anchor);
			const PxVec3 rb = b1.body2World.q.rotate(body1Anchor);

			PxU32 index = perPointFriction ? frictionPatch.contactID[j][threadIndex] : 0;

			const float4 targetVel4 = contacts[index].targetVel_maxImpulseW[threadIndex];
			const PxVec3 tvel(targetVel4.x, targetVel4.y, targetVel4.z);

			// Positional drift between the two anchors: (ra + p0) - (rb + p1).
			const PxVec3 error = (ra - rb) + relTr;

			// Friction row along t0.
			{
				PxVec3 raXn = ra.cross(t0);
				PxVec3 rbXn = rb.cross(t0);

				raXn.x = PxAbs(raXn.x) < solverOffsetSlop ? 0.f : raXn.x;
				raXn.y = PxAbs(raXn.y) < solverOffsetSlop ? 0.f : raXn.y;
				raXn.z = PxAbs(raXn.z) < solverOffsetSlop ? 0.f : raXn.z;

				rbXn.x = PxAbs(rbXn.x) < solverOffsetSlop ? 0.f : rbXn.x;
				rbXn.y = PxAbs(rbXn.y) < solverOffsetSlop ? 0.f : rbXn.y;
				rbXn.z = PxAbs(rbXn.z) < solverOffsetSlop ? 0.f : rbXn.z;

				const Cm::UnAlignedSpatialVector response0 = createImpulseResponseVector(b0, Cm::UnAlignedSpatialVector(raXn, t0));
				const Cm::UnAlignedSpatialVector response1 = createImpulseResponseVector(b1, Cm::UnAlignedSpatialVector(-rbXn, -t0));

				const Cm::UnAlignedSpatialVector deltaV0 = getImpulseResponse(b0, Cm::UnAlignedSpatialVector(t0, raXn).scale(lin0X_ang0Y_lin1Z_ang1W.x, lin0X_ang0Y_lin1Z_ang1W.y));
				const Cm::UnAlignedSpatialVector deltaV1 = getImpulseResponse(b1, Cm::UnAlignedSpatialVector(-t0, -rbXn).scale(lin0X_ang0Y_lin1Z_ang1W.z, lin0X_ang0Y_lin1Z_ang1W.w));

				const float resp0 = deltaV0.dot(response0);
				const float resp1 = deltaV1.dot(response1);

				float targetVel = tvel.dot(t0);

				if (b0.isKinematic)
					targetVel -= projectVelocity(b0, Cm::UnAlignedSpatialVector(raXn, t0));
				if (b1.isKinematic)
					targetVel += projectVelocity(b1, Cm::UnAlignedSpatialVector(rbXn, t0));

				const float frictionError = error.dot(t0);

				f0->resp0[threadIndex] = resp0;
				f0->resp1[threadIndex] = resp1;
				f0->raXn_error[threadIndex] = make_float4(response0.top.x, response0.top.y, response0.top.z, frictionError);
				f0->rbXn_targetVelW[threadIndex] = make_float4(-response1.top.x, -response1.top.y, -response1.top.z, targetVel);
				f0->appliedForce[threadIndex] = 0.f;

				artiResponse->deltaRALin_x[threadIndex] = deltaV0.bottom.x;
				artiResponse->deltaRALin_y[threadIndex] = deltaV0.bottom.y;
				artiResponse->deltaRALin_z[threadIndex] = deltaV0.bottom.z;
				artiResponse->deltaRAAng_x[threadIndex] = deltaV0.top.x;
				artiResponse->deltaRAAng_y[threadIndex] = deltaV0.top.y;
				artiResponse->deltaRAAng_z[threadIndex] = deltaV0.top.z;

				artiResponse->deltaRBLin_x[threadIndex] = deltaV1.bottom.x;
				artiResponse->deltaRBLin_y[threadIndex] = deltaV1.bottom.y;
				artiResponse->deltaRBLin_z[threadIndex] = deltaV1.bottom.z;
				artiResponse->deltaRBAng_x[threadIndex] = deltaV1.top.x;
				artiResponse->deltaRBAng_y[threadIndex] = deltaV1.top.y;
				artiResponse->deltaRBAng_z[threadIndex] = deltaV1.top.z;
				artiResponse++;
			}

			// Friction row along t1.
			{
				PxVec3 raXn = ra.cross(t1);
				PxVec3 rbXn = rb.cross(t1);

				raXn.x = PxAbs(raXn.x) < solverOffsetSlop ? 0.f : raXn.x;
				raXn.y = PxAbs(raXn.y) < solverOffsetSlop ? 0.f : raXn.y;
				raXn.z = PxAbs(raXn.z) < solverOffsetSlop ? 0.f : raXn.z;

				rbXn.x = PxAbs(rbXn.x) < solverOffsetSlop ? 0.f : rbXn.x;
				rbXn.y = PxAbs(rbXn.y) < solverOffsetSlop ? 0.f : rbXn.y;
				rbXn.z = PxAbs(rbXn.z) < solverOffsetSlop ? 0.f : rbXn.z;

				const Cm::UnAlignedSpatialVector response0 = createImpulseResponseVector(b0, Cm::UnAlignedSpatialVector(raXn, t1));
				const Cm::UnAlignedSpatialVector response1 = createImpulseResponseVector(b1, Cm::UnAlignedSpatialVector(-rbXn, -t1));

				const Cm::UnAlignedSpatialVector deltaV0 = getImpulseResponse(b0, Cm::UnAlignedSpatialVector(t1, raXn).scale(lin0X_ang0Y_lin1Z_ang1W.x, lin0X_ang0Y_lin1Z_ang1W.y));
				const Cm::UnAlignedSpatialVector deltaV1 = getImpulseResponse(b1, Cm::UnAlignedSpatialVector(-t1, -rbXn).scale(lin0X_ang0Y_lin1Z_ang1W.z, lin0X_ang0Y_lin1Z_ang1W.w));

				const float resp0 = deltaV0.dot(response0);
				const float resp1 = deltaV1.dot(response1);

				float targetVel = tvel.dot(t1);

				if (b0.isKinematic)
					targetVel -= projectVelocity(b0, Cm::UnAlignedSpatialVector(raXn, t1));
				if (b1.isKinematic)
					targetVel += projectVelocity(b1, Cm::UnAlignedSpatialVector(rbXn, t1));

				const float frictionError = error.dot(t1);
				f1->resp0[threadIndex] = resp0;
				f1->resp1[threadIndex] = resp1;
				f1->raXn_error[threadIndex] = make_float4(response0.top.x, response0.top.y, response0.top.z, frictionError);
				f1->rbXn_targetVelW[threadIndex] = make_float4(-response1.top.x, -response1.top.y, -response1.top.z, targetVel);
				f1->appliedForce[threadIndex] = 0.f;

				artiResponse->deltaRALin_x[threadIndex] = deltaV0.bottom.x;
				artiResponse->deltaRALin_y[threadIndex] = deltaV0.bottom.y;
				artiResponse->deltaRALin_z[threadIndex] = deltaV0.bottom.z;
				artiResponse->deltaRAAng_x[threadIndex] = deltaV0.top.x;
				artiResponse->deltaRAAng_y[threadIndex] = deltaV0.top.y;
				artiResponse->deltaRAAng_z[threadIndex] = deltaV0.top.z;
				artiResponse->deltaRBLin_x[threadIndex] = deltaV1.bottom.x;
				artiResponse->deltaRBLin_y[threadIndex] = deltaV1.bottom.y;
				artiResponse->deltaRBLin_z[threadIndex] = deltaV1.bottom.z;
				artiResponse->deltaRBAng_x[threadIndex] = deltaV1.top.x;
				artiResponse->deltaRBAng_y[threadIndex] = deltaV1.top.y;
				artiResponse->deltaRBAng_z[threadIndex] = deltaV1.top.z;
				artiResponse++;
			}
		}

		if (hasTorsional)
		{
			//Setup torsional...
			// hasTorsional implies exactly one anchor, so rows 0 and 1 hold its tangential friction and row 2 is free.
			PxgTGSBlockSolverContactFriction* PX_RESTRICT f0 = &params.blockFrictions[2];

			// Scale is the larger of torsionalFrictionData.y and sqrt(deepest penetration depth * torsionalFrictionData.x).
			const PxReal torsionalScale = PxMax(torsionalFrictionData.y, PxSqrt(PxMax(0.f, -minPen)*torsionalFrictionData.x));

			params.blockFrictionHeader->torsionalFrictionScale[threadIndex] = torsionalScale;

			// Purely angular row about the contact normal.
			const Cm::UnAlignedSpatialVector response0 = createImpulseResponseVector(b0, Cm::UnAlignedSpatialVector(normal, PxVec3(0.f)));
			const Cm::UnAlignedSpatialVector response1 = createImpulseResponseVector(b1, Cm::UnAlignedSpatialVector(-normal, PxVec3(0.f)));

			const Cm::UnAlignedSpatialVector deltaV0 = getImpulseResponse(b0, Cm::UnAlignedSpatialVector(PxVec3(0.f), normal).scale(lin0X_ang0Y_lin1Z_ang1W.x, lin0X_ang0Y_lin1Z_ang1W.y));
			const Cm::UnAlignedSpatialVector deltaV1 = getImpulseResponse(b1, Cm::UnAlignedSpatialVector(PxVec3(0.f), -normal).scale(lin0X_ang0Y_lin1Z_ang1W.z, lin0X_ang0Y_lin1Z_ang1W.w));

			const float resp0 = deltaV0.dot(response0);
			const float resp1 = deltaV1.dot(response1);

			const float frictionError = 0.f;

			f0->resp0[threadIndex] = resp0;
			f0->resp1[threadIndex] = resp1;
			f0->raXn_error[threadIndex] = make_float4(response0.top.x, response0.top.y, response0.top.z, frictionError);
			f0->rbXn_targetVelW[threadIndex] = make_float4(-response1.top.x, -response1.top.y, -response1.top.z, 0.f);
			f0->appliedForce[threadIndex] = 0.f;

			artiResponse->deltaRALin_x[threadIndex] = deltaV0.bottom.x;
			artiResponse->deltaRALin_y[threadIndex] = deltaV0.bottom.y;
			artiResponse->deltaRALin_z[threadIndex] = deltaV0.bottom.z;
			artiResponse->deltaRAAng_x[threadIndex] = deltaV0.top.x;
			artiResponse->deltaRAAng_y[threadIndex] = deltaV0.top.y;
			artiResponse->deltaRAAng_z[threadIndex] = deltaV0.top.z;
			artiResponse->deltaRBLin_x[threadIndex] = deltaV1.bottom.x;
			artiResponse->deltaRBLin_y[threadIndex] = deltaV1.bottom.y;
			artiResponse->deltaRBLin_z[threadIndex] = deltaV1.bottom.z;
			artiResponse->deltaRBAng_x[threadIndex] = deltaV1.top.x;
			artiResponse->deltaRBAng_y[threadIndex] = deltaV1.top.y;
			artiResponse->deltaRBAng_z[threadIndex] = deltaV1.top.z;
			artiResponse++;
		}
	}
}

// Per-lane driver for contact preparation of one constraint in a block batch:
//  1. correlates the current contacts into the friction patch,
//  2. clears the patch's broken flag,
//  3. unless friction is disabled: fetches friction patches from the previous frame's data (skipped for
//     per-point friction) and then grows the patch from the current contacts,
//  4. hands everything to setupFinalizeExtSolverConstraintsBlock to write the solver constraints.
//
// threadIndex addresses this lane's slot in every block-format structure. The material record
// (contact count, solver flags, rest distance, previous friction patch count) is read from
// contactData->contactData[threadIndex].
template <typename ContactParams>
static __device__ void artiCreateFinalizeSolverContactsBlockGPU(PxgBlockContactData* contactData,
	const PxgBlockContactPoint* PX_RESTRICT contactPoints,
	PxgBlockFrictionPatch& frictionPatch,
	const PxgBlockFrictionPatch* PX_RESTRICT prevFrictionPatches,
	PxgBlockFrictionAnchorPatch& fAnchor,
	const PxgBlockFrictionAnchorPatch* PX_RESTRICT prevFrictionAnchors,
	const PxgBlockFrictionIndex* PX_RESTRICT prevFrictionIndices,
	const PxgSolverExtBody2& body0,
	const PxgSolverExtBody2& body1,
	const PxReal invDt,
	const PxReal dt,
	const PxReal invTotalDt,
	const PxReal bounceThresholdF32,
	const PxReal frictionOffsetThreshold,
	const PxReal correlationDistance,
	const PxReal biasCoefficient,
	const PxU32 threadIndex,
	PxU32 forceWritebackBufferOffset,
	ContactParams& params,
	PxgArticulationBlockResponse* PX_RESTRICT response,
	PxU32 totalEdges,
	PxU32 prevFrictionStartIndex,
	PxReal ccdMaxSeparation,
	PxReal solverOffsetSlop,
	const float2 torsionalFrictionData,
	const PxReal totalDt)
{
	// Fetch this lane's material record as a single float4 and view it as PxgMaterialContactData.
	const float4 packedMaterial = reinterpret_cast<float4&>(contactData->contactData[threadIndex]);
	const PxgMaterialContactData material = reinterpret_cast<const PxgMaterialContactData&>(packedMaterial);

	const PxU32 contactCount = material.mNumContacts;

	// Decode the solver flags that steer the friction handling below.
	const PxU8 solverFlags = material.mSolverFlags;
	const bool perPointFriction = (solverFlags & (PxgSolverContactFlags::ePER_POINT_FRICTION)) != 0;
	const bool frictionEnabled = (solverFlags & PxgSolverContactFlags::eDISABLE_FRICTION) == 0;

	// xyz of this float4 is the patch normal (w carries the restitution, unused here).
	const float4 normalAndRestitution = contactData->normal_restitutionW[threadIndex];
	const PxVec3 patchNormal(normalAndRestitution.x, normalAndRestitution.y, normalAndRestitution.z);

	correlatePatches(frictionPatch, contactPoints, contactCount, patchNormal,
		body0.body2World, body1.body2World, PXC_SAME_NORMAL, threadIndex);

	//KS - ensure that the friction patch broken bit is set to 0
	frictionPatch.broken[threadIndex] = 0;

	// NOTE(review): left unset here; presumably an output of getFrictionPatches. On the per-point-friction
	// path growPatches receives it without getFrictionPatches having run - confirm growPatches does not read it.
	PxReal patchExtents;

	__syncwarp(); //Ensure writes from correlation are visible

	if (frictionEnabled)
	{
		// Previous-frame friction data is only consulted when friction is patch based.
		if (!perPointFriction)
		{
			getFrictionPatches(frictionPatch, fAnchor, prevFrictionIndices, prevFrictionStartIndex, prevFrictionPatches,
				prevFrictionAnchors, material.prevFrictionPatchCount, body0.body2World, body1.body2World, correlationDistance, threadIndex, totalEdges, patchExtents,
				contactCount);
		}

		// The friction offset threshold is measured relative to the rest distance.
		const PxReal growThreshold = frictionOffsetThreshold + material.restDistance;
		growPatches(frictionPatch, fAnchor, contactPoints, contactCount, body0.body2World, body1.body2World, growThreshold, threadIndex, patchExtents);
	}

	// Write the headers, normal/friction constraints and articulation responses for this lane.
	setupFinalizeExtSolverConstraintsBlock(*contactData, contactPoints, contactCount,
		frictionPatch, fAnchor, body0, body1, invDt, dt, invTotalDt,
		bounceThresholdF32, biasCoefficient, threadIndex, forceWritebackBufferOffset, perPointFriction,
		params, response, material, ccdMaxSeparation, solverOffsetSlop, torsionalFrictionData, totalDt);
}

// Points the four output pointers of 'params' at the slots of 'iterativeData' that belong to one batch:
//  * contact and friction headers are addressed by the batch index (descIndexBatch),
//  * contact points start at startConstraintIndex,
//  * friction constraints start at startFrictionIndex.
// Only addresses are taken; no element of iterativeData is read or modified.
template<typename ContactParams, typename IterativeData>
__device__ void fillBlockParams(IterativeData& iterativeData, ContactParams& params,
	const PxU32 descIndexBatch, const PxU32 startConstraintIndex, const PxU32 startFrictionIndex)
{
	// Constraint rows: each stream has its own start offset.
	params.blockFrictions     = &iterativeData.blockFrictions[startFrictionIndex];
	params.blockContactPoints = &iterativeData.blockContactPoints[startConstraintIndex];

	// Headers: both are indexed by the batch.
	params.blockFrictionHeader = &iterativeData.blockFrictionHeaders[descIndexBatch];
	params.blockContactHeader  = &iterativeData.blockContactHeaders[descIndexBatch];
}

template<typename ContactParams, typename IterativeData>
__device__ void artiContactConstraintBlockPrepare(
	PxgConstraintPrepareDesc* constraintPrepDesc,
	PxgSolverSharedDesc<IterativeData>* sharedDesc,
	const PxReal invDt, 
	const PxReal dt,
	const PxReal invTotalDt,
	const PxReal totalDt)
{
	PxgBlockWorkUnit* workUnits = constraintPrepDesc->blockWorkUnit;

	const PxU32 warpSize = WARP_SIZE;

	const PxU32 blockStride = blockDim.y;

	//This identifies which warp a specific thread is in, we treat all warps in all blocks as a flatten warp array
	//and we are going to index the work based on that
	const PxU32 warpIndex = blockIdx.x * blockStride + threadIdx.y;

	//This identifies which thread within a warp a specific thread is
	const PxU32 threadIndexInWarp = threadIdx.x&(warpSize - 1);

	assert(blockDim.x == WARP_SIZE);
	const PxU32 threadIndexInBlock = threadIdx.x + WARP_SIZE * threadIdx.y;

	//total numbers of warps in all blocks
	//const PxU32 totalNumWarps = blockStride * gridDim.x;

	//PxF32* baseForceStream = constraintPrepDesc->forceBuffer;

	const PxU32 totalPreviousEdges = constraintPrepDesc->totalPreviousEdges;
	const PxU32 totalCurrentEdges = constraintPrepDesc->totalCurrentEdges;
	//We need to check against both. Static contact batches are handled separately in the internal solver...
	const PxU32 nbContactBatches = constraintPrepDesc->numArtiContactBatches + constraintPrepDesc->numArtiStaticContactBatches;// + constraintPrepDesc->numArtiSelfContactBatches;
	const PxU32 nbContactBatchesWithSelf = nbContactBatches + constraintPrepDesc->numArtiSelfContactBatches;
	const PxU32 offset = constraintPrepDesc->numBatches; //Offset = number of non-articulation batches!

	__shared__ PxgSolverBodyData* solverBodyDatas;
	__shared__ PxgSolverTxIData* solverTxIDatas;

	__shared__ PxgArticulationBlockResponse* responses;
	__shared__ PxU32* batchIndices;
	__shared__ PxgBlockFrictionIndex* frictionIndices;
	__shared__ PxgBlockFrictionIndex* prevFrictionIndices;
	__shared__ PxgBlockContactPoint* contactBase;
	__shared__ PxgBlockConstraintBatch* constraintBatch;
	__shared__ PxgBlockContactData* contactCurrentPrepPool;

	__shared__ PxgBlockFrictionPatch* prevFrictionPatches;
	__shared__ PxgBlockFrictionPatch* currFrictionPatches;
	__shared__ PxgBlockFrictionAnchorPatch* prevFrictionAnchors;
	__shared__ PxgBlockFrictionAnchorPatch* currFrictionAnchors;

	__shared__ PxAlignedTransform* bodyFrames;

	
	if (threadIndexInBlock == 0)
	{
		solverBodyDatas = constraintPrepDesc->solverBodyDataPool;
		solverTxIDatas = constraintPrepDesc->solverBodyTxIDataPool;

		//contactHeaders = sharedDesc->iterativeData.blockContactHeaders;
		//frictionHeaders = sharedDesc->iterativeData.blockFrictionHeaders;
		//contactPoints = sharedDesc->iterativeData.blockContactPoints;
		responses = sharedDesc->iterativeData.artiResponse;
		//frictions = sharedDesc->iterativeData.blockFrictions;
		batchIndices = constraintPrepDesc->artiContactConstraintBatchIndices;
		frictionIndices = constraintPrepDesc->blockCurrentFrictionIndices;
		prevFrictionIndices = constraintPrepDesc->blockPreviousFrictionIndices;

		contactBase = constraintPrepDesc->blockContactPoints;
		constraintBatch = sharedDesc->iterativeData.blockConstraintBatch;
		contactCurrentPrepPool = constraintPrepDesc->blockContactCurrentPrepPool;
		currFrictionPatches = sharedDesc->blockCurrentFrictionPatches;
		prevFrictionPatches = sharedDesc->blockPreviousFrictionPatches;
		prevFrictionAnchors = constraintPrepDesc->blockPreviousAnchorPatches;
		currFrictionAnchors = constraintPrepDesc->blockCurrentAnchorPatches;
		bodyFrames = constraintPrepDesc->body2WorldPool;

		//printf("offset %i\n", offset);
	}

	__syncthreads();

	PxU32 i = warpIndex;
	//unsigned mask_nbContactBatches = __ballot_sync(FULL_MASK, i < nbContactBatches);
	if (i < nbContactBatchesWithSelf)
	{
		const PxU32 batchIndex = batchIndices[i] + offset;

		
		PxgBlockConstraintBatch& batch = constraintBatch[batchIndex];

		const PxU32 descIndexBatch = batch.mConstraintBatchIndex;
		const PxU32 responseIndex = batch.mArticulationResponseIndex;

		const PxU32 descStride = batch.mDescStride;

		//printf("Prep: Index = %i, descStride = %i, descIndexInBatch = %i\n", threadIndexInWarp, descStride, descIndexBatch);

		
		if (threadIndexInWarp < descStride)
		{
			const PxNodeIndex nodeIndexA = batch.bodyANodeIndex[threadIndexInWarp];
			const PxNodeIndex nodeIndexB = batch.bodyBNodeIndex[threadIndexInWarp];

			const PxU32 bodyAIndex = nodeIndexA.isArticulation() ? batch.remappedBodyAIndex[threadIndexInWarp] : batch.bodyAIndex[threadIndexInWarp];
			const PxU32 bodyBIndex = nodeIndexB.isArticulation() ? batch.remappedBodyBIndex[threadIndexInWarp] : batch.bodyBIndex[threadIndexInWarp];

			ContactParams params;
			fillBlockParams<ContactParams, IterativeData>(sharedDesc->iterativeData, params, descIndexBatch, batch.startConstraintIndex, batch.startFrictionIndex);

			//printf("Index = %i, descStride = %i, descIndexInBatch = %i\n", threadIndexInWarp, descStride, descIndexBatch);
			
			//printf("currFrictionPatches = %p, currFrictionAnchors = %p\n", currFrictionPatches, currFrictionAnchors);
			//port contact code
			PxgBlockContactData& contactData = contactCurrentPrepPool[descIndexBatch];
			PxgBlockContactPoint* baseContact = contactBase + batch.blockContactIndex;
			//printf("ContactCurrentPrepPool = %p, batch.blockContactIndex = %i\n", contactCurrentPrepPool, batch.blockContactIndex);
			PxgBlockFrictionPatch& frictionPatch = currFrictionPatches[descIndexBatch];
			PxgBlockFrictionAnchorPatch& fAnchor = currFrictionAnchors[descIndexBatch];
			//if(i >= nbContactBatches)
				/*printf("%i: currFrictionPatches = %p, currFrictionAnchors = %p, workUnits = %p, descIndexInBatch %i\n", i, currFrictionPatches, currFrictionAnchors,
				&workUnits[descIndexBatch], descIndexBatch);*/

			//Fill in correlation information for next frame...

			PxgBlockWorkUnit& unit = workUnits[descIndexBatch];

			PxgBlockFrictionIndex index;
			index.createPatchIndex(descIndexBatch, threadIndexInWarp);

			//printf("PxgBlockFrictionIndex filled in unit.mEdgeIndex = %i, unit.mPatchIndex = %i\n", unit.mEdgeIndex[threadIndexInWarp],
			//	unit.mPatchIndex[threadIndexInWarp]);

			//PxU32 frictionIndex = unit.mFrictionIndex[threadIndexInWarp];
			PxU32 edgeIndex = unit.mEdgeIndex[threadIndexInWarp];
			PxU32 frictionIndex = edgeIndex + totalCurrentEdges * unit.mPatchIndex[threadIndexInWarp];

			//printf("EdgeIndex = %i, frictionIndex = i, frictionIndices = %p\n", edgeIndex, frictionIndex, frictionIndices);

			PxgBlockFrictionIndex* targetIndex = &frictionIndices[frictionIndex];

			//printf("edgeIndex = %i, frictionIndex = %i\n", edgeIndex, frictionIndex);

			*reinterpret_cast<uint2*>(targetIndex) = reinterpret_cast<uint2&>(index);

			PxgSolverExtBody2 b0, b1;

			createPxgSolverExtBody(nodeIndexA, bodyAIndex, sharedDesc->articulations, solverBodyDatas, solverTxIDatas, b0, bodyFrames);
			createPxgSolverExtBody(nodeIndexB, bodyBIndex, sharedDesc->articulations, solverBodyDatas, solverTxIDatas, b1, bodyFrames);

			PxReal offsetSlop = PxMax(b0.offsetSlop, b1.offsetSlop);

			//printf("Created solverExtBodies\n");

			PxU32 offset = unit.mWriteback[threadIndexInWarp];
			/*artiCreateFinalizeSolverContactsBlockGPU(&contactData, baseContact, frictionPatch, prevFrictionPatches, fAnchor, prevFrictionAnchors, prevFrictionIndices, 
				b0, b1, sharedDesc->invDtF32, constraintPrepDesc->bounceThresholdF32, constraintPrepDesc->frictionOffsetThreshold, constraintPrepDesc->correlationDistance,
				threadIndexInWarp, offset, &contactHeaders[descIndexBatch], &frictionHeaders[descIndexBatch], &contactPoints[batch.startConstraintIndex],
				&frictions[batch.startFrictionIndex], &responses[responseIndex], totalPreviousEdges, edgeIndex, constraintPrepDesc->ccdMaxSeparation, constraintPrepDesc->solverOffsetSlop);*/
			//if (i<nbContactBatches)
			artiCreateFinalizeSolverContactsBlockGPU(&contactData, baseContact, frictionPatch, prevFrictionPatches, fAnchor, prevFrictionAnchors, prevFrictionIndices,
				b0, b1, invDt, dt, invTotalDt, constraintPrepDesc->bounceThresholdF32, constraintPrepDesc->frictionOffsetThreshold, constraintPrepDesc->correlationDistance,
				constraintPrepDesc->biasCoefficient, threadIndexInWarp, offset, params, &responses[responseIndex], totalPreviousEdges, edgeIndex, constraintPrepDesc->ccdMaxSeparation, offsetSlop,
				unit.mTorsionalFrictionData[threadIndexInWarp], totalDt);

			frictionPatch.patchIndex[threadIndexInWarp] = unit.mFrictionPatchIndex[threadIndexInWarp];

			PxgBlockFrictionPatch& fpatch = frictionPatch;
			if (fpatch.anchorCount[threadIndexInWarp] >= 1)
				fpatch.anchorPoints[0][threadIndexInWarp] = PxSave3(b0.body2World.transform(PxLoad3(fAnchor.body0Anchors[0][threadIndexInWarp])));
			if (fpatch.anchorCount[threadIndexInWarp] == 2)
				fpatch.anchorPoints[1][threadIndexInWarp] = PxSave3(b0.body2World.transform(PxLoad3(fAnchor.body0Anchors[1][threadIndexInWarp])));
		}
	}

}

// PGS kernel entry point for articulation contact-constraint preparation.
// Forwards to artiContactConstraintBlockPrepare instantiated with the PGS contact
// parameter and solve-data types. The same inverse-dt / dt pair read from sharedDesc
// is supplied for both trailing argument pairs of the callee.
extern "C" __global__ void artiContactConstraintBlockPrepareLaunch(
	PxgConstraintPrepareDesc* constraintPrepDesc,
	PxgSolverSharedDesc<IterativeSolveData>* sharedDesc)
{
	const PxReal timeStep = sharedDesc->dt;
	const PxReal invTimeStep = sharedDesc->invDtF32;

	artiContactConstraintBlockPrepare<PxgContactBlockParams, IterativeSolveData>(
		constraintPrepDesc, sharedDesc,
		invTimeStep, timeStep,
		invTimeStep, timeStep);
}

// TGS kernel entry point for articulation contact-constraint preparation.
// Forwards to artiContactConstraintBlockPrepare instantiated with the TGS contact
// parameter and solve-data types. The first argument pair comes from the step
// quantities (stepInvDtF32 / stepDt), the second pair from invDtF32 / dt.
extern "C" __global__ void artiTGSContactConstraintBlockPrepareLaunch(
	PxgConstraintPrepareDesc* constraintPrepDesc,
	PxgSolverSharedDesc<IterativeSolveDataTGS>* sharedDesc)
{
	const PxReal stepTimeStep = sharedDesc->stepDt;
	const PxReal invStepTimeStep = sharedDesc->stepInvDtF32;
	const PxReal fullTimeStep = sharedDesc->dt;
	const PxReal invFullTimeStep = sharedDesc->invDtF32;

	artiContactConstraintBlockPrepare<PxgTGSContactBlockParams, IterativeSolveDataTGS>(
		constraintPrepDesc, sharedDesc,
		invStepTimeStep, stepTimeStep,
		invFullTimeStep, fullTimeStep);
}

// Fills sortedRowIndices[0 .. numRows) with the row indices of this lane's constraint,
// ordered by ascending solveHint. Implemented as an insertion sort: a row is only moved
// in front of rows whose hint is strictly greater, so rows with equal hints keep their
// original relative order.
//
// sortedRowIndices - per-thread output buffer; must hold at least mNumRows entries
// constraintData   - supplies mNumRows for the lane
// rowVelocities    - not used by this function
// rowParameters    - supplies the per-row solveHint values
// threadIndex      - lane used to index the per-lane arrays
static __device__ void preprocessRowsBlock(
	PxU32* sortedRowIndices, 
	PxgBlockConstraint1DData* constraintData,
	PxgBlockConstraint1DVelocities* rowVelocities, 
	PxgBlockConstraint1DParameters* rowParameters,
	const PxU32 threadIndex)
{
	using namespace physx;

	const PxU32 rowCount = constraintData->mNumRows[threadIndex];

	for (PxU32 row = 0; row < rowCount; ++row)
	{
		PxgBlockConstraint1DParameters& candidate = rowParameters[row];

		//PxgBlockConstraint1DParameters::solveHint has only 32 elements
		//(only checked once there is at least one earlier row to compare against)
		assert(row == 0 || threadIndex < 32);

		// Shift every already-placed row with a larger hint one slot up.
		PxU32 slot = row;
		while (slot > 0)
		{
			const PxU32 placedRow = sortedRowIndices[slot - 1];
			if (!(candidate.solveHint[threadIndex] < rowParameters[placedRow].solveHint[threadIndex]))
				break;

			sortedRowIndices[slot] = placedRow;
			--slot;
		}

		sortedRowIndices[slot] = row;
	}
}

//PGS 1D constraints
// Converts the prepared 1D rows of one lane's joint into PGS solver rows.
// Rows are visited in the order given by sortedRowIndices. A row whose combined unit
// response is <= DY_ARTICULATION_MIN_RESPONSE is skipped, so the output arrays
// (constraintsCon / constraintsMod / articulationResponses) are written compactly.
//
// Returns the number of rows actually written for this lane.
// bodyData0 / bodyData1 are not used by this function.
static __device__ PxU32 setUpArti1DConstraintBlock(
	const PxU32* PX_RESTRICT sortedRowIndices,
	PxgBlockConstraint1DData* PX_RESTRICT constraintData,
	PxgBlockConstraint1DVelocities* PX_RESTRICT rowVelocities,
	PxgBlockConstraint1DParameters* PX_RESTRICT rowParameters,
	PxgBlockSolverConstraint1DCon* PX_RESTRICT constraintsCon,
	PxgBlockSolverConstraint1DMod* PX_RESTRICT constraintsMod,
	PxgArticulationBlockResponse* PX_RESTRICT articulationResponses,
	float dt, float recipdt, 
	PxgSolverExtBody2& b0, PxgSolverExtBody2& b1,
	const physx::PxgSolverBodyPrepData* bodyData0, 
	const physx::PxgSolverBodyPrepData* bodyData1,
	const PxU32 threadIndex)
{
	using namespace physx;

	const PxReal erp = 1.0f;

	const PxU32 rowCount = constraintData->mNumRows[threadIndex];
	const float4 invMassScale = constraintData->mInvMassScale[threadIndex].lin0X_ang0Y_lin1Z_ang1W;
	const PxReal cfm = PxMax(b0.cfm, b1.cfm);

	// Body classification does not depend on the row.
	const bool b0IsRigidDynamic = (b0.linkIndex == PxSolverConstraintDesc::RIGID_BODY);
	const bool b1IsRigidDynamic = (b1.linkIndex == PxSolverConstraintDesc::RIGID_BODY);

	PxU32 writtenRows = 0;
	PxgArticulationBlockResponse* PX_RESTRICT responseOut = articulationResponses;

	for (PxU32 row = 0; row < rowCount; ++row)
	{
		// Output slots only advance for rows that survive the degeneracy test below.
		PxgBlockSolverConstraint1DCon& conOut = constraintsCon[writtenRows];
		PxgBlockSolverConstraint1DMod& modOut = constraintsMod[writtenRows];

		const PxU32 sourceRow = sortedRowIndices[row];
		PxgBlockConstraint1DParameters& rowParams = rowParameters[sourceRow];
		PxgBlockConstraint1DVelocities& rowVels = rowVelocities[sourceRow];

		const PxU32 rowFlags = rowParams.flags[threadIndex];

		// Impulse limits are scaled by min(dt, 1) when the row has a drive limit and the
		// constraint declares its drive limits as forces.
		PxReal driveScale = 1.0f;
		if ((rowFlags & Px1DConstraintFlag::eHAS_DRIVE_LIMIT) && (constraintData->mFlags[threadIndex] & PxConstraintFlag::eDRIVE_LIMITS_ARE_FORCES))
			driveScale = fminf(dt, 1.0f);

		const float4 lin0_geomError = rowVels.linear0XYZ_geometricErrorW[threadIndex];
		const float4 lin1_minImpulse = rowVels.linear1XYZ_minImpulseW[threadIndex];
		const float4 ang0_velTarget = rowVels.angular0XYZ_velocityTargetW[threadIndex];
		const float4 ang1_maxImpulse = rowVels.angular1XYZ_maxImpulseW[threadIndex];

		const PxVec3 clin0(lin0_geomError.x, lin0_geomError.y, lin0_geomError.z);
		const PxVec3 clin1(lin1_minImpulse.x, lin1_minImpulse.y, lin1_minImpulse.z);
		const PxVec3 cang0(ang0_velTarget.x, ang0_velTarget.y, ang0_velTarget.z);
		const PxVec3 cang1(ang1_maxImpulse.x, ang1_maxImpulse.y, ang1_maxImpulse.z);

		const PxReal minImpulse = lin1_minImpulse.w * driveScale;
		const PxReal maxImpulse = ang1_maxImpulse.w * driveScale;

		const Cm::UnAlignedSpatialVector response0 = createImpulseResponseVector(b0, Cm::UnAlignedSpatialVector(cang0, clin0));
		const Cm::UnAlignedSpatialVector response1 = createImpulseResponseVector(b1, Cm::UnAlignedSpatialVector(-cang1, -clin1));

		const Cm::UnAlignedSpatialVector deltaV0 = getImpulseResponse(b0, Cm::UnAlignedSpatialVector(clin0, cang0).scale(invMassScale.x, invMassScale.y));
		const Cm::UnAlignedSpatialVector deltaV1 = getImpulseResponse(b1, Cm::UnAlignedSpatialVector(-clin1, -cang1).scale(invMassScale.z, invMassScale.w));

		const float resp0 = deltaV0.dot(response0);
		const float resp1 = deltaV1.dot(response1);
		const float unitResponse = resp0 + resp1;

		// Written before the degeneracy test, exactly as the row slot is reused by the
		// next surviving row if this one is skipped.
		modOut.ang0Writeback[threadIndex] = cang0;

		//Degenerate constraint, can't be solved so skip to avoid computation later in the solver!
		if (unitResponse <= DY_ARTICULATION_MIN_RESPONSE)
			continue;

		// Store the per-body velocity change (bottom -> *Lin_*, top -> *Ang_*).
		responseOut->deltaRALin_x[threadIndex] = deltaV0.bottom.x;
		responseOut->deltaRALin_y[threadIndex] = deltaV0.bottom.y;
		responseOut->deltaRALin_z[threadIndex] = deltaV0.bottom.z;
		responseOut->deltaRAAng_x[threadIndex] = deltaV0.top.x;
		responseOut->deltaRAAng_y[threadIndex] = deltaV0.top.y;
		responseOut->deltaRAAng_z[threadIndex] = deltaV0.top.z;

		responseOut->deltaRBLin_x[threadIndex] = deltaV1.bottom.x;
		responseOut->deltaRBLin_y[threadIndex] = deltaV1.bottom.y;
		responseOut->deltaRBLin_z[threadIndex] = deltaV1.bottom.z;
		responseOut->deltaRBAng_x[threadIndex] = deltaV1.top.x;
		responseOut->deltaRBAng_y[threadIndex] = deltaV1.top.y;
		responseOut->deltaRBAng_z[threadIndex] = deltaV1.top.z;
		++responseOut;

		const bool hasRestitution = (rowFlags & Px1DConstraintFlag::eRESTITUTION) != 0;
		const bool isSpring = (rowFlags & Px1DConstraintFlag::eSPRING) != 0;
		const bool isAccelerationSpring = (rowFlags & Px1DConstraintFlag::eACCELERATION_SPRING) != 0;
		const bool needNormalVel = hasRestitution || (isSpring && isAccelerationSpring);

		// Joint speeds stay at zero unless they are needed by the row or one of the
		// bodies is a rigid body.
		PxReal jointSpeedForRestitutionBounce = 0.0f;
		PxReal initJointSpeed = 0.0f;
		if (needNormalVel || b0IsRigidDynamic || b1IsRigidDynamic)
		{
			const PxReal vel0 = projectVelocity(b0, Cm::UnAlignedSpatialVector(cang0, clin0));
			const PxReal vel1 = projectVelocity(b1, Cm::UnAlignedSpatialVector(cang1, clin1));

			Dy::computeJointSpeedPGS(vel0, b0IsRigidDynamic, vel1, b1IsRigidDynamic, jointSpeedForRestitutionBounce, initJointSpeed);
		}

		// Raising flags for spring and acceleration spring.
		// This is needed for computing contact coefficients every sub-timestep or iteration.
		// See "queryReduced1dConstraintSolverConstantsPGS" and "compute1dConstraintSolverConstantsPGS".
		// NOTE(review): these are OR-ed in before intializeBlock1D runs; this presumes that
		// helper neither clears modOut.flags nor leaves it uninitialised - confirm.
		if (isSpring)
			modOut.flags[threadIndex] |= DY_SC_FLAG_SPRING;

		if (isAccelerationSpring)
			modOut.flags[threadIndex] |= DY_SC_FLAG_ACCELERATION_SPRING;

		intializeBlock1D(rowVels, rowParams, jointSpeedForRestitutionBounce, initJointSpeed, resp0, resp1, erp, dt, recipdt, conOut, modOut,
			response0.bottom, -response1.bottom, response0.top, -response1.top, minImpulse, maxImpulse, cfm, threadIndex);

		if (rowFlags & Px1DConstraintFlag::eOUTPUT_FORCE)
			modOut.flags[threadIndex] |= DY_SC_FLAG_OUTPUT_FORCE;

		++writtenRows;
	}

	return writtenRows;
}


// Fills one TGS solver row (scon) for the given lane from the prepared row data.
// The four w-components of the packed float4 outputs carry the reduced solver
// coefficients returned by Dy::queryReduced1dConstraintSolverConstantsTGS; the xyz parts
// carry the supplied linear/angular response axes. Also stores the geometric error,
// the maximum bias velocity, and resets the accumulated applied force to zero.
//
// resp0, resp1, simDt, minRowResponse and cfm are accepted but not read here.
static __device__ void intializeTGSBlock1D(
	const physx::PxgBlockConstraint1DVelocities& rv,
	const physx::PxgBlockConstraint1DParameters& rp,
	const float jointSpeedForRestitutionBounce, const PxReal initJointSpeed,
	const float resp0, const float resp1, const float erp,
	const float stepDt, const float simDt, const float recipStepDt, const float recipSimDt,
	const float lengthScale, const PxReal minRowResponse,
	const PxVec3& _linear0, const PxVec3& _linear1,
	const PxVec3& _angular0, const PxVec3& _angular1,
	const PxU32 threadIndex,
	PxgTGSBlockSolverConstraint1DCon& scon,
	PxReal cfm)
{
	using namespace physx;

	// Gather the per-lane row inputs.
	const PxU16 rowFlags = PxU16(rp.flags[threadIndex]);
	const PxReal springStiffness = rp.mods.spring.stiffness[threadIndex];
	const PxReal springDamping = rp.mods.spring.damping[threadIndex];
	const PxReal bounceRestitution = rp.mods.bounce.restitution[threadIndex];
	const PxReal bounceThreshold = rp.mods.bounce.velocityThreshold[threadIndex];
	const PxReal geomError = rv.linear0XYZ_geometricErrorW[threadIndex].w;
	const PxReal velTarget = rv.angular0XYZ_velocityTargetW[threadIndex].w;

	const PxReal maxBiasVelocity = Dy::computeMaxBiasVelocityTGS(rowFlags, jointSpeedForRestitutionBounce, bounceThreshold,
		bounceRestitution, geomError, true, lengthScale, recipSimDt);

	// To use different mass for mass-splitting every sub-timestep (or iteration),
	// recipResponse, velMultipler, biasCoefficient, etc. are computed every sub-timestep (or iteration).
	// To compute them every sub-timestep (or iteration), additional 4 coefficients are stored; see queryReduced1dConstraintSolverConstantsTGS.
	// This does not change the previous impulse formulation, but a different mass is used due to mass-splitting.
	PxReal c0, c1, c2, c3;
	Dy::queryReduced1dConstraintSolverConstantsTGS(rowFlags, springStiffness, springDamping, bounceRestitution, bounceThreshold,
	                                               geomError, velTarget, jointSpeedForRestitutionBounce,
	                                               initJointSpeed, erp, stepDt, recipStepDt, c0, c1, c2, c3);

	scon.lin0XYZ_initBiasOrCoeff0[threadIndex] = make_float4(_linear0.x, _linear0.y, _linear0.z, c0);
	scon.lin1XYZ_biasScaleOrCoeff1[threadIndex] = make_float4(_linear1.x, _linear1.y, _linear1.z, c1);
	scon.ang0XYZ_velMultiplierOrCoeff2[threadIndex] = make_float4(_angular0.x, _angular0.y, _angular0.z, c2);
	scon.ang1XYZ_velTargetOrCoeff3[threadIndex] = make_float4(_angular1.x, _angular1.y, _angular1.z, c3);
	scon.geometricError[threadIndex] = geomError;
	scon.maxBias[threadIndex] = maxBiasVelocity;
	scon.appliedForce[threadIndex] = 0.f;
}

//TGS 1D constraint
// Converts the prepared 1D rows of one lane's joint into TGS solver rows.
// Rows are visited in the order given by sortedRowIndices. A row whose combined unit
// response is <= DY_ARTICULATION_MIN_RESPONSE is skipped, so constraintsCon and
// articulationResponses are written compactly.
//
// Returns the number of rows actually written for this lane.
// bodyData0 / bodyData1 are not used by this function.
static __device__ PxU32 setUpArti1DConstraintBlock(
	const PxU32* PX_RESTRICT sortedRowIndices,
	PxgBlockConstraint1DData* PX_RESTRICT constraintData,
	PxgBlockConstraint1DVelocities* PX_RESTRICT rowVelocities,
	PxgBlockConstraint1DParameters* PX_RESTRICT rowParameters,
	PxgTGSBlockSolverConstraint1DCon* PX_RESTRICT constraintsCon,
	PxgArticulationBlockResponse* PX_RESTRICT articulationResponses,
	float stepDt, float simDt, 
	float recipStepDt, float recipSimDt,
	float lengthScale,
	const PxReal biasCoefficient,
	PxgSolverExtBody2& b0, PxgSolverExtBody2& b1,
	const physx::PxgSolverBodyPrepData* bodyData0,
	const physx::PxgSolverBodyPrepData* bodyData1,
	const PxU32 threadIndex)
{
	using namespace physx;

	const PxReal erp = 0.5f * biasCoefficient;

	const PxU32 rowCount = constraintData->mNumRows[threadIndex];
	const float4 invMassScale = constraintData->mInvMassScale[threadIndex].lin0X_ang0Y_lin1Z_ang1W;
	const PxReal cfm = PxMax(b0.cfm, b1.cfm);

	PxU32 writtenRows = 0;
	PxgArticulationBlockResponse* PX_RESTRICT responseOut = articulationResponses;

	for (PxU32 row = 0; row < rowCount; ++row)
	{
		// The output slot only advances for rows that survive the degeneracy test below.
		PxgTGSBlockSolverConstraint1DCon& conOut = constraintsCon[writtenRows];

		const PxU32 sourceRow = sortedRowIndices[row];
		PxgBlockConstraint1DParameters& rowParams = rowParameters[sourceRow];
		PxgBlockConstraint1DVelocities& rowVels = rowVelocities[sourceRow];

		const PxU32 rowFlags = rowParams.flags[threadIndex];
		const PxU32 rowSolveHint = rowParams.solveHint[threadIndex];

		const float4 lin0_geomError = rowVels.linear0XYZ_geometricErrorW[threadIndex];
		const float4 lin1_minImpulse = rowVels.linear1XYZ_minImpulseW[threadIndex];
		const float4 ang0_velTarget = rowVels.angular0XYZ_velocityTargetW[threadIndex];
		const float4 ang1_maxImpulse = rowVels.angular1XYZ_maxImpulseW[threadIndex];

		const PxVec3 clin0(lin0_geomError.x, lin0_geomError.y, lin0_geomError.z);
		const PxVec3 clin1(lin1_minImpulse.x, lin1_minImpulse.y, lin1_minImpulse.z);
		const PxVec3 cang0(ang0_velTarget.x, ang0_velTarget.y, ang0_velTarget.z);
		const PxVec3 cang1(ang1_maxImpulse.x, ang1_maxImpulse.y, ang1_maxImpulse.z);

		const Cm::UnAlignedSpatialVector response0 = createImpulseResponseVector(b0, Cm::UnAlignedSpatialVector(cang0, clin0));
		const Cm::UnAlignedSpatialVector response1 = createImpulseResponseVector(b1, Cm::UnAlignedSpatialVector(-cang1, -clin1));

		const Cm::UnAlignedSpatialVector deltaV0 = getImpulseResponse(b0, Cm::UnAlignedSpatialVector(clin0, cang0).scale(invMassScale.x, invMassScale.y));
		const Cm::UnAlignedSpatialVector deltaV1 = getImpulseResponse(b1, Cm::UnAlignedSpatialVector(-clin1, -cang1).scale(invMassScale.z, invMassScale.w));

		const float resp0 = deltaV0.dot(response0);
		const float resp1 = deltaV1.dot(response1);
		const float unitResponse = resp0 + resp1;

		//Degenerate constraint, can't be solved so skip to avoid computation later in the solver!
		if (unitResponse <= DY_ARTICULATION_MIN_RESPONSE)
			continue;

		conOut.resp0[threadIndex] = resp0;
		conOut.resp1[threadIndex] = resp1;

		// Store the per-body velocity change (bottom -> *Lin_*, top -> *Ang_*).
		responseOut->deltaRALin_x[threadIndex] = deltaV0.bottom.x;
		responseOut->deltaRALin_y[threadIndex] = deltaV0.bottom.y;
		responseOut->deltaRALin_z[threadIndex] = deltaV0.bottom.z;
		responseOut->deltaRAAng_x[threadIndex] = deltaV0.top.x;
		responseOut->deltaRAAng_y[threadIndex] = deltaV0.top.y;
		responseOut->deltaRAAng_z[threadIndex] = deltaV0.top.z;
		responseOut->deltaRBLin_x[threadIndex] = deltaV1.bottom.x;
		responseOut->deltaRBLin_y[threadIndex] = deltaV1.bottom.y;
		responseOut->deltaRBLin_z[threadIndex] = deltaV1.bottom.z;
		responseOut->deltaRBAng_x[threadIndex] = deltaV1.top.x;
		responseOut->deltaRBAng_y[threadIndex] = deltaV1.top.y;
		responseOut->deltaRBAng_z[threadIndex] = deltaV1.top.z;
		++responseOut;

		// Both outputs are produced by computeJointSpeedTGS.
		PxReal jointSpeedForRestitutionBounce;
		PxReal initJointSpeed;
		{
			const PxReal vel0 = projectVelocity(b0, Cm::UnAlignedSpatialVector(cang0, clin0));
			const PxReal vel1 = projectVelocity(b1, Cm::UnAlignedSpatialVector(cang1, clin1));
			Dy::computeJointSpeedTGS(vel0, b0.isKinematic, vel1, b1.isKinematic, jointSpeedForRestitutionBounce, initJointSpeed);
		}

		//https://omniverse-jirasw.nvidia.com/browse/PX-4383
		const PxReal minRowResponse = DY_ARTICULATION_MIN_RESPONSE;

		intializeTGSBlock1D(
			rowVels, rowParams,
			jointSpeedForRestitutionBounce, initJointSpeed,
			resp0, resp1, erp,
			stepDt, simDt, recipStepDt, recipSimDt,
			lengthScale, minRowResponse,
			response0.bottom, -response1.bottom, response0.top, -response1.top,
			threadIndex,
			conOut, cfm);

		// Angular rows get an error scale of 1, all other rows 0.
		if (rowFlags & Px1DConstraintFlag::eANGULAR_CONSTRAINT)
			conOut.angularErrorScale[threadIndex] = 1.f;
		else
			conOut.angularErrorScale[threadIndex] = 0.f;

		const bool hasDriveLimit = (rowFlags & Px1DConstraintFlag::eHAS_DRIVE_LIMIT) != 0;
		const bool driveLimitsAreForces = (constraintData->mFlags[threadIndex] & PxConstraintFlag::eDRIVE_LIMITS_ARE_FORCES) != 0;
		Dy::computeMinMaxImpulseOrForceAsImpulse(
			lin1_minImpulse.w, ang1_maxImpulse.w,
			hasDriveLimit, driveLimitsAreForces, simDt,
			conOut.minImpulse[threadIndex], conOut.maxImpulse[threadIndex]);

		PxU32 internalFlags = 0;
		Dy::raiseInternalFlagsTGS(rowFlags, rowSolveHint, internalFlags);
		conOut.flags[threadIndex] = internalFlags;

		++writtenRows;
	}

	return writtenRows;
}

//PGS
// Fills the PGS joint header for one lane of a constraint batch and builds its solver rows.
//
// Header fields written: writeBackOffset, body0WorldOffset_linBreakImpulse, angBreakImpulse,
// invMass0D0 / invMass1D1, invInertiaScale0 / invInertiaScale1, breakable, cfm and rowCounts.
// Break forces are converted to impulses by multiplying with params.dt.
//
// NbThreads sizes the per-thread shared scratch used for the sorted row order; the flat
// thread index (threadIdx.x + threadIdx.y * blockDim.x) must be < NbThreads.
// bodyData0/1, txIData0/1 and batch are only forwarded or unused here.
template<int NbThreads>
static __device__ void setupArtiSolverConstraintBlockGPU(
	PxgBlockConstraint1DData* PX_RESTRICT constraintData,
	PxgBlockConstraint1DVelocities* PX_RESTRICT rowVelocities,
	PxgBlockConstraint1DParameters* PX_RESTRICT rowParameters,
	const PxgSolverBodyPrepData* bodyData0,
	const PxgSolverBodyPrepData* bodyData1,
	PxgSolverTxIData* txIData0,
	PxgSolverTxIData* txIData1,
	PxgBlockConstraintBatch& batch,
	const PxU32 threadIndexInWarp,
	PxgJointBlockParams& params,
	PxgArticulationBlockResponse* PX_RESTRICT artiResponses,
	const PxgSolverConstraintManagerConstants& managerConstants,
	PxgSolverExtBody2& b0,
	PxgSolverExtBody2& b1)
{
	using namespace physx;

	params.jointHeader->writeBackOffset[threadIndexInWarp] = managerConstants.mConstraintWriteBackIndex;

	const float4 lin0_ang0_lin1_ang1 = constraintData->mInvMassScale[threadIndexInWarp].lin0X_ang0Y_lin1Z_ang1W;

	const float4 raWorld_linBreakForce = constraintData->mRAWorld_linBreakForce[threadIndexInWarp];
	const float4 rbWorld_angBreakForce = constraintData->mRBWorld_AngBreakForce[threadIndexInWarp];
	const float linBreakImpulse = raWorld_linBreakForce.w * params.dt;
	const float angBreakForce = rbWorld_angBreakForce.w;
	const float angBreakImpulse = angBreakForce * params.dt;
	params.jointHeader->body0WorldOffset_linBreakImpulse[threadIndexInWarp] = make_float4(raWorld_linBreakForce.x, raWorld_linBreakForce.y, raWorld_linBreakForce.z, linBreakImpulse);
	params.jointHeader->angBreakImpulse[threadIndexInWarp] = angBreakImpulse;

	// Body 0 scales are stored negated, body 1 scales as-is.
	params.jointHeader->invMass0D0[threadIndexInWarp] = -lin0_ang0_lin1_ang1.x;
	params.jointHeader->invMass1D1[threadIndexInWarp] = lin0_ang0_lin1_ang1.z;
	params.jointHeader->invInertiaScale0[threadIndexInWarp] = -lin0_ang0_lin1_ang1.y;
	params.jointHeader->invInertiaScale1[threadIndexInWarp] = lin0_ang0_lin1_ang1.w;

	// A joint is breakable as soon as either break force differs from PX_MAX_F32.
	params.jointHeader->breakable[threadIndexInWarp] = PxU8((raWorld_linBreakForce.w != PX_MAX_F32) || (angBreakForce != PX_MAX_F32));

	const PxReal cfm = PxMax(b0.cfm, b1.cfm);
	params.jointHeader->cfm[threadIndexInWarp] = cfm;

	__shared__ PxU32 sortedRowIndices[NbThreads][Dy::MAX_CONSTRAINT_ROWS];

	// One scratch row per thread of the block; an index >= NbThreads would run off the
	// end of the shared array.
	const PxU32 scratchSlot = threadIdx.x + threadIdx.y * blockDim.x;
	assert(scratchSlot < PxU32(NbThreads));
	PxU32* sortedRows = sortedRowIndices[scratchSlot];

	preprocessRowsBlock(sortedRows, constraintData, rowVelocities, rowParameters, threadIndexInWarp);

	// rowCounts is written exactly once, with the number of rows that were actually
	// emitted (degenerate rows are dropped; distance constraints might have zero rows).
	params.jointHeader->rowCounts[threadIndexInWarp] = setUpArti1DConstraintBlock(sortedRows, constraintData,
		rowVelocities, rowParameters, params.jointCon, params.jointMod, artiResponses, params.dt, params.invDt, b0, b1, bodyData0, bodyData1, threadIndexInWarp);
}

//TGS
// Fills the TGS joint header for one lane of a constraint batch and builds its solver rows.
//
// Header fields written: writeBackOffset, rAWorld_invMass0D0, rBWorld_invMass1D1,
// angBreakImpulse, linBreakImpulse, invInertiaScale0 / invInertiaScale1,
// rowCounts_breakable_orthoAxisCount and cfm.
// Break forces are converted to impulses by multiplying with params.dt.
//
// NbThreads sizes the per-thread shared scratch used for the sorted row order; the flat
// thread index (threadIdx.x + threadIdx.y * blockDim.x) must be < NbThreads.
// bodyData0/1, txIData0/1 and batch are only forwarded or unused here.
template<int NbThreads>
static __device__ void setupArtiSolverConstraintBlockGPU(
	PxgBlockConstraint1DData* PX_RESTRICT constraintData,
	PxgBlockConstraint1DVelocities* PX_RESTRICT rowVelocities,
	PxgBlockConstraint1DParameters* PX_RESTRICT rowParameters,
	const PxgSolverBodyPrepData* bodyData0,
	const PxgSolverBodyPrepData* bodyData1,
	PxgSolverTxIData* txIData0,
	PxgSolverTxIData* txIData1,
	PxgBlockConstraintBatch& batch,
	const PxU32 threadIndexInWarp,
	PxgTGSJointBlockParams& params,
	PxgArticulationBlockResponse* PX_RESTRICT artiResponses,
	const PxgSolverConstraintManagerConstants& managerConstants,
	PxgSolverExtBody2& b0,
	PxgSolverExtBody2& b1)
{
	using namespace physx;

	params.jointHeader->writeBackOffset[threadIndexInWarp] = managerConstants.mConstraintWriteBackIndex;

	const float4 lin0_ang0_lin1_ang1 = constraintData->mInvMassScale[threadIndexInWarp].lin0X_ang0Y_lin1Z_ang1W;

	const float4 raWorld_linBreakForce = constraintData->mRAWorld_linBreakForce[threadIndexInWarp];
	const float4 rbWorld_angBreakForce = constraintData->mRBWorld_AngBreakForce[threadIndexInWarp];
	const float linBreakImpulse = raWorld_linBreakForce.w * params.dt;
	const float angBreakForce = rbWorld_angBreakForce.w;
	const float angBreakImpulse = angBreakForce * params.dt;

	// The world-space offsets share a float4 with the linear mass scales
	// (body 0 scale stored negated, body 1 scale as-is).
	params.jointHeader->rAWorld_invMass0D0[threadIndexInWarp] = make_float4(raWorld_linBreakForce.x, raWorld_linBreakForce.y, raWorld_linBreakForce.z, -lin0_ang0_lin1_ang1.x);
	params.jointHeader->rBWorld_invMass1D1[threadIndexInWarp] = make_float4(rbWorld_angBreakForce.x, rbWorld_angBreakForce.y, rbWorld_angBreakForce.z, lin0_ang0_lin1_ang1.z);

	params.jointHeader->angBreakImpulse[threadIndexInWarp] = angBreakImpulse;

	params.jointHeader->invInertiaScale0[threadIndexInWarp] = -lin0_ang0_lin1_ang1.y;
	params.jointHeader->invInertiaScale1[threadIndexInWarp] = lin0_ang0_lin1_ang1.w;

	params.jointHeader->linBreakImpulse[threadIndexInWarp] = linBreakImpulse;

	// A joint is breakable as soon as either break force differs from PX_MAX_F32.
	const PxU8 breakable = PxU8((raWorld_linBreakForce.w != PX_MAX_F32) || (angBreakForce != PX_MAX_F32));

	__shared__ PxU32 sortedRowIndices[NbThreads][Dy::MAX_CONSTRAINT_ROWS];

	// One scratch row per thread of the block; an index >= NbThreads would run off the
	// end of the shared array.
	const PxU32 scratchSlot = threadIdx.x + threadIdx.y * blockDim.x;
	assert(scratchSlot < PxU32(NbThreads));
	PxU32* sortedRows = sortedRowIndices[scratchSlot];

	preprocessRowsBlock(sortedRows, constraintData, rowVelocities, rowParameters, threadIndexInWarp);

	// The stored row count is the number of rows actually emitted (degenerate rows are
	// dropped; distance constraints might have zero rows), not mNumRows.
	const PxU8 rowCounts = PxU8(setUpArti1DConstraintBlock(sortedRows, constraintData,
		rowVelocities, rowParameters,
		params.jointCon, artiResponses, params.dt, params.totalDt, params.invDt, params.invTotalDt,
		params.lengthScale, params.biasCoefficient, b0, b1, bodyData0, bodyData1, threadIndexInWarp));

	params.jointHeader->rowCounts_breakable_orthoAxisCount[threadIndexInWarp] = make_uchar4(rowCounts, breakable, 0, 0);

	const PxReal cfm = PxMax(b0.cfm, b1.cfm);
	params.jointHeader->cfm[threadIndexInWarp] = cfm;
}


//PGS
// Populates the PGS joint parameter block for one constraint batch:
// time-step values come from sharedDesc, the output pointers address the batch's header
// (indexed by descIndexBatch) and its first con/mod row (indexed by startConstraintIndex).
// constraintPrepDesc is not read by the PGS variant.
__device__ void fillJointBlockParams(PxgSolverSharedDesc<IterativeSolveData>* sharedDesc, 
	PxgConstraintPrepareDesc* constraintPrepDesc, PxgJointBlockParams& params,
	const PxU32 descIndexBatch, const PxU32 startConstraintIndex)
{
	// Time-step data.
	params.dt = sharedDesc->dt;
	params.invDt = sharedDesc->invDtF32;

	// Output locations inside the iterative solve data.
	IterativeSolveData& iterData = sharedDesc->iterativeData;
	params.jointHeader = iterData.blockJointConstraintHeaders + descIndexBatch;
	params.jointCon = iterData.blockJointConstraintRowsCon + startConstraintIndex;
	params.jointMod = iterData.blockJointConstraintRowsMod + startConstraintIndex;
}

//TGS
// Populates the TGS joint parameter block for one constraint batch.
// params.dt / params.invDt receive the step values (stepDt / stepInvDtF32), while
// params.totalDt / params.invTotalDt receive dt / invDtF32. lengthScale comes from
// sharedDesc and biasCoefficient from constraintPrepDesc. The output pointers address
// the batch's header (descIndexBatch) and its first con row (startConstraintIndex).
__device__ void fillJointBlockParams(PxgSolverSharedDesc<IterativeSolveDataTGS>* sharedDesc,
	PxgConstraintPrepareDesc* constraintPrepDesc, PxgTGSJointBlockParams& params, const PxU32 descIndexBatch, const PxU32 startConstraintIndex)
{
	// Step and total time-step data.
	params.dt = sharedDesc->stepDt;
	params.invDt = sharedDesc->stepInvDtF32;
	params.totalDt = sharedDesc->dt;
	params.invTotalDt = sharedDesc->invDtF32;

	// Scene-level tuning values.
	params.lengthScale = sharedDesc->lengthScale;
	params.biasCoefficient = constraintPrepDesc->biasCoefficient;

	// Output locations inside the iterative solve data.
	IterativeSolveDataTGS& iterData = sharedDesc->iterativeData;
	params.jointHeader = iterData.blockJointConstraintHeaders + descIndexBatch;
	params.jointCon = iterData.blockJointConstraintRowsCon + startConstraintIndex;
}

template<typename JointParams, typename IterativeData>
__device__ void artiJointConstraintBlockPrepare(
	PxgConstraintPrepareDesc* constraintPrepDesc,
	PxgSolverSharedDesc<IterativeData>* sharedDesc)
{

	//This identifies which warp a specific thread is in, we treat all warps in all blocks as a flatten warp array
	//and we are going to index the work based on that
	const PxU32 warpIndex = blockIdx.x * blockDim.y + threadIdx.y;

	//This identifies which thread within a warp a specific thread is
	const PxU32 threadIndexInWarp = threadIdx.x;

	PxgSolverBodyData* solverBodyData = constraintPrepDesc->solverBodyDataPool;
	PxgSolverTxIData* solverTxIData = constraintPrepDesc->solverBodyTxIDataPool;

	PxAlignedTransform* bodyFrames = constraintPrepDesc->body2WorldPool;

	PxU32* batchIndices = constraintPrepDesc->artiJointConstraintBatchIndices;

	const PxU32 totalArti1dConstraintBatches = constraintPrepDesc->numArti1dConstraintBatches + constraintPrepDesc->numArtiStatic1dConstraintBatches
		+ constraintPrepDesc->numArtiSelf1dConstraintBatches;

	const PxU32 offset = constraintPrepDesc->numBatches;

	//for(PxU32 i=warpIndex; i< constraintPrepDesc->num1dConstraintBatches; i+=totalNumWarps)
	PxU32 i = warpIndex;
	if (i < totalArti1dConstraintBatches)
	{
		const PxU32 batchIndex = batchIndices[i] + offset;
		
		PxgBlockConstraintBatch& batch = sharedDesc->iterativeData.blockConstraintBatch[batchIndex];
		//const PxU32 bodyAIndex = batch.bodyAIndex[threadIndexInWarp];
		//const PxU32 bodyBIndex = batch.bodyBIndex[threadIndexInWarp];

		const PxU32 descIndexBatch = batch.mConstraintBatchIndex;

		const PxU32 descStride = batch.mDescStride;

#if LOAD_BODY_DATA
		loadBodyData(solverBodyDatas, descStride, bodyAIndex, threadIndexInWarp, warpIndexInBlock, bodyData0.initialLinVelXYZ_invMassW, bodyData0.initialAngVelXYZ_penBiasClamp,
			bodyData0.sqrtInvInertia, bodyData0.body2World);
		loadBodyData(solverBodyDatas, descStride, bodyBIndex, threadIndexInWarp, warpIndexInBlock, bodyData1.initialLinVelXYZ_invMassW, bodyData1.initialAngVelXYZ_penBiasClamp,
			bodyData1.sqrtInvInertia, bodyData1.body2World);
#endif

		//mDescStride might be less than 32, so we need to guard against it
		if (threadIndexInWarp < descStride)
		{
			//desc.descIndex for joint in fact is the batch index
			PxgBlockConstraint1DData& constraintData = constraintPrepDesc->blockJointPrepPool[descIndexBatch];
			PxgBlockConstraint1DVelocities* rowVelocities = &constraintPrepDesc->blockJointPrepPool0[descIndexBatch * Dy::MAX_CONSTRAINT_ROWS];
			PxgBlockConstraint1DParameters* rowParameters = &constraintPrepDesc->blockJointPrepPool1[descIndexBatch * Dy::MAX_CONSTRAINT_ROWS];

			const PxNodeIndex nodeIndexA = batch.bodyANodeIndex[threadIndexInWarp];
			const PxNodeIndex nodeIndexB = batch.bodyBNodeIndex[threadIndexInWarp];

			const PxU32 bodyAIndex = nodeIndexA.isArticulation() ? batch.remappedBodyAIndex[threadIndexInWarp] : batch.bodyAIndex[threadIndexInWarp];
			const PxU32 bodyBIndex = nodeIndexB.isArticulation() ? batch.remappedBodyBIndex[threadIndexInWarp] : batch.bodyBIndex[threadIndexInWarp];

			PxgSolverBodyData* bodyData0 = &solverBodyData[bodyAIndex];
			PxgSolverBodyData* bodyData1 = &solverBodyData[bodyBIndex];
			PxgSolverTxIData* txIData0 = &solverTxIData[bodyAIndex];
			PxgSolverTxIData* txIData1 = &solverTxIData[bodyBIndex];
			
			PxgSolverExtBody2 b0, b1;
			createPxgSolverExtBody(nodeIndexA, bodyAIndex, sharedDesc->articulations, solverBodyData, solverTxIData, b0, bodyFrames);
			createPxgSolverExtBody(nodeIndexB, bodyBIndex, sharedDesc->articulations, solverBodyData, solverTxIData, b1, bodyFrames);

			/*PxU32 isArticulationA = b0.islandNodeIndex.isArticulation();
			PxU32 isArticulationB = b1.islandNodeIndex.isArticulation();*/

		/*	printf("isArticulationA %i isArticulationB %i, linkID0 %i, linkID1 %i\n", isArticulationA, isArticulationB,
				nodeIndexA.articulationLinkId(), nodeIndexB.articulationLinkId());*/

			JointParams params;
			fillJointBlockParams(sharedDesc, constraintPrepDesc, params, descIndexBatch, batch.startConstraintIndex);

			PxU32 uniqueIndex = constraintPrepDesc->artiConstraintUniqueIndices[batch.mStartPartitionIndex + threadIndexInWarp];
			
			PxgArticulationBlockResponse* responses = sharedDesc->iterativeData.artiResponse;
			const PxU32 responseIndex = batch.mArticulationResponseIndex;

			////printf("constraintPre2 startConstraintIndex %i\n", batch.startConstraintIndex);
			//setupArtiSolverConstraintBlockGPU<PxgKernelBlockDim::CONSTRAINT_PREPARE_BLOCK_PARALLEL>(&constraintData, rowVelocities, rowParameters, bodyData0, bodyData1, txIData0, txIData1, sharedDesc->dt, sharedDesc->invDtF32, batch, threadIndexInWarp,
			//	&jointConstraintHeaders[descIndexBatch], &jointConstraintRowsCon[batch.startConstraintIndex], &jointConstraintRowsMod[batch.startConstraintIndex], &responses[responseIndex],
			//	constraintPrepDesc->solverConstantData[uniqueIndex], b0, b1);

			//Attention: PxgKernelBlockDim::ARTI_CONSTRAINT_PREPARE is not launched as a linear block but as a 2D block. threadIdx.x and threadIdx.y are used for indexing inside the block.
			setupArtiSolverConstraintBlockGPU<PxgKernelBlockDim::ARTI_CONSTRAINT_PREPARE>(&constraintData, rowVelocities, rowParameters, bodyData0, bodyData1, txIData0, txIData1,
				batch, threadIndexInWarp, params, &responses[responseIndex],
				constraintPrepDesc->solverConstantData[uniqueIndex], b0, b1);
		}
	}
}


// Kernel entry point for articulation joint constraint preparation using the
// IterativeSolveData flavour of the shared solver descriptor.
//
// This is a thin forwarding wrapper: all work is done by the templated
// artiJointConstraintBlockPrepare(), instantiated here with PxgJointBlockParams
// as the joint parameter type.
//
// Launch layout: the forwarded routine derives its batch index from
// blockIdx.x * blockDim.y + threadIdx.y and uses threadIdx.x as the index
// within a batch, so the kernel must be launched with a 2D thread block.
//
// prepDesc   - constraint prepare descriptor (batch indices, body pools, joint prep pools).
// sharedDesc - shared solver descriptor providing the block constraint batches,
//              articulations and articulation responses.
extern "C" __global__ void artiJointConstraintBlockPrepareParallelLaunch(PxgConstraintPrepareDesc* prepDesc, 
	PxgSolverSharedDesc<IterativeSolveData>* sharedDesc)
{
	// Name the template arguments once so the instantiation below reads clearly.
	typedef PxgJointBlockParams JointParamsT;
	typedef IterativeSolveData SolveDataT;

	artiJointConstraintBlockPrepare<JointParamsT, SolveDataT>(
		prepDesc,
		sharedDesc);
}

// Kernel entry point for articulation joint constraint preparation using the
// TGS flavour (IterativeSolveDataTGS) of the shared solver descriptor.
//
// This is a thin forwarding wrapper: all work is done by the templated
// artiJointConstraintBlockPrepare(), instantiated here with
// PxgTGSJointBlockParams as the joint parameter type.
//
// Launch layout: the forwarded routine derives its batch index from
// blockIdx.x * blockDim.y + threadIdx.y and uses threadIdx.x as the index
// within a batch, so the kernel must be launched with a 2D thread block.
//
// prepDesc   - constraint prepare descriptor (batch indices, body pools, joint prep pools).
// sharedDesc - shared TGS solver descriptor providing the block constraint batches,
//              articulations and articulation responses.
extern "C" __global__ void artiTGSJointConstraintBlockPrepareParallelLaunch(PxgConstraintPrepareDesc* prepDesc, 
	PxgSolverSharedDesc<IterativeSolveDataTGS>* sharedDesc)
{
	// Name the template arguments once so the instantiation below reads clearly.
	typedef PxgTGSJointBlockParams JointParamsT;
	typedef IterativeSolveDataTGS SolveDataT;

	artiJointConstraintBlockPrepare<JointParamsT, SolveDataT>(
		prepDesc,
		sharedDesc);
}