feat(physics): wire physx sdk into build

2026-04-15 12:22:15 +08:00
parent 5bf258df6d
commit 31f40e2cbb
2044 changed files with 752623 additions and 1 deletions
--- a/engine/third_party/physx/source/gpusimulationcontroller/include/PxgArticulation.h
+++ b/engine/third_party/physx/source/gpusimulationcontroller/include/PxgArticulation.h
@@ -0,0 +1,976 @@
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of NVIDIA CORPORATION nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
+// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
+// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  
+
+#ifndef PXG_ARTICULATION_H
+#define PXG_ARTICULATION_H
+
+#include "foundation/PxSimpleTypes.h"
+#include "PxArticulationReducedCoordinate.h"
+#include "PxgArticulationLink.h"
+#include "PxgArticulationTendon.h"
+#include "PxgCudaBuffer.h"
+#include "PxSpatialMatrix.h"
+#include "DyFeatherstoneArticulation.h"
+#include "foundation/PxUserAllocated.h"
+#include "vector_types.h"
+
+namespace physx
+{
+	class PxGpuSpatialTendonData;
+	class PxGpuFixedTendonData;
+
+	namespace Dy
+	{
+		struct ArticulationJointCore;
+		class ArticulationJointCoreData;
+		struct ArticulationInternalConstraint;
+		struct SpatialSubspaceMatrix;
+	}
+
+	//We load/store to Cm::SpatialMatrix rather than PxSpatialMatrix.
+	//This exploits the symmetry of the spatial matrix to avoid storing/loading the bottom-right,
+	//which is the transpose of the top-left
+	struct PxgSpatialMatrixBlock
+	{
+		float4	columns[7][32];
+	};
+
+	//We load/store to a PxSpatialMatrix. This has a full 6x6 matrix as there is no
+	//symmetry as described in the above struct
+	struct PxgSpatialMatrix6x6Block
+	{
+		float4	columns[9][32];
+	};
+
+	struct PxgMat33Block
+	{
+		float4	mCol0[32];
+		float4	mCol1[32];
+		float	mCol2[32];
+	};
+
+	struct PxgSpatialVectorBlock
+	{
+		float4 mTopxyz_bx[32];
+		float2 mbyz[32];
+	};
+
+	struct PxgSpatialTransformBlock
+	{
+		float4 q[32];
+		float4 p[32];
+	};
+
+
+	PX_FORCE_INLINE PX_CUDA_CALLABLE void loadSpatialMatrix(const PxgSpatialMatrixBlock& block, const PxU32 threadIndexInWarp, Dy::SpatialMatrix& mat)
+	{
+		float4 val = block.columns[0][threadIndexInWarp];
+		mat.topLeft[0][0] = val.x; mat.topLeft[0][1] = val.y; mat.topLeft[0][2] = val.z; mat.topLeft[1][0] = val.w;
+		val = block.columns[1][threadIndexInWarp];
+		mat.topLeft[1][1] = val.x; mat.topLeft[1][2] = val.y; mat.topLeft[2][0] = val.z; mat.topLeft[2][1] = val.w;
+		val = block.columns[2][threadIndexInWarp];
+		mat.topLeft[2][2] = val.x; mat.topRight[0][0] = val.y; mat.topRight[0][1] = val.z; mat.topRight[0][2] = val.w;
+		val = block.columns[3][threadIndexInWarp];
+		mat.topRight[1][0] = val.x; mat.topRight[1][1] = val.y; mat.topRight[1][2] = val.z; mat.topRight[2][0] = val.w;
+		val = block.columns[4][threadIndexInWarp];
+		mat.topRight[2][1] = val.x; mat.topRight[2][2] = val.y; mat.bottomLeft[0][0] = val.z; mat.bottomLeft[0][1] = val.w;
+		val = block.columns[5][threadIndexInWarp];
+		mat.bottomLeft[0][2] = val.x; mat.bottomLeft[1][0] = val.y; mat.bottomLeft[1][1] = val.z; mat.bottomLeft[1][2] = val.w;
+		val = block.columns[6][threadIndexInWarp];
+		mat.bottomLeft[2][0] = val.x; mat.bottomLeft[2][1] = val.y; mat.bottomLeft[2][2] = val.z;
+	}
+
+	//This loads a 7-vec matrix in which the bottomRight is equal to the transpose of the top left
+	PX_FORCE_INLINE PX_CUDA_CALLABLE void loadSpatialMatrix(const PxgSpatialMatrixBlock& block, const PxU32 threadIndexInWarp, PxSpatialMatrix& mat)
+	{
+		float4 val = block.columns[0][threadIndexInWarp];
+		mat.column[0][0] = val.x; mat.column[0][1] = val.y; mat.column[0][2] = val.z; mat.column[1][0] = val.w;
+		mat.column[3][3] = val.x; mat.column[4][3] = val.y; mat.column[5][3] = val.z; mat.column[3][4] = val.w;
+		val = block.columns[1][threadIndexInWarp];
+		mat.column[1][1] = val.x; mat.column[1][2] = val.y; mat.column[2][0] = val.z; mat.column[2][1] = val.w;
+		mat.column[4][4] = val.x; mat.column[5][4] = val.y; mat.column[3][5] = val.z; mat.column[4][5] = val.w;
+		val = block.columns[2][threadIndexInWarp];
+		mat.column[2][2] = val.x; mat.column[0][3] = val.y; mat.column[0][4] = val.z; mat.column[0][5] = val.w;
+		mat.column[5][5] = val.x;
+		val = block.columns[3][threadIndexInWarp];
+		mat.column[1][3] = val.x; mat.column[1][4] = val.y; mat.column[1][5] = val.z; mat.column[2][3] = val.w;
+		val = block.columns[4][threadIndexInWarp];
+		mat.column[2][4] = val.x; mat.column[2][5] = val.y; mat.column[3][0] = val.z; mat.column[3][1] = val.w;
+		val = block.columns[5][threadIndexInWarp];
+		mat.column[3][2] = val.x; mat.column[4][0] = val.y; mat.column[4][1] = val.z; mat.column[4][2] = val.w;
+		val = block.columns[6][threadIndexInWarp];
+		mat.column[5][0] = val.x; mat.column[5][1] = val.y; mat.column[5][2] = val.z;
+	}
+
+	PX_FORCE_INLINE PX_CUDA_CALLABLE void storeSpatialMatrix(PxgSpatialMatrixBlock& block, const PxU32 threadIndexInWarp, const Dy::SpatialMatrix& mat)
+	{
+		block.columns[0][threadIndexInWarp] = make_float4(mat.topLeft[0][0], mat.topLeft[0][1], mat.topLeft[0][2], mat.topLeft[1][0]);
+		block.columns[1][threadIndexInWarp] = make_float4(mat.topLeft[1][1], mat.topLeft[1][2], mat.topLeft[2][0], mat.topLeft[2][1]);
+		block.columns[2][threadIndexInWarp] = make_float4(mat.topLeft[2][2], mat.topRight[0][0], mat.topRight[0][1], mat.topRight[0][2]);
+		block.columns[3][threadIndexInWarp] = make_float4(mat.topRight[1][0], mat.topRight[1][1], mat.topRight[1][2], mat.topRight[2][0]);
+		block.columns[4][threadIndexInWarp] = make_float4(mat.topRight[2][1], mat.topRight[2][2], mat.bottomLeft[0][0], mat.bottomLeft[0][1]);
+		block.columns[5][threadIndexInWarp] = make_float4(mat.bottomLeft[0][2], mat.bottomLeft[1][0], mat.bottomLeft[1][1], mat.bottomLeft[1][2]);
+		block.columns[6][threadIndexInWarp] = make_float4(mat.bottomLeft[2][0], mat.bottomLeft[2][1], mat.bottomLeft[2][2], 0.f);
+	}
+
+	PX_FORCE_INLINE PX_CUDA_CALLABLE void loadSpatialMatrix(const PxgSpatialMatrix6x6Block& block, const PxU32 threadIndexInWarp, PxSpatialMatrix& mat)
+	{
+		float4 val = block.columns[0][threadIndexInWarp];
+		mat.column[0][0] = val.x; mat.column[0][1] = val.y; mat.column[0][2] = val.z; mat.column[0][3] = val.w;
+		val = block.columns[1][threadIndexInWarp];
+		mat.column[0][4] = val.x; mat.column[0][5] = val.y; mat.column[1][0] = val.z; mat.column[1][1] = val.w;
+		val = block.columns[2][threadIndexInWarp];
+		mat.column[1][2] = val.x; mat.column[1][3] = val.y; mat.column[1][4] = val.z; mat.column[1][5] = val.w;
+		val = block.columns[3][threadIndexInWarp];
+		mat.column[2][0] = val.x; mat.column[2][1] = val.y; mat.column[2][2] = val.z; mat.column[2][3] = val.w;
+		val = block.columns[4][threadIndexInWarp];
+		mat.column[2][4] = val.x; mat.column[2][5] = val.y; mat.column[3][0] = val.z; mat.column[3][1] = val.w;
+		val = block.columns[5][threadIndexInWarp];
+		mat.column[3][2] = val.x; mat.column[3][3] = val.y; mat.column[3][4] = val.z; mat.column[3][5] = val.w;
+		val = block.columns[6][threadIndexInWarp];
+		mat.column[4][0] = val.x; mat.column[4][1] = val.y; mat.column[4][2] = val.z; mat.column[4][3] = val.w;
+		val = block.columns[7][threadIndexInWarp];
+		mat.column[4][4] = val.x; mat.column[4][5] = val.y; mat.column[5][0] = val.z; mat.column[5][1] = val.w;
+		val = block.columns[8][threadIndexInWarp];
+		mat.column[5][2] = val.x; mat.column[5][3] = val.y; mat.column[5][4] = val.z; mat.column[5][5] = val.w;
+	}
+
+	PX_FORCE_INLINE PX_CUDA_CALLABLE void storeSpatialMatrix(PxgSpatialMatrix6x6Block& block, const PxU32 threadIndexInWarp, const PxSpatialMatrix& mat)
+	{
+		block.columns[0][threadIndexInWarp] = make_float4(mat.column[0][0], mat.column[0][1], mat.column[0][2], mat.column[0][3]);
+		block.columns[1][threadIndexInWarp] = make_float4(mat.column[0][4], mat.column[0][5], mat.column[1][0], mat.column[1][1]);
+		block.columns[2][threadIndexInWarp] = make_float4(mat.column[1][2], mat.column[1][3], mat.column[1][4], mat.column[1][5]);
+		block.columns[3][threadIndexInWarp] = make_float4(mat.column[2][0], mat.column[2][1], mat.column[2][2], mat.column[2][3]);
+		block.columns[4][threadIndexInWarp] = make_float4(mat.column[2][4], mat.column[2][5], mat.column[3][0], mat.column[3][1]);
+		block.columns[5][threadIndexInWarp] = make_float4(mat.column[3][2], mat.column[3][3], mat.column[3][4], mat.column[3][5]);
+		block.columns[6][threadIndexInWarp] = make_float4(mat.column[4][0], mat.column[4][1], mat.column[4][2], mat.column[4][3]);
+		block.columns[7][threadIndexInWarp] = make_float4(mat.column[4][4], mat.column[4][5], mat.column[5][0], mat.column[5][1]);
+		block.columns[8][threadIndexInWarp] = make_float4(mat.column[5][2], mat.column[5][3], mat.column[5][4], mat.column[5][5]);
+	}
+
+	PX_FORCE_INLINE PX_CUDA_CALLABLE void zeroSpatialMatrix(PxgSpatialMatrix6x6Block& block, const PxU32 threadIndexInWarp)
+	{
+		block.columns[0][threadIndexInWarp] = make_float4(0.f, 0.f, 0.f, 0.f);
+		block.columns[1][threadIndexInWarp] = make_float4(0.f, 0.f, 0.f, 0.f);
+		block.columns[2][threadIndexInWarp] = make_float4(0.f, 0.f, 0.f, 0.f);
+		block.columns[3][threadIndexInWarp] = make_float4(0.f, 0.f, 0.f, 0.f);
+		block.columns[4][threadIndexInWarp] = make_float4(0.f, 0.f, 0.f, 0.f);
+		block.columns[5][threadIndexInWarp] = make_float4(0.f, 0.f, 0.f, 0.f);
+		block.columns[6][threadIndexInWarp] = make_float4(0.f, 0.f, 0.f, 0.f);
+		block.columns[7][threadIndexInWarp] = make_float4(0.f, 0.f, 0.f, 0.f);
+		block.columns[8][threadIndexInWarp] = make_float4(0.f, 0.f, 0.f, 0.f);
+	}
+
+	PX_FORCE_INLINE PX_CUDA_CALLABLE Cm::SpatialVectorF loadSpatialVectorF(const PxgSpatialVectorBlock& block, const PxU32 threadIndexInWarp)
+	{
+		float4 top = block.mTopxyz_bx[threadIndexInWarp];
+		float2 bottom = block.mbyz[threadIndexInWarp];
+
+		return Cm::SpatialVectorF(PxVec3(top.x, top.y, top.z), PxVec3(top.w, bottom.x, bottom.y));
+	}
+
+	PX_FORCE_INLINE PX_CUDA_CALLABLE Cm::UnAlignedSpatialVector loadSpatialVector(const PxgSpatialVectorBlock& block, const PxU32 threadIndexInWarp)
+	{
+		float4 top = block.mTopxyz_bx[threadIndexInWarp];
+		float2 bottom = block.mbyz[threadIndexInWarp];
+
+		return Cm::UnAlignedSpatialVector(PxVec3(top.x, top.y, top.z), PxVec3(top.w, bottom.x, bottom.y));
+	}
+
+	PX_FORCE_INLINE PX_CUDA_CALLABLE void storeSpatialVector(PxgSpatialVectorBlock& block, const Cm::UnAlignedSpatialVector& v, const PxU32 threadIndexInWarp)
+	{
+		block.mTopxyz_bx[threadIndexInWarp] = make_float4(v.top.x, v.top.y, v.top.z, v.bottom.x);
+		block.mbyz[threadIndexInWarp] = make_float2(v.bottom.y, v.bottom.z);
+
+		/*Pxstcg(&block.mTopxyz_bx[threadIndexInWarp], make_float4(v.top.x, v.top.y, v.top.z, v.bottom.x));
+		Pxstcg(&block.mbyz[threadIndexInWarp], make_float2(v.bottom.y, v.bottom.z));*/
+	}
+
+	PX_FORCE_INLINE PX_CUDA_CALLABLE void addSpatialVector(PxgSpatialVectorBlock& block, const Cm::UnAlignedSpatialVector& v, const PxU32 threadIndexInWarp)
+	{
+		block.mTopxyz_bx[threadIndexInWarp] += make_float4(v.top.x, v.top.y, v.top.z, v.bottom.x);
+		block.mbyz[threadIndexInWarp] += make_float2(v.bottom.y, v.bottom.z);
+
+		/*Pxstcg(&block.mTopxyz_bx[threadIndexInWarp], make_float4(v.top.x, v.top.y, v.top.z, v.bottom.x));
+		Pxstcg(&block.mbyz[threadIndexInWarp], make_float2(v.bottom.y, v.bottom.z));*/
+	}
+
+#if PX_CUDA_COMPILER
+	PX_FORCE_INLINE __device__ void atomicAddSpatialVector(PxgSpatialVectorBlock& block, const Cm::UnAlignedSpatialVector& v, const PxU32 threadIndexInWarp)
+	{
+#if __CUDA_ARCH__ >= 200
+		atomicAdd(&block.mTopxyz_bx[threadIndexInWarp].x, v.top.x);
+		atomicAdd(&block.mTopxyz_bx[threadIndexInWarp].y, v.top.y);
+		atomicAdd(&block.mTopxyz_bx[threadIndexInWarp].z, v.top.z);
+
+		atomicAdd(&block.mTopxyz_bx[threadIndexInWarp].w, v.bottom.x);
+		atomicAdd(&block.mbyz[threadIndexInWarp].x, v.bottom.y);
+		atomicAdd(&block.mbyz[threadIndexInWarp].y, v.bottom.z);
+#else
+		PX_UNUSED(block);
+		PX_UNUSED(v);
+		PX_UNUSED(threadIndexInWarp);
+#endif
+
+		/*Pxstcg(&block.mTopxyz_bx[threadIndexInWarp], make_float4(v.top.x, v.top.y, v.top.z, v.bottom.x));
+		Pxstcg(&block.mbyz[threadIndexInWarp], make_float2(v.bottom.y, v.bottom.z));*/
+	}
+#endif
+
+	PX_FORCE_INLINE PX_CUDA_CALLABLE PxTransform loadSpatialTransform(const PxgSpatialTransformBlock& block, const PxU32 threadIndexInWarp)
+	{
+		const float4 q = block.q[threadIndexInWarp];
+		const float4 p = block.p[threadIndexInWarp];
+
+		return PxTransform(PxVec3(p.x, p.y, p.z), PxQuat(q.x, q.y, q.z, q.w));
+	}
+
+	PX_FORCE_INLINE PX_CUDA_CALLABLE void storeSpatialTransform(PxgSpatialTransformBlock& block, const PxU32 threadIndexInWarp, const PxTransform& transform)
+	{
+		block.q[threadIndexInWarp] = make_float4(transform.q.x, transform.q.y, transform.q.z, transform.q.w);
+		block.p[threadIndexInWarp] = make_float4(transform.p.x, transform.p.y, transform.p.z, 0.f);
+	}
+
+	PX_FORCE_INLINE PX_CUDA_CALLABLE PxQuat loadQuat(const float4* f, const PxU32 threadIndexInWarp)
+	{
+		float4 q = f[threadIndexInWarp];
+		return PxQuat(q.x, q.y, q.z, q.w);
+	}
+
+	PX_FORCE_INLINE PX_CUDA_CALLABLE PxVec3 loadPxVec3(const float4* f, const PxU32 threadIndexInWarp)
+	{
+		float4 v = f[threadIndexInWarp];
+		return PxVec3(v.x, v.y, v.z);
+	}
+
+	PX_FORCE_INLINE PX_CUDA_CALLABLE void loadPxMat33(const PxgMat33Block& block, const PxU32 threadIndexInWarp, PxMat33& mat)
+	{
+		float4 v0 = block.mCol0[threadIndexInWarp];
+		float4 v1 = block.mCol1[threadIndexInWarp];
+		float v2 = block.mCol2[threadIndexInWarp];
+
+		mat.column0.x = v0.x; mat.column0.y = v0.y; mat.column0.z = v0.z;
+		mat.column1.x = v0.w; mat.column1.y = v1.x; mat.column1.z = v1.y;
+		mat.column2.x = v1.z; mat.column2.y = v1.w; mat.column2.z = v2;
+	}
+
+	PX_FORCE_INLINE PX_CUDA_CALLABLE void storePxMat33(PxgMat33Block& block, const PxU32 threadIndexInWarp, const PxMat33& mat)
+	{
+		block.mCol0[threadIndexInWarp] = make_float4(mat.column0.x, mat.column0.y, mat.column0.z, mat.column1.x);
+		block.mCol1[threadIndexInWarp] = make_float4(mat.column1.y, mat.column1.z, mat.column2.x, mat.column2.y);
+		block.mCol2[threadIndexInWarp] = mat.column2.z;
+	}
+
+	PX_FORCE_INLINE PX_CUDA_CALLABLE Cm::UnAlignedSpatialVector loadSpatialVector(const Cm::UnAlignedSpatialVector* PX_RESTRICT vector)
+	{
+		size_t ptr = size_t(vector);
+		if (ptr & 0xf)
+		{
+			float2* top = reinterpret_cast<float2*>(ptr);
+			float2 val = *top;
+			float4 val2 = *(reinterpret_cast<float4*>(top+1));
+
+			return Cm::UnAlignedSpatialVector(PxVec3(val.x, val.y, val2.x), PxVec3(val2.y, val2.z, val2.w));
+		}
+		else
+		{
+			float4* top = reinterpret_cast<float4*>(ptr);
+			float4 val = *top;
+			float2 val2 = *(reinterpret_cast<float2*>(top + 1));
+
+			return Cm::UnAlignedSpatialVector(PxVec3(val.x, val.y, val.z), PxVec3(val.w, val2.x, val2.y));
+		}
+	}
+
+	PX_FORCE_INLINE PX_CUDA_CALLABLE void storeSpatialVector(Cm::UnAlignedSpatialVector* PX_RESTRICT vector,
+		const Cm::UnAlignedSpatialVector& src)
+	{
+		size_t ptr = size_t(vector);
+		if (ptr & 0xf)
+		{
+			float2* top = reinterpret_cast<float2*>(ptr);
+			float4* bottom = (reinterpret_cast<float4*>(top + 1));
+
+			*top = make_float2(src.top.x, src.top.y);
+			*bottom = make_float4(src.top.z, src.bottom.x, src.bottom.y, src.bottom.z);
+		}
+		else
+		{
+			float4* top = reinterpret_cast<float4*>(ptr);
+			float2* bottom = (reinterpret_cast<float2*>(top + 1));
+
+			*top = make_float4(src.top.x, src.top.y, src.top.z, src.bottom.x);
+			*bottom = make_float2(src.bottom.y, src.bottom.z);
+		}
+	}
+
+
+	struct PxgArticulationBitFieldStackData
+	{
+		PxU64 bitField[32];
+	};
+
+	typedef PxgArticulationBitFieldStackData PxgArticulationBitFieldData;
+
+	struct PxgArticulationTraversalStackData
+	{
+		PxU32 indices[32];
+		PxgSpatialVectorBlock impulseStack;
+		PxgSpatialVectorBlock deltaVStack;
+	};
+
+	struct PxgArticulationBlockLinkData
+	{
+		PxgSpatialMatrixBlock			mSpatialArticulatedInertia;
+		PxgSpatialMatrix6x6Block		mSpatialResponseMatrix;
+		PxgMat33Block					mIsolatedInertia;
+		PxReal							mMass[32];
+
+		//KS - these are questionable - we might want to store velocity/deltaVelocity etc. in a non-SOA format
+		//for the constraint solver. To be revisited later!!!!
+		PxgSpatialVectorBlock			mMotionVelocity;
+		PxgSpatialVectorBlock			mPosMotionVelocity;
+		PxgSpatialVectorBlock			mDeltaMotion;
+		PxgSpatialVectorBlock			mScratchImpulse; // These are non-propagated impulses. They may persist between kernels and are properly processed and then reset in averageLinkImpulsesAndPropagate. Should be reset to zero after propagation.
+		PxgSpatialVectorBlock			mScratchDeltaV; // Temp! Used only for propagating velocities around the tree structure within a kernel! Left in an undefined state after use.
+
+		PxgSpatialVectorBlock			mSolverSpatialDeltaVel;
+		PxgSpatialVectorBlock			mSolverSpatialImpulse;
+		PxgSpatialVectorBlock			mSolverSpatialInternalConstraintImpulse;
+
+		PxgSpatialVectorBlock			mZAVector;
+		PxgSpatialVectorBlock			mZAIntVector;
+		PxgSpatialVectorBlock			mCoriolis;
+		PxgSpatialVectorBlock			mConstraintForces;
+
+		PxgSpatialVectorBlock			mMotionAcceleration;
+		PxgSpatialVectorBlock			mMotionAccelerationInternal;
+		PxgSpatialVectorBlock			mBiasForce;
+
+		PxReal							mDeltaScale[32];
+
+		float							mRw_x[32];
+		float							mRw_y[32];
+		float							mRw_z[32];
+		float4							mRelativeQuat[32];
+
+		PxU32							mNbStaticContacts[32];
+		PxU32							mStaticContactStartIndex[32];
+
+		PxU32							mNbStaticJoints[32];
+		PxU32							mStaticJointStartIndex[32];
+
+		PxgSpatialTransformBlock		mPreTransform;
+		PxgSpatialTransformBlock		mAccumulatedPose;
+		PxgSpatialTransformBlock		mChildPose;
+		PxgSpatialTransformBlock		mParentPose;
+		float4							mDeltaQ[32];
+		float4							mLinDampingX_AngDampingY_maxLinVelSqZ_maxAngVelSqW[32];
+		float4							mInvInertiaXYZ_invMassW[32];
+		PxReal							mCfm[32];
+		PxU32							mChildrenOffset[32];
+		PxU32							mNumChildren[32];
+
+		PxU32							mJointOffset[32];
+		PxU32							mParents[32];
+		PxU8							mDofs[32];
+		PxU8							mJointType[32];
+		PxU8							mInvDofIds[6][32];			//mapping from axis to joint offset
+		PxU8							mDisableGravity[32];
+		PxU8							mRetainsAcceleration[32];
+
+		// PT: padded to 128 for coalesced loads
+		PxU32							pad[16];
+	};
+	PX_COMPILE_TIME_ASSERT((sizeof(PxgArticulationBlockLinkData) & 127)==0);
+
+	struct PxgArticulationBlockSpatialTendonData
+	{
+	public:
+		PxU32	mNumAttachments[32];
+		PxU32	mNumConstraints[32];
+		PxReal	mStiffness[32];
+		PxReal	mDamping[32];
+		PxReal	mLimitStiffness[32];
+		PxReal  mOffset[32];
+	};
+
+	struct PxgArticulationBlockFixedTendonData
+	{
+	public:
+		PxU32	mNumTendonJoints[32];
+		PxU32	mNumConstraints[32];
+		PxReal	mStiffness[32];
+		PxReal	mDamping[32];
+		PxReal	mLimitStiffness[32];
+		PxReal  mOffset[32];
+		PxReal	mLowLimit[32];
+		PxReal  mHighLimit[32];
+		PxReal	mRestLength[32];
+	};
+
+	struct PxgArtiStateDirtyFlag
+	{
+		enum
+		{
+			eVEL_DIRTY = 1 << 0,
+			eHAS_IMPULSES = 1 << 1
+		};
+	};
+
+	//This class stores a block of data corresponding to 32 articulations.
+	//It is filled in at runtime. 
+	struct PxgArticulationBlockData
+	{
+		PxgSpatialMatrixBlock mInvSpatialArticulatedInertia;
+		PxU32 mFlags[32];		// PT: seems to be mainly for PxArticulationFlag::eFIX_BASE so only 1 bit is needed? Merge with mNumLinks?
+		PxU32 mNumLinks[32];
+		PxU32 mNumSpatialTendons[32];
+		PxU32 mNumFixedTendons[32];
+		PxU32 mNumMimicJoints[32];
+		PxU32 mTotalDofs[32];
+		PxU32 mArticulationIndex[32];
+		PxReal mSleepThreshold[32];
+
+		//This one is a bit of a hack - it's the common link velocity. 
+		PxgSpatialVectorBlock mCommonLinkDeltaVelocity;
+		PxU32 mLinkWithDeferredImpulse[32];
+		Cm::UnAlignedSpatialVector* mMotionVelocitiesPtr[32];
+		PxgSpatialVectorBlock mRootDeferredZ;
+
+		float4	mCOM_TotalInvMassW[32];
+
+		PxU8  mStateDirty[32];				//32 bytes
+		PxU32 mTotalSelfConstraintCount;	//4  bytes
+		PxU32 mSelfConstraintOffset;		//4 bytes
+		// PT: padded to 128 for coalesced loads
+		PxU32 pad[22];						//88 bytes padding, bringing this all up to 128 bytes
+	};
+	PX_COMPILE_TIME_ASSERT((sizeof(PxgArticulationBlockData) & 127)==0);
+
+	struct PxgArticulationInternalConstraintData
+	{
+		PxgSpatialVectorBlock		mRow0;
+		PxgSpatialVectorBlock		mRow1;
+		PxgSpatialVectorBlock		mDeltaVA;
+		PxgSpatialVectorBlock		mDeltaVB;
+
+		PxReal						mRecipResponse[32];
+		PxReal						mResponse[32];
+
+		float2						mLimits_LowLimitX_highLimitY[32];
+		float2						mLimitError_LowX_highY[32];
+		PxReal						mHighImpulse[32];
+		PxReal						mLowImpulse[32];
+		PxReal						mMaxJointVelocity[32];
+		
+		// drive 
+		PxReal						mDriveImpulse[32];
+		PxReal						mConstraintMaxImpulse[32];
+		PxReal						mMaxForce[32];
+		PxU32						mDriveType[32];
+		PxReal						mMaxEffort[32];
+		PxReal						mMaxActuatorVelocity[32];
+		PxReal						mVelocityDependentResistance[32];
+		PxReal						mSpeedEffortGradient[32];
+
+		PxReal						mDriveTargetPos[32];
+		PxReal						mArmature[32];
+
+		float4                      mTargetVelPlusInitialBiasX_DriveBiasCoefficientY_VelMultiplierZ_ImpulseMultiplierW[32];
+
+		float						mDriveStiffness[32];
+		float						mDamping[32];
+		float						mDriveTargetVel[32];
+		float						mTargetPosBias[32];
+
+		PxReal						mAccumulatedFrictionImpulse[32];
+
+		//old friction
+		PxReal						mMaxFrictionForce[32];
+		PxReal						mFrictionCoefficient[32];
+
+		//new friction
+		PxReal						mStaticFrictionEffort[32];
+		PxReal						mDynamicFrictionEffort[32];
+		PxReal						mViscousFrictionCoefficient[32];
+
+		PX_FORCE_INLINE void PX_CUDA_CALLABLE setImplicitDriveDesc
+			(const PxU32 threadIndexInWarp,const Dy::ArticulationImplicitDriveDesc& driveDesc)
+		{
+		    mTargetVelPlusInitialBiasX_DriveBiasCoefficientY_VelMultiplierZ_ImpulseMultiplierW[threadIndexInWarp] =
+		        make_float4(driveDesc.driveTargetVelPlusInitialBias, driveDesc.driveBiasCoefficient, driveDesc.driveVelMultiplier, driveDesc.driveImpulseMultiplier);
+		    mTargetPosBias[threadIndexInWarp] = driveDesc.driveTargetPosBias;
+		}
+		PX_FORCE_INLINE PX_CUDA_CALLABLE Dy::ArticulationImplicitDriveDesc getImplicitDriveDesc(const PxU32 threadIndexInWarp) const
+		{
+			const Dy::ArticulationImplicitDriveDesc driveDesc
+			(
+		        mTargetVelPlusInitialBiasX_DriveBiasCoefficientY_VelMultiplierZ_ImpulseMultiplierW[threadIndexInWarp].x,
+		        mTargetVelPlusInitialBiasX_DriveBiasCoefficientY_VelMultiplierZ_ImpulseMultiplierW[threadIndexInWarp].y,
+		        mTargetVelPlusInitialBiasX_DriveBiasCoefficientY_VelMultiplierZ_ImpulseMultiplierW[threadIndexInWarp].z,
+		        mTargetVelPlusInitialBiasX_DriveBiasCoefficientY_VelMultiplierZ_ImpulseMultiplierW[threadIndexInWarp].w,
+				mTargetPosBias[threadIndexInWarp]
+			);
+			return driveDesc;
+		}
+	};
+	PX_COMPILE_TIME_ASSERT((sizeof(PxgArticulationInternalConstraintData) & 127)==0);
+
+	struct PxgArticulationInternalTendonConstraintData
+	{
+		PxgSpatialVectorBlock		mRow0;
+		PxgSpatialVectorBlock		mRow1;
+		
+		PxgSpatialVectorBlock		mDeltaVB;
+
+		PxReal						mRecipResponse[32];	
+		PxReal						mAccumulatedLength[32];
+		PxReal						mBiasCoefficient[32];
+		PxReal						mVelMultiplier[32];
+		PxReal						mImpulseMultiplier[32];
+		PxReal						mAppliedForce[32];
+
+		PxReal						mLimitBiasCoefficient[32];
+		PxReal						mLimitImpulseMultiplier[32];
+		PxReal						mLimitAppliedForce[32];
+
+		PxU32						mLink0[32];
+		PxU32						mLink1[32];
+		PxReal						mDeltaVA[32];
+
+		PxReal						mRestDistance[32];
+		PxReal						mLowLimit[32];
+		PxReal						mHighLimit[32];
+	};
+
+	struct PxgArticulationBlockDofData
+	{
+		PxReal						mJointPositions[32];
+		PxReal						mJointVelocities[32];
+		PxReal						mJointUnconstrainedVelocities[32]; 
+		PxReal						mPosJointVelocities[32];
+		PxReal						mJointAccel[32];
+		PxReal						mQstZ[32];
+		PxReal						mQstZIcInternal[32];
+		PxReal						mDeferredQstZ[32];
+		PxReal						mTmpQstZ[32];
+
+		PxgSpatialVectorBlock		mWorldMotionMatrix;
+		PxgSpatialVectorBlock		mLocalMotionMatrix;
+		PxgSpatialVectorBlock		mIsInvDW;
+		PxgSpatialVectorBlock		mIsW;
+		PxgSpatialVectorBlock		mJointAxis;
+		float						mInvStIsT_x[32];
+		float						mInvStIsT_y[32];
+		float						mInvStIsT_z[32];
+		PxU8						mDofIds[32];			//mapping from joint offset to axis
+		PxU8						mMotion[32];
+
+		PxgArticulationInternalConstraintData mConstraintData;
+
+		// PT: padded to 128 for coalesced loads
+		PxU32							pad[16];
+	};
+	PX_COMPILE_TIME_ASSERT((sizeof(PxgArticulationBlockDofData) & 127)==0);
+
+	struct PxgArticulationBlockAttachmentData
+	{
+	public:
+		PxVec3	mRelativeOffset[32];
+		PxReal	mRestDistance[32];
+		PxReal	mCoefficient[32];
+
+		PxU64	mChildrens[32];
+		PxU16	mLinkIndex[32];
+		PxU32	mParents[32];
+		PxReal	mLowLimit[32];
+		PxReal	mHighLimit[32];	
+	};
+
+	struct PxgArticulationBlockTendonJointData
+	{
+	public:
+		PxU32						mAxis[32];
+		PxReal						mCoefficient[32];
+		PxReal						mRecipCoefficient[32];
+
+		PxU64						mChildrens[32];
+		PxU16						mLinkIndex[32];
+		PxU32						mParents[32];
+		PxU32						mConstraintId[32];
+	};
+
+	struct PxgArticulationBlockInternallMimicJointData
+	{
+		PxU32 mDofA[32];
+		PxU32 mDofB[32];
+		//Cache recip effectiveInertia = [J * M^-1 * J^T]  =  [rAA + gearRatio*(rAB + rBA) + gearRatio*gearRatio*rBB]
+		//Impulse = [1, gearRatio]^T * [-b + J*v] /[J * M^-1 * J^T + cfm]
+		//Impulse = [1, gearRatio]^T * [-b + J*v] / [recipEffectiveInertia + cfm];
+		PxReal recipEffectiveInertia[32];
+	};
+
+	struct PxgArticulationBlockMimicJointData
+	{
+		PxU32 mLinkA[32];
+		PxU32 mAxisA[32];
+		PxU32 mLinkB[32];
+		PxU32 mAxisB[32];
+		PxReal mGearRatio[32];
+		PxReal mOffset[32];
+		PxReal mNaturalFrequency[32];
+		PxReal mDampingRatio[32];
+		PxgArticulationBlockInternallMimicJointData mInternalData;
+	};
+
+	struct PxgArticulationData
+	{
+		// index in the articulation buffer. This is the same as articulationRemapId in the PxgBodySim
+		PxU32					index;							
+		PxU32					bodySimIndex;					
+		PxU32					gpuDirtyFlag;					
+		PxU32					updateDirty;					
+
+		PxU16					numLinks;						
+		PxU16					numJointDofs;					
+		PxU16					numSpatialTendons;				
+
+		PxU16					numFixedTendons;				
+		PxU16					numMimicJoints;					
+		PxU8					flags;							
+		bool					confiDirty;						
+
+		PxU32					numPathToRoots;					
+	};
+
+#if PX_VC
+#pragma warning(push)
+#pragma warning(disable:4324)
+#endif
+	PX_ALIGN_PREFIX(16)
+	class PxgArticulation
+	{
+	public:
+	
+		PxgArticulationData						data;							
+
+		PxgArticulationLink*					links;							
+		Dy::ArticulationJointCore*				joints;							
+		Dy::ArticulationJointCoreData*			jointData;						
+		
+		Cm::UnAlignedSpatialVector*				motionVelocities;				
+		Cm::UnAlignedSpatialVector*				motionAccelerations;			
+		Cm::UnAlignedSpatialVector*				linkIncomingJointForces;	
+		Cm::UnAlignedSpatialVector*				corioliseVectors;				
+		Cm::UnAlignedSpatialVector*				zAForces; // used as temporary propagation buffer in inverseDynamics and to store TGS per substep isolated forces while the solver runs. Not cleared after use.
+		
+		Cm::UnAlignedSpatialVector*				externalAccelerations;			
+
+		Cm::UnAlignedSpatialVector*				rootPreMotionVelocity;
+
+		PxReal*									jointPositions;					
+		PxReal*									jointVelocities;				
+		PxReal*									jointAccelerations;				
+		PxReal*									jointForce;						
+		PxReal*									jointTargetPositions;
+		PxReal*									jointTargetVelocities;
+		
+		PxU32*									jointOffsets;					
+
+		PxSpatialMatrix*						worldSpatialArticulatedInertia;	
+		PxSpatialMatrix*						spatialResponseMatrixW;
+		PxTransform*							linkBody2Worlds;	
+		PxU8*									linkJointRootStateDataBuffer;			
+		PxTransform*							linkBody2Actors;
+		PxU32*									parents;						
+		//Local space motion matrix - constant unless properties are changed
+		Dy::SpatialSubspaceMatrix*				motionMatrix;					
+		//World space motion matrix - computed from local matrix each frame
+		Dy::SpatialSubspaceMatrix*				worldMotionMatrix; // AD: only inverse dynamics now.				
+
+		Cm::UnAlignedSpatialVector*				jointAxis;
+
+		PxReal*									linkWakeCounters;
+		PxgArticulationLinkSleepData*			linkSleepData;
+		PxgArticulationLinkProp*				linkProps;						
+		ArticulationBitField*					children;
+		PxU32*									pathToRoot;
+
+		PxQuat*									relativeQuat;
+
+		PxQuat*									tempParentToChilds;
+		PxVec3*									tempRs;
+
+		PxGpuSpatialTendonData*					spatialTendonParams;
+		PxgArticulationTendon*					spatialTendons;
+
+		PxGpuFixedTendonData*					fixedTendonParams;
+		PxgArticulationTendon*					fixedTendons;
+
+		PxReal*									cfms;
+		PxReal*									cfmScale;
+
+		Dy::ArticulationMimicJointCore*			mimicJointCores;
+
+		PX_ALIGN(16, PxSpatialMatrix)			invSpatialArticulatedInertiaW;
+
+		Dy::ErrorAccumulator					internalResidualAccumulator; //Internal residual means no errors introduces by contacts or non-articulation joints connected to this instance will be included
+		Dy::ErrorAccumulator					contactResidualAccumulator;
+	}
+	PX_ALIGN_SUFFIX(16);
+#if PX_VC
+#pragma warning(pop)
+#endif
+
+	/*
+	\brief We aggregate link, joint, and root link state data into a single char buffer.
+	We do this in two ways:
+	a) a single buffer for each articulation
+	b) a single buffer for all articulations
+	The typical pattern of data flow is as follows:
+	a) we store state data in a unique device buffer for each articulation.
+	b) we copy from the individual device buffers per articulation to the single device buffer for all articulations.
+	c) we copy the single buffer for all articulations from device to host 
+	d) we distribute state data from the single host buffer on the host to the individual articulation instances on the host.
+	The state data that we store is as follows:
+	a) link body2Worlds array, link velocities array, link accelerations array, link incoming joint forces array
+	b) joint positions array, joint velocities array, joint accelerations array
+	d) root link pre-sim velocity
+	The struct PxgLinkJointRootStateData contains helper functions for allocating and querying 
+	state data buffers.
+	*/
+	struct PxgArticulationLinkJointRootStateData
+	{/**
+		\brief Compute the number of bytes required for an articulation with known 
+		maximum link count and known maximum dof count.
+		\param[in] maxNbLinks is the maximum number of links of any articulation in the ensemble of articulations.
+		\param[in] maxNbDofs is the maximum number of dofs of any articulation in the ensemble of articulations.
+		\note This does not return an aligned size, use computeStateDataBufferByteSizeAligned16 for that purpose
+		\return The number of bytes required to store the state data for an articulation.
+		*/
+		static PX_CUDA_CALLABLE PX_FORCE_INLINE PxU32 computeSingleArticulationStateDataBufferByteSizeRaw
+		(const PxU32 maxNbLinks, const PxU32 maxNbDofs)
+		{
+			PxU32 byteSizePerArt =
+				(sizeof(PxTransform) + 3 * sizeof(Cm::UnAlignedSpatialVector)) * maxNbLinks;		//link pose + link velocity + link acceleration + link incoming joint force
+			byteSizePerArt += sizeof(PxReal) * maxNbDofs; //joint pos
+			byteSizePerArt += sizeof(PxReal) * maxNbDofs; //joint vel
+			byteSizePerArt += sizeof(PxReal) * maxNbDofs; //joint accel
+			byteSizePerArt += sizeof(Cm::UnAlignedSpatialVector);											//root pre-sim vel
+			return byteSizePerArt;
+		}
+
+		/**
+		\brief Compute the number of bytes required for an ensemble of articulations with known 
+		maximum link count and known maximum dof count.
+		\param[in] maxNbLinks is the maximum number of links of any articulation in the ensemble of articulations.
+		\param[in] maxNbDofs is the maximum number of dofs of any articulation in the ensemble of articulations.
+		\param[in] nbArticulations is the number of articulations in the ensemble. 
+		\note This may be used to compute the number of bytes required for a single articulation by setting 
+		nbArticulations to 1 and setting maxNbLinks etc to be the link and dof count of that articulation.
+		\return The number of bytes required to store the state data for an ensemble of articulations.
+		*/
+		static PX_CUDA_CALLABLE PX_FORCE_INLINE PxU32 computeStateDataBufferByteSizeAligned16
+		(const PxU32 maxNbLinks, const PxU32 maxNbDofs, const PxU32 nbArticulations)
+		{
+			const PxU32 byteSizePerArt = computeSingleArticulationStateDataBufferByteSizeRaw(maxNbLinks, maxNbDofs);
+			const PxU32 byteSize16PerArt = ((byteSizePerArt + 15) & ~15);						//align the size upwards to the next 16-byte boundary
+			return (byteSize16PerArt * nbArticulations);											
+		}
+
+		/**
+		\brief Return the pointer to a single articulation's state data buffer.
+		\param[in] inputBufferForAllArticulations is a pointer to the memory containing the state
+		data for the entire ensemble of articulations.
+		\param[in] maxNbLinks is the maximum number of links of any articulation in the ensemble of articulations.
+		\param[in] maxNbDofs is the maximum number of dofs of any articulation in the ensemble of articulations.
+		\param[in] articulationId is the index of a single articulation within the ensemble. 
+		\return The pointer to a single articulation's state data buffer.
+		*/
+		static PX_CUDA_CALLABLE PX_FORCE_INLINE PxU8* getArticulationStateDataBuffer
+		(PxU8* inputBufferForAllArticulations,
+		 const PxU32 maxNbLinks, const PxU32 maxNbDofs,
+		 const PxU32 articulationId)
+		{
+			PxU8* singleArticulationStateBuffer =
+				inputBufferForAllArticulations + 
+				computeStateDataBufferByteSizeAligned16(maxNbLinks, maxNbDofs, articulationId);
+
+			return singleArticulationStateBuffer;
+		}
+
+		/**
+		\brief Decompose the state data buffer of a single articulation into pointers to arrays of 
+		poses, velocities etc.
+		\param[in] singleArticulationStateBuffer is the state data buffer for a single articulation.
+		\param[in] nbLinks is the number of links for the single articulation. 
+		\param[in] nbDofs is the number of dofs for the single articulation. 
+		\param[out] linkBody2Worlds is an array of link poses with one element per link.
+		\param[out] linkVels is a pointer to an array of link spatial velocities with one element per link.
+		\param[out] linkIncomingJointForces is a pointer to an array of link incoming joint forces with one element per link.
+		\param[out] jointPositions is a pointer to an array of joint positions with one element per dof.
+		\param[out] jointVelocities is a pointer to an array of joint velocities with one element per link.
+		\param[out] jointAccelerations is a pointer to an array of joint accelerations with one element per dof.
+		\param[out] rootPreVel is a pointer to the pre-sim velocity of the single articulation's root link. 
+		*/
+		static PX_CUDA_CALLABLE PX_FORCE_INLINE void decomposeArticulationStateDataBuffer
+		(PxU8* singleArticulationStateBuffer,
+		 const PxU32 nbLinks, const PxU32 nbDofs,
+		 PxTransform*& linkBody2Worlds, Cm::UnAlignedSpatialVector*& linkVels, Cm::UnAlignedSpatialVector*& linkAccels, Cm::UnAlignedSpatialVector*& linkIncomingJointForces,
+		 PxReal*& jointPositions, PxReal*& jointVelocities, PxReal*& jointAccelerations,
+		 Cm::UnAlignedSpatialVector*& rootPreVel)
+		{
+			PxU8* buffer = singleArticulationStateBuffer;
+			linkBody2Worlds = reinterpret_cast<PxTransform*>(buffer);
+			buffer += sizeof(PxTransform) * nbLinks;
+			linkVels = reinterpret_cast<Cm::UnAlignedSpatialVector*>(buffer);
+			buffer += sizeof(Cm::UnAlignedSpatialVector) * nbLinks;
+			linkAccels = reinterpret_cast<Cm::UnAlignedSpatialVector*>(buffer);
+			buffer += sizeof(Cm::UnAlignedSpatialVector) * nbLinks;
+			linkIncomingJointForces = reinterpret_cast<Cm::UnAlignedSpatialVector*>(buffer);
+			buffer += sizeof(Cm::UnAlignedSpatialVector) * nbLinks;
+			jointPositions = reinterpret_cast<PxReal*>(buffer);
+			buffer += sizeof(PxReal) * nbDofs;
+			jointVelocities = reinterpret_cast<PxReal*>(buffer);
+			buffer += sizeof(PxReal) * nbDofs;
+			jointAccelerations = reinterpret_cast<PxReal*>(buffer);
+			buffer += sizeof(PxReal) * nbDofs;
+			rootPreVel =  reinterpret_cast<Cm::UnAlignedSpatialVector*>(buffer);
+			PX_ASSERT(
+				singleArticulationStateBuffer + computeStateDataBufferByteSizeAligned16(nbLinks, nbDofs, 1) == 
+				reinterpret_cast<PxU8*>(((reinterpret_cast<size_t>(buffer) + sizeof(Cm::UnAlignedSpatialVector) + 15) & ~15)));
+		}
+
+		/**
+		\brief Compute the pointer to the array of link poses for a single articulation.
+		\param[in] singleArticulationStateBuffer is the state data buffer for a single articulation.
+		\param[in] nbLinks is the number of links for the single articulation. 
+		\param[in] nbDofs is the number of dofs for the single articulation. 
+		\return The pointer to the array of link poses for a single articulation. 
+		*/
+		//Get the body2World array from the state data buffer of a single articulation.
+		static PX_CUDA_CALLABLE PX_FORCE_INLINE PxTransform* getArticulationBody2Worlds
+		(PxU8* singleArticulationStateBuffer, 
+		 const PxU32 nbLinks, const PxU32 nbDofs)
+		{
+			PxTransform* linkBody2Worlds;
+			Cm::UnAlignedSpatialVector* linkVels;
+			Cm::UnAlignedSpatialVector* linkAccels;
+			Cm::UnAlignedSpatialVector* linkIncomingJointForces;
+			PxReal* jointPositions;
+			PxReal* jointVelocities;
+			PxReal* jointAccelerations;
+			Cm::UnAlignedSpatialVector* rootPreVel;
+
+			decomposeArticulationStateDataBuffer(
+				singleArticulationStateBuffer,
+				nbLinks, nbDofs, 
+				linkBody2Worlds, linkVels, linkAccels, linkIncomingJointForces,
+				jointPositions, jointVelocities, jointAccelerations,
+				rootPreVel);
+	
+			return linkBody2Worlds;
+		}
+	};
+
+	
+	class PxgArticulationBuffer : public PxUserAllocated
+	{
+	public:
+
+		PxgArticulationBuffer(PxgHeapMemoryAllocatorManager* heapMemoryManager);
+
+		~PxgArticulationBuffer();
+
+		PxgTypedCudaBuffer<PxgArticulationLink>					links;
+		PxgTypedCudaBuffer<PxReal>								linkWakeCounters;       //original set to the same as articulation wakeCounter
+		PxgTypedCudaBuffer<PxgArticulationLinkSleepData>		linkSleepData;
+		PxgTypedCudaBuffer<PxgArticulationLinkProp>				linkProps;
+		PxgTypedCudaBuffer<Dy::ArticulationJointCore>			joints;
+		PxgTypedCudaBuffer<Dy::ArticulationJointCoreData>		jointData;
+		PxgTypedCudaBuffer<Cm::UnAlignedSpatialVector>			corioliseVectors;       //link coriolise vector
+		PxgTypedCudaBuffer<Cm::UnAlignedSpatialVector>			zAForces;               //link spatial zero acceleration force/ spatical articulate 
+		PxgTypedCudaBuffer<PxU32>								pathToRoots;            //global array store path to root for each link in continuous. Each link should have a start index and numberOfElems
+
+		PxgTypedCudaBuffer<PxGpuSpatialTendonData>				spatialTendonParams;
+		PxgTypedCudaBuffer<PxgArticulationTendon>				spatialTendons;
+		PxArray<PxgCudaBuffer*>									attachmentFixedData;
+		PxArray<PxgCudaBuffer*>									attachmentModData;
+
+		PxgTypedCudaBuffer<PxGpuFixedTendonData>				fixedTendonParams;
+		PxgTypedCudaBuffer<PxgArticulationTendon>				fixedTendons;
+		PxArray<PxgCudaBuffer*>									tendonJointFixData;
+		PxArray<PxgCudaBuffer*>									tendonJointCoefficientData;
+
+		PxgTypedCudaBuffer<Dy::ArticulationMimicJointCore>		mimicJoints;
+
+		PxgTypedCudaBuffer<Cm::UnAlignedSpatialVector>			externalAccelerations;
+
+		PxgTypedCudaBuffer<PxReal>								jointForce;
+		PxgTypedCudaBuffer<PxReal>								jointTargetPositions;
+		PxgTypedCudaBuffer<PxReal>								jointTargetVelocities;
+		PxgTypedCudaBuffer<PxU32>								jointOffsets;
+		PxgTypedCudaBuffer<PxU32>								parents;
+		PxgTypedCudaBuffer<Dy::SpatialSubspaceMatrix>			motionMatrix;
+		PxgTypedCudaBuffer<Dy::SpatialSubspaceMatrix>			motionMatrixW;
+		PxgTypedCudaBuffer<Cm::UnAlignedSpatialVector>			jointAxis;
+
+		PxgTypedCudaBuffer<PxSpatialMatrix>						spatialArticulatedInertiaW;
+		PxgTypedCudaBuffer<PxSpatialMatrix>						spatialImpulseResponseW;
+
+		//see PxgArticulationLinkJointRootStateData
+		PxgCudaBuffer											linkAndJointAndRootStates;
+
+		PxgTypedCudaBuffer<PxTransform>							linkBody2Actors;
+
+		PxgTypedCudaBuffer<ArticulationBitField>				children;
+														 
+		PxgTypedCudaBuffer<PxQuat>								relativeQuats;
+		PxgTypedCudaBuffer<PxReal>								cfms;
+		PxgTypedCudaBuffer<PxReal>								cfmScale;
+
+		PxgTypedCudaBuffer<PxQuat>								tempParentToChilds;
+		PxgTypedCudaBuffer<PxVec3>								tempRs; 
+
+		PxU32													linkCount;
+
+		PxgHeapMemoryAllocatorManager*							mHeapMemoryManager;
+	};
+
+	//Helper function to compute the index of a particular link's deltaV value in the deltaV buffer.
+	//We store this in a particular order to try and minimize cache misses
+	PX_FORCE_INLINE PX_CUDA_CALLABLE PxU32 computeDeltaVIndex(const PxU32 maxArticulations, const PxU32 maxLinks, 
+		const PxU32 articulationID, const PxU32 linkID, const PxU32 slabID)
+	{
+		return articulationID + linkID * maxArticulations + slabID*maxArticulations*maxLinks;
+	}
+
+}
+
+#endif