XCEngine/engine/third_party/physx/source/gpuarticulation/src/CUDA/articulationDirectGpuApi.cu

// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#include "PxDirectGPUAPI.h"
#include "foundation/PxSimpleTypes.h"
#include "foundation/PxVec3.h"
#include "foundation/PxTransform.h"
#include "foundation/PxMath.h"

#include "PxArticulationFlag.h"
#include "PxArticulationTendonData.h"
#include "PxgArticulation.h"
#include "PxgArticulationLink.h"
#include "PxgArticulationCoreDesc.h"
#include "PxgShapeSim.h"
#include "PxsTransformCache.h"


#include "DyArticulationCore.h"
#include "DyArticulationJointCore.h"
#include "DyFeatherstoneArticulation.h"
#include "DyFeatherstoneArticulationUtils.h"
#include "DyFeatherstoneArticulationJointData.h"

#include "CmSpatialVector.h"

#include "utils.cuh"
#include "reduction.cuh"
#include "updateCacheAndBound.cuh"

#include <assert.h>

using namespace physx;
using namespace Dy;

// Intentionally empty host function with C linkage; it performs no work.
// NOTE(review): presumably referenced from host code so that the kernels of this
// translation unit are linked in - confirm against the caller.
extern "C" __host__ void initArticulationKernels3()
{
}

/**
\brief Recomputes the link poses of one articulation from its joint positions and refreshes
the transform cache and bounds of the shapes attached to its links.

Must be called by all lanes of a warp: the lanes share the work for a single articulation and
meet at __syncwarp() barriers between the stages.

Stages:
 1. Lanes stride over links [1, numLinks) and write, per link, the parent-to-child rotation and
    the parent-to-child offset into articulation.tempParentToChilds / articulation.tempRs.
 2. Lane 0 walks the links in index order and composes linkBody2Worlds[link] from the pose of
    its parent. If zeroSimOutput is true it also recomputes motionVelocities[link] from the
    parent velocity and the joint velocities (without max-joint-velocity clamping).
    NOTE(review): this relies on parents[link] having been processed before link, i.e. on
    parents preceding their children in index order - confirm against the link layout.
 3. If zeroSimOutput is true, the lanes zero motionAccelerations, linkIncomingJointForces and
    jointAccelerations.
 4. If numShapes is non-zero, lanes stride over all links and call updateCacheAndBound for
    every shape whose rigid node index equals the link's node index.

\tparam zeroSimOutput    Enables the velocity update of stage 2 and the zeroing of stage 3.
\param[in] articulation      Articulation to update; its link/joint arrays are read and written.
\param[in] threadIndexInWarp Lane index of the calling thread.
\param[in] gShapeSimPool     Shape sim data, indexed by shape index.
\param[in] gConvexShapes     Forwarded to updateCacheAndBound.
\param[out] gTransformCache  Forwarded to updateCacheAndBound.
\param[in] gRigidNodeIndices Sorted rigid body node indices, one entry per shape.
\param[in] gShapeIndices     Shape index matching each entry of gRigidNodeIndices.
\param[in] numShapes         Number of entries in gRigidNodeIndices / gShapeIndices.
\param[out] bounds           Forwarded to updateCacheAndBound.
*/
template<bool zeroSimOutput>
static PX_FORCE_INLINE __device__ void updateKinematicInternal(
	const PxgArticulation& articulation,
	const PxU32 threadIndexInWarp,
	const PxgShapeSim* PX_RESTRICT gShapeSimPool,
	const PxgShape* PX_RESTRICT gConvexShapes,
	PxsCachedTransform* PX_RESTRICT gTransformCache,
	const PxNodeIndex* PX_RESTRICT gRigidNodeIndices,//sorted rigid body node index
	const PxU32* PX_RESTRICT gShapeIndices,//the corresponding shape index
	const PxU32 numShapes,
	PxBounds3* PX_RESTRICT bounds)
{
	const PxgArticulationData artiData = articulation.data;
	const PxU32 linkCount = artiData.numLinks;

	const PxU32 bodySimIndex = artiData.bodySimIndex;

	PxQuat* tempNewParentToChilds = articulation.tempParentToChilds;
	PxVec3* tempRs = articulation.tempRs;

	PxTransform* body2Worlds = articulation.linkBody2Worlds;
	ArticulationJointCore* joints = articulation.joints;
	ArticulationJointCoreData* jointData = articulation.jointData;

	const PxQuat* const PX_RESTRICT relativeQuats = articulation.relativeQuat;
	const PxU32* const PX_RESTRICT parents = articulation.parents;

	const SpatialSubspaceMatrix* const PX_RESTRICT motionMatrix = articulation.motionMatrix;

	// Stage 1: each lane deals with a joint (link 0 is the root and has no incoming joint).
	for (PxU32 linkID = 1 + threadIndexInWarp; linkID < linkCount; linkID += WARP_SIZE)
	{
		const ArticulationJointCoreData& jointDatum = jointData[linkID];

		const ArticulationJointCore& joint = joints[linkID];

		const PxReal* jPosition = &articulation.jointPositions[jointDatum.jointOffset];

		PxQuat& newParentToChild = tempNewParentToChilds[linkID];
		PxVec3& r = tempRs[linkID];

		const PxVec3 childOffset = -joint.childPose.p;
		const PxVec3 parentOffset = joint.parentPose.p;

		const PxQuat relativeQuat = relativeQuats[linkID];

		switch (joint.jointType)
		{
		case PxArticulationJointType::ePRISMATIC:
		{
			// No relative rotation beyond the rest orientation; translate along the joint axis.
			newParentToChild = relativeQuat;
			const PxVec3& u = motionMatrix[linkID][0].bottom;
			const PxVec3 e = newParentToChild.rotate(parentOffset);
			const PxVec3 d = childOffset;

			r = e + d + u * jPosition[0];

			break;
		}
		case PxArticulationJointType::eREVOLUTE:
		case PxArticulationJointType::eREVOLUTE_UNWRAPPED:
		{
			const PxVec3& u = motionMatrix[linkID][0].top;

			PxQuat jointRotation = PxQuat(-jPosition[0], u);
			if (jointRotation.w < 0)	//shortest angle.
				jointRotation = -jointRotation;

			newParentToChild = (jointRotation * relativeQuat).getNormalized();

			const PxVec3 e = newParentToChild.rotate(parentOffset);
			const PxVec3 d = childOffset;
			r = e + d;

			assert(r.isFinite());

			break;
		}
		case PxArticulationJointType::eSPHERICAL:
		{
			PxQuat jointRotation(PxIdentity);

			// Accumulate the (negated) joint positions along each DOF axis into one rotation vector.
			PxVec3 ang(0.f);
			for (PxU32 dof = 0; dof < jointDatum.nbDof; ++dof)
			{
				ang += motionMatrix[linkID][dof].top * -jPosition[dof];
			}
			PxReal angle = ang.normalize();

			// Below the threshold the axis is not usable, so fall back to identity.
			jointRotation = angle < 1e-10f ? PxQuat(PxIdentity) : PxQuat(angle, ang);
			if(jointRotation.w < 0.f)
				jointRotation = -jointRotation;

			newParentToChild = (jointRotation * relativeQuat).getNormalized();

			const PxVec3 e = newParentToChild.rotate(parentOffset);
			const PxVec3 d = childOffset;
			r = e + d;
			break;
		}
		case PxArticulationJointType::eFIX:
		{
			//this is fix joint so joint don't have velocity
			newParentToChild = relativeQuat;

			const PxVec3 e = newParentToChild.rotate(parentOffset);
			const PxVec3 d = childOffset;

			r = e + d;
			break;
		}
		default:
			break;
		}
	}

	// Stage 1 results must be visible to lane 0 before the serial traversal.
	__syncwarp();

	// Stage 2: serial traversal, because each link pose depends on the pose of its parent.
	if (threadIndexInWarp == 0)
	{
		for (PxU32 linkID = 1; linkID < linkCount; ++linkID)
		{
			const PxU32 parent = parents[linkID];

			const PxTransform pBody2World = body2Worlds[parent];

			PxQuat& newParentToChild = tempNewParentToChilds[linkID];
			PxVec3& r = tempRs[linkID];

			PxTransform& body2World = body2Worlds[linkID];

			body2World.q = (pBody2World.q * newParentToChild.getConjugate()).getNormalized();
			body2World.p = pBody2World.p + body2World.q.rotate(r);

			// we do NOT calculate updated link velocities if this is the automatic pre-sim updateKinematic call, because we will immediately do it again because we also need to clamp the joint velocities.
			// if this was a velocity-only update we already skip in the parent function, so we only end up here if positions are dirty.
			if (zeroSimOutput)
			{
				// link velocity update - unfortunately also dependent on parent position and velocity.
				Cm::UnAlignedSpatialVector parentVel = articulation.motionVelocities[parent];
				const PxVec3 c2p = body2World.p - pBody2World.p;
				Cm::UnAlignedSpatialVector linkVelocity = FeatherstoneArticulation::translateSpatialVector(-c2p, parentVel);

				// AD unfortunately this is more-or-less the same code as in computeLinkVelocities, minus the maxJointVel clamping.
				const ArticulationJointCoreData& jointDatum = jointData[linkID];
				const PxReal* jVelocity = &articulation.jointVelocities[jointDatum.jointOffset];
				for (PxU32 ind = 0; ind < jointDatum.nbDof; ++ind)
				{
					const Cm::UnAlignedSpatialVector worldCol = motionMatrix[linkID][ind].rotate(body2World);
					const PxReal jVel = jVelocity[ind];
					linkVelocity += worldCol * jVel;
				}

				articulation.motionVelocities[linkID] = linkVelocity;
			}
		}
	}

	__syncwarp();

	// Stage 3: zero the simulation outputs.
	if (zeroSimOutput) // AD: we could potentially ingest this into the link traversal++, but let's not get ahead of ourselves.
	{
		const PxU32 numDofs = artiData.numJointDofs;
		PxReal* PX_RESTRICT linkAccelData = reinterpret_cast<PxReal*>(articulation.motionAccelerations);
		PxReal* PX_RESTRICT linkIncomingJointForceData = reinterpret_cast<PxReal*>(articulation.linkIncomingJointForces);

		// The per-link spatial vectors are cleared as flat PxReal arrays.
		const PxU32 numRealsForSpatialVector = sizeof(Cm::UnAlignedSpatialVector) / sizeof(PxReal);
		const PxU32 linkCountReal = linkCount * numRealsForSpatialVector;

		// One strided loop covers both the per-link and the per-DOF arrays.
		const PxU32 clearCount = PxMax(numDofs, linkCountReal);

		for (PxU32 index = threadIndexInWarp; index < clearCount; index += WARP_SIZE)
		{
			if (index < linkCountReal)
			{
				linkAccelData[index] = 0.f;
				linkIncomingJointForceData[index] = 0.f;
			}

			if (index < numDofs)
			{
				articulation.jointAccelerations[index] = 0.f;
			}
		}
	}

	if (numShapes == 0)
	{
		// guard against no actors have shapes in scene, in which case gRigidNodeIndices, etc. would have zero elements
		// (numShapes is the same for every lane, so the whole warp leaves together)
		return;
	}

	__syncwarp();

	const PxTransform* PX_RESTRICT linkBody2Actors = articulation.linkBody2Actors;

	// Stage 4: each lane deals with a link
	for (PxU32 linkID = threadIndexInWarp; linkID < linkCount; linkID += WARP_SIZE)
	{
		const PxNodeIndex linkNodeIndex(bodySimIndex, linkID);
		const PxTransform body2World = body2Worlds[linkID];
		const PxTransform body2Actor = linkBody2Actors[linkID];

		// Locate an entry for this link in the sorted node index array.
		// NOTE(review): the backward walk below assumes binarySearch returns the last matching
		// position, or 0xFFffFFff when there is none - confirm against utils.cuh.
		PxU32 pos = binarySearch<PxNodeIndex>(gRigidNodeIndices, numShapes, linkNodeIndex);

		// go backward through the sorted rigid node index array which has an entry for each
		// shape of the link, and update the shape if it belongs to the link, i.e. the rigid
		// node index matches the link node index.
		// Decrementing pos == 0 wraps to 0xFFffFFff, which also terminates the loop.
		while (pos != 0xFFffFFff && gRigidNodeIndices[pos] == linkNodeIndex)
		{
			const PxU32 shapeIndex = gShapeIndices[pos];
			if (shapeIndex != 0xFFffFFff)
			{
				const PxgShapeSim& shapeSim = gShapeSimPool[shapeIndex];

				const PxTransform absPos = getAbsPose(body2World, shapeSim.mTransform, body2Actor);

				//update broad phase bound, transform cache
				updateCacheAndBound(absPos, shapeSim, shapeIndex, gTransformCache, bounds, gConvexShapes, true);
			}
			pos--;
		}
	}
}

/**
\brief Runs the kinematic update for every articulation flagged with eNEEDS_KINEMATIC_UPDATE.

Called after the user updated the GPU buffers (root transform, joint positions, velocities).

Launch layout: blocks of 32x2 threads (asserted below); each warp processes one articulation
at a time and strides over the articulation list by the number of warps in the grid.

\param[in] scDesc            Core descriptor providing the articulation array and count.
\param[in] gShapeSimPool     Forwarded to updateKinematicInternal.
\param[in] gConvexShapes     Forwarded to updateKinematicInternal.
\param[out] gTransformCache  Forwarded to updateKinematicInternal.
\param[in] gRigidNodeIndices Forwarded to updateKinematicInternal.
\param[in] gShapeIndices     Forwarded to updateKinematicInternal.
\param[in] numShapes         Forwarded to updateKinematicInternal.
\param[out] bounds           Forwarded to updateKinematicInternal.
\param[in] gpuIndices        Articulation indices to process, or NULL to process all articulations.
\param[in] nbElements        Number of entries in gpuIndices; can be 0 in combination with a NULL index buffer.
\param[in] zeroSimOutput     Selects the updateKinematicInternal<true> variant. When false, articulations
                             without dirty root transform / joint positions are skipped.
*/
extern "C" __global__ void artiUpdateKinematic(
	const PxgArticulationCoreDesc* const PX_RESTRICT scDesc,
	const PxgShapeSim* PX_RESTRICT gShapeSimPool,
	const PxgShape* PX_RESTRICT gConvexShapes,
	PxsCachedTransform* PX_RESTRICT gTransformCache,
	const PxNodeIndex* PX_RESTRICT gRigidNodeIndices,
	const PxU32* PX_RESTRICT gShapeIndices,
	const PxU32 numShapes,
	PxBounds3* PX_RESTRICT bounds,
	const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,	// NULL: process all the dirty articulations
	const PxU32 nbElements,						// can be 0 in combination with NULL index buffer.
    bool zeroSimOutput)
{
	// we launch blocks of 32x2 threads. 1 warp deals with 1 articulation.
	assert(blockDim.x == 32);
	assert(blockDim.y == 2);

	const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE - 1);
	const PxU32 globalWarpIndex = blockIdx.x * blockDim.y + threadIdx.y;
	const PxU32 warpsPerGrid = gridDim.x * blockDim.y;

	const PxU32 nbArticulations = gpuIndices ? nbElements : scDesc->nbArticulations;

	const PxU32 positionMask = (ArticulationDirtyFlag::eDIRTY_ROOT_TRANSFORM | ArticulationDirtyFlag::eDIRTY_POSITIONS);

	// The loop bounds depend only on the warp index, so all lanes of a warp run the same iterations.
	for (PxU32 i = globalWarpIndex; i < nbArticulations; i += warpsPerGrid)
	{
		const PxU32 articulationIndex = gpuIndices ? gpuIndices[i] : i;

		PxgArticulation& articulation = scDesc->articulations[articulationIndex];

		// Only lane 0 reads and resets the dirty flag; the value it saw is then broadcast.
		// Letting every lane read-then-clear the flag on its own is a race: lanes may arrive here
		// diverged (e.g. out of the previous iteration's shape loop), and a late lane could observe
		// the already cleared flag and skip the update while the others wait in __syncwarp().
		PxU32 dirtyFlag = 0;
		if (threadIndexInWarp == 0)
		{
			dirtyFlag = articulation.data.gpuDirtyFlag;
			if (dirtyFlag & ArticulationDirtyFlag::eNEEDS_KINEMATIC_UPDATE)
				articulation.data.gpuDirtyFlag &= ~(ArticulationDirtyFlag::eNEEDS_KINEMATIC_UPDATE);
		}
		// Also reconverges the warp, so all lanes take the same branch below.
		dirtyFlag = __shfl_sync(0xffffffffu, dirtyFlag, 0);

		if (!(dirtyFlag & ArticulationDirtyFlag::eNEEDS_KINEMATIC_UPDATE))
			continue;

		// if we only have dirty velocities and this is the pre-sim automatic call, we can skip that articulation.
		// The link velocities will be updated in computeLinkVelocities anyway.
		const bool positionsDirty = (dirtyFlag & positionMask) != 0;

		if (!zeroSimOutput && !positionsDirty)
			continue;

		if (zeroSimOutput)
		{
			updateKinematicInternal<true>(articulation, threadIndexInWarp, gShapeSimPool, gConvexShapes,
				gTransformCache, gRigidNodeIndices, gShapeIndices, numShapes, bounds);
		}
		else
		{
			updateKinematicInternal<false>(articulation, threadIndexInWarp, gShapeSimPool, gConvexShapes,
				gTransformCache, gRigidNodeIndices, gShapeIndices, numShapes, bounds);
		}
	}
}

/**
\brief Copies one per-DOF array of the selected articulations into a dense output buffer.

One thread handles one DOF; only the x dimension of the launch is used for indexing.
Output layout: nbElements rows of maxDofs PxReal, row i belonging to articulation index[i].
Entries at or beyond the articulation's numJointDofs are left untouched.

\param[in] scDesc     Core descriptor providing the articulation array.
\param[out] data      Output buffer, maxDofs * nbElements PxReal.
\param[in] index      Articulation index per output row.
\param[in] nbElements Number of rows / entries in index.
\param[in] maxDofs    Row stride of the output buffer, in PxReal.
\param[in] type       Selects which joint array is read; must be one of the eJOINT_* values handled below.
*/
extern "C" __global__ void getArtiDofStates(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	PxReal* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT index,
	const PxU32 nbElements,
	const PxU32 maxDofs,
	PxArticulationGPUAPIReadType::Enum type
)
{
	// 1 thread - 1 dof.
	// output has maxDofs * PxReal * nbElements size.

	const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
	const PxU32 groupIndex = globalThreadIndex / maxDofs;
	const PxU32 dofIndex = globalThreadIndex % maxDofs;

	if (groupIndex < nbElements)
	{
		const PxArticulationGPUIndex articulationIndex = index[groupIndex];
		const PxgArticulation& articulation = scDesc->articulations[articulationIndex];

		PxReal* PX_RESTRICT dstData = &data[groupIndex * maxDofs];
		const PxReal* PX_RESTRICT srcData;

		switch(type)
		{
			case PxArticulationGPUAPIReadType::eJOINT_POSITION:
			{
				srcData = articulation.jointPositions;
				break;
			}
			case PxArticulationGPUAPIReadType::eJOINT_VELOCITY:
			{
				srcData = articulation.jointVelocities;
				break;
			}
			case PxArticulationGPUAPIReadType::eJOINT_ACCELERATION:
			{
				srcData = articulation.jointAccelerations;
				break;
			}
			case PxArticulationGPUAPIReadType::eJOINT_FORCE:
			{
				srcData = articulation.jointForce;
				break;
			}
			case PxArticulationGPUAPIReadType::eJOINT_TARGET_VELOCITY:
			{
				srcData = articulation.jointTargetVelocities;
				break;
			}
			case PxArticulationGPUAPIReadType::eJOINT_TARGET_POSITION:
			{
				srcData = articulation.jointTargetPositions;
				break;
			}
			default:
				// Unsupported read type. The assert is compiled out in release builds, so bail out
				// explicitly instead of reading through an uninitialized source pointer.
				assert(0);
				return;
		}

		const PxU32 artiDofs = articulation.data.numJointDofs;
		if (dofIndex < artiDofs)
		{
			dstData[dofIndex] = srcData[dofIndex];
		}
	}
}

/**
\brief Writes the actor-space global pose of articulation links into a dense output buffer.

One thread handles one link; only the x dimension of the launch is used for indexing.
Output layout: nbElements rows of maxLinks PxTransform, row i belonging to articulation index[i].
For eROOT_GLOBAL_POSE only link 0 of each row is written, otherwise links [0, numLinks).

\param[in] scDesc     Core descriptor providing the articulation array.
\param[out] data      Output buffer of transforms.
\param[in] index      Articulation index per output row.
\param[in] nbElements Number of rows / entries in index.
\param[in] maxLinks   Row stride of the output buffer, in PxTransform.
\param[in] type       eROOT_GLOBAL_POSE restricts the copy to the root link.
*/
extern "C" __global__ void getArtiTransformStates(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	PxTransform* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT index,
	const PxU32 nbElements,
	const PxU32 maxLinks,
	PxArticulationGPUAPIReadType::Enum type
)
{
	const PxU32 tid = blockDim.x * blockIdx.x + threadIdx.x;
	const PxU32 row = tid / maxLinks;
	const PxU32 link = tid % maxLinks;

	// Threads past the last requested articulation have nothing to do.
	if (row >= nbElements)
		return;

	const PxgArticulation& articulation = scDesc->articulations[index[row]];

	// A root pose query only ever touches link 0.
	const PxU32 linkLimit = (type == PxArticulationGPUAPIReadType::eROOT_GLOBAL_POSE) ? 1 : articulation.data.numLinks;
	if (link >= linkLimit)
		return;

	// actor2World = body2World * (body2Actor)^-1
	const PxTransform body2World = articulation.linkBody2Worlds[link];
	const PxTransform actor2Body = articulation.linkBody2Actors[link].getInverse();

	data[row * maxLinks + link] = body2World * actor2Body;
}

/**
\brief Copies linear/angular velocities or accelerations of articulation links into a dense PxVec3 buffer.

One thread handles one float of one PxVec3; only the x dimension of the launch is used for indexing.
Output layout: nbElements rows of maxLinks PxVec3, row i belonging to articulation index[i].
The source arrays hold Cm::UnAlignedSpatialVector entries (6 floats per link); the angular part is
read from float offset 0 and the linear part from float offset 3 of each entry.
For the eROOT_* types only link 0 is written, otherwise links [0, numLinks).

\param[in] scDesc     Core descriptor providing the articulation array.
\param[out] data      Output buffer of PxVec3.
\param[in] index      Articulation index per output row.
\param[in] nbElements Number of rows / entries in index.
\param[in] maxLinks   Row stride of the output buffer, in PxVec3.
\param[in] type       Selects the quantity to read; must be one of the values handled below.
*/
extern "C" __global__ void getArtiVelocityStates(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	PxVec3* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT index,
	const PxU32 nbElements,
	const PxU32 maxLinks,
	PxArticulationGPUAPIReadType::Enum type
)
{
	// 1 thread - 1 vec3 element.
	const PxU32 threadPerGroup = 3u;
	const PxU32 threadPerArticulation = threadPerGroup * maxLinks;

	// careful because source is Cm::UnAlignedSpatialVector: two vec3 per link.
	const PxU32 realsPerSpatialVector = threadPerGroup * 2;

	const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
	const PxU32 groupIndex = globalThreadIndex / threadPerArticulation;

	const PxU32 localIndex = globalThreadIndex % threadPerArticulation;
	const PxU32 linkIndex = localIndex / threadPerGroup;
	const PxU32 elementIndex = localIndex % threadPerGroup;	// always < threadPerGroup

	if (groupIndex < nbElements)
	{
		const PxArticulationGPUIndex articulationIndex = index[groupIndex];
		const PxgArticulation& articulation = scDesc->articulations[articulationIndex];

		PxU32 numLinks;
		const PxReal* PX_RESTRICT srcData;

		switch (type)
		{
			case PxArticulationGPUAPIReadType::eROOT_LINEAR_VELOCITY:
			{
				srcData = reinterpret_cast<const PxReal*>(articulation.motionVelocities) + 3;
				numLinks = 1;
				break;
			}
			case PxArticulationGPUAPIReadType::eROOT_ANGULAR_VELOCITY:
			{
				srcData = reinterpret_cast<const PxReal*>(articulation.motionVelocities);
				numLinks = 1;
				break;
			}
			case PxArticulationGPUAPIReadType::eLINK_LINEAR_VELOCITY:
			{
				srcData = reinterpret_cast<const PxReal*>(articulation.motionVelocities) + 3;
				numLinks = articulation.data.numLinks;
				break;
			}
			case PxArticulationGPUAPIReadType::eLINK_ANGULAR_VELOCITY:
			{
				srcData = reinterpret_cast<const PxReal*>(articulation.motionVelocities);
				numLinks = articulation.data.numLinks;
				break;
			}
			case PxArticulationGPUAPIReadType::eLINK_LINEAR_ACCELERATION:
			{
				srcData = reinterpret_cast<const PxReal*>(articulation.motionAccelerations) + 3;
				numLinks = articulation.data.numLinks;
				break;
			}
			case PxArticulationGPUAPIReadType::eLINK_ANGULAR_ACCELERATION:
			{
				srcData = reinterpret_cast<const PxReal*>(articulation.motionAccelerations);
				numLinks = articulation.data.numLinks;
				break;
			}
			default:
				// Unsupported read type. The assert is compiled out in release builds, so bail out
				// explicitly instead of using an uninitialized source pointer and link count.
				assert(0);
				return;
		}

		if (linkIndex < numLinks)
		{
			PxReal* PX_RESTRICT dstData = reinterpret_cast<PxReal*>(&data[groupIndex * maxLinks + linkIndex]);
			const PxReal* PX_RESTRICT srcDataU = &srcData[linkIndex * realsPerSpatialVector];

			dstData[elementIndex] = srcDataU[elementIndex];
		}
	}
}

/**
\brief Copies the per-link incoming joint forces of the selected articulations into a dense float buffer.

One thread handles one float of one Cm::UnAlignedSpatialVector; only the x dimension of the launch
is used for indexing.
Output layout: nbElements rows of maxLinks * 6 PxReal, row i belonging to articulation gpuIndices[i].
Links at or beyond the articulation's numLinks are left untouched.

\param[in] scDesc     Core descriptor providing the articulation array.
\param[out] data      Output buffer of PxReal.
\param[in] gpuIndices Articulation index per output row.
\param[in] nbElements Number of rows / entries in gpuIndices.
\param[in] maxLinks   Number of link slots per output row.
*/
extern "C" __global__ void getArtiSpatialForceStates(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	PxReal* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
	const PxU32 nbElements,
	const PxU32 maxLinks
)
{
	// The float-wise copy below relies on a spatial vector being exactly 6 floats.
	PX_COMPILE_TIME_ASSERT((sizeof(Cm::UnAlignedSpatialVector) / 4) == 6u);

	const PxU32 realsPerLink = 6u;
	const PxU32 realsPerRow = realsPerLink * maxLinks;

	const PxU32 tid = blockDim.x * blockIdx.x + threadIdx.x;
	const PxU32 row = tid / realsPerRow;
	const PxU32 offsetInRow = tid % realsPerRow;
	const PxU32 link = offsetInRow / realsPerLink;
	const PxU32 component = offsetInRow % realsPerLink;	// always < realsPerLink

	if (row >= nbElements)
		return;

	const PxgArticulation& articulation = scDesc->articulations[gpuIndices[row]];

	if (link >= articulation.data.numLinks)
		return;

	// View the link's spatial vector as 6 consecutive floats.
	const PxReal* PX_RESTRICT linkForce = reinterpret_cast<const PxReal*>(articulation.linkIncomingJointForces + link);
	PxReal* PX_RESTRICT out = data + row * realsPerRow + link * realsPerLink;

	out[component] = linkForce[component];
}

/**
\brief Writes one per-DOF array of the selected articulations from a dense input buffer and raises
the matching dirty flags.

One thread handles one DOF; only the x dimension of the launch is used for indexing.
Input layout: nbElements rows of maxDofs PxReal, row i belonging to articulation gpuIndices[i].
Only the first numJointDofs entries of a row are consumed. The thread with dofIndex 0 of each row
updates the articulation's gpuDirtyFlag.

\param[in] scDesc     Core descriptor providing the articulation array.
\param[in] data       Input buffer, maxDofs * nbElements PxReal.
\param[in] gpuIndices Articulation index per input row.
\param[in] nbElements Number of rows / entries in gpuIndices.
\param[in] maxDofs    Row stride of the input buffer, in PxReal.
\param[in] type       Selects which joint array is written; must be one of the eJOINT_* values handled below.
*/
extern "C" __global__ void setArtiDofStates(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	const PxReal* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
	const PxU32 nbElements,
	const PxU32 maxDofs,
	PxArticulationGPUAPIWriteType::Enum type
)
{
	// 1 thread - 1 dof.
	// input has maxDofs * PxReal * nbElements size.

	const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
	const PxU32 groupIndex = globalThreadIndex / maxDofs;
	const PxU32 dofIndex = globalThreadIndex % maxDofs;

	if (groupIndex < nbElements)
	{
		const PxArticulationGPUIndex articulationIndex = gpuIndices[groupIndex];
		PxgArticulation& articulation = scDesc->articulations[articulationIndex];

		PxReal* PX_RESTRICT dstData;

		switch (type)
		{
			case (PxArticulationGPUAPIWriteType::eJOINT_POSITION):
			{
				dstData = articulation.jointPositions;
				if (dofIndex == 0)
					articulation.data.gpuDirtyFlag |= (Dy::ArticulationDirtyFlag::eDIRTY_POSITIONS | Dy::ArticulationDirtyFlag::eNEEDS_KINEMATIC_UPDATE);
				break;
			}
			case (PxArticulationGPUAPIWriteType::eJOINT_VELOCITY):
			{
				dstData = articulation.jointVelocities;
				if (dofIndex == 0)
					articulation.data.gpuDirtyFlag |= (Dy::ArticulationDirtyFlag::eDIRTY_VELOCITIES | Dy::ArticulationDirtyFlag::eNEEDS_KINEMATIC_UPDATE);
				break;
			}
			case (PxArticulationGPUAPIWriteType::eJOINT_FORCE):
			{
				dstData = articulation.jointForce;
				if (dofIndex == 0)
					articulation.data.gpuDirtyFlag |= Dy::ArticulationDirtyFlag::eDIRTY_FORCES;
				break;
			}
			case (PxArticulationGPUAPIWriteType::eJOINT_TARGET_POSITION):
			{
				dstData = articulation.jointTargetPositions;
				if (dofIndex == 0)
					articulation.data.gpuDirtyFlag |= Dy::ArticulationDirtyFlag::eDIRTY_JOINT_TARGET_POS;
				break;
			}
			case (PxArticulationGPUAPIWriteType::eJOINT_TARGET_VELOCITY):
			{
				dstData = articulation.jointTargetVelocities;
				if (dofIndex == 0)
					articulation.data.gpuDirtyFlag |= Dy::ArticulationDirtyFlag::eDIRTY_JOINT_TARGET_VEL;
				break;
			}
			default:
				// Unsupported write type. The assert is compiled out in release builds, so bail out
				// explicitly instead of writing through an uninitialized destination pointer.
				assert(false);
				return;
		}

		const PxU32 artiDofs = articulation.data.numJointDofs;

		const PxReal* PX_RESTRICT srcData = &data[groupIndex * maxDofs];

		if (dofIndex < artiDofs)
		{
			dstData[dofIndex] = srcData[dofIndex];
		}
	}
}

/**
\brief Sets the root link pose of the selected articulations from actor-space global poses.

One thread handles one articulation; only the x dimension of the launch is used for indexing.
The incoming actor pose is converted to a body pose with the root's body2Actor transform, stored in
linkBody2Worlds[0], and the articulation is flagged eDIRTY_ROOT_TRANSFORM | eNEEDS_KINEMATIC_UPDATE.

\param[in] scDesc     Core descriptor providing the articulation array.
\param[in] data       nbElements actor poses, entry i belonging to articulation index[i].
\param[in] index      Articulation index per input entry.
\param[in] nbElements Number of entries in data / index.
*/
extern "C" __global__ void setArtiRootGlobalPoseState(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	const PxTransform* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT index,
	const PxU32 nbElements
)
{
	const PxU32 entry = blockDim.x * blockIdx.x + threadIdx.x;

	// currently a single thread does the whole transform for one articulation.
	if (entry >= nbElements)
		return;

	PxgArticulation& articulation = scDesc->articulations[index[entry]];

	articulation.data.gpuDirtyFlag |= (Dy::ArticulationDirtyFlag::eDIRTY_ROOT_TRANSFORM | Dy::ArticulationDirtyFlag::eNEEDS_KINEMATIC_UPDATE);

	const PxTransform actor2World = data[entry];

	// TODO AD: this could be outdated - how do we resolve this?
	const PxTransform rootBody2Actor = articulation.linkBody2Actors[0];

	// body2World = actor2World * body2Actor
	articulation.linkBody2Worlds[0] = actor2World * rootBody2Actor;
}

/**
\brief Sets the linear or angular velocity of the root link of the selected articulations.

One thread handles one float of one PxVec3; only the x dimension of the launch is used for indexing.
The destination is motionVelocities[0], viewed as floats: the angular part starts at float offset 0,
the linear part at float offset 3. The first thread of each articulation raises
eDIRTY_ROOT_VELOCITIES | eNEEDS_KINEMATIC_UPDATE.

\param[in] scDesc     Core descriptor providing the articulation array.
\param[in] data       nbElements velocities, entry i belonging to articulation index[i].
\param[in] index      Articulation index per input entry.
\param[in] nbElements Number of entries in data / index.
\param[in] operation  eROOT_LINEAR_VELOCITY or eROOT_ANGULAR_VELOCITY.
*/
extern "C" __global__ void setArtiRootVelocityState(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	const PxVec3* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT index,
	const PxU32 nbElements,
	const PxArticulationGPUAPIWriteType::Enum operation
)
{
	// 1 thread - 1 vec3 element.
	const PxU32 threadPerArticulation = 3u;

	const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
	const PxU32 artiIndex = globalThreadIndex / threadPerArticulation;

	const PxU32 localIndex = globalThreadIndex % threadPerArticulation;	// always < threadPerArticulation

	if (artiIndex < nbElements)
	{
		const PxArticulationGPUIndex articulationIndex = index[artiIndex];
		PxgArticulation& articulation = scDesc->articulations[articulationIndex];

		if(localIndex == 0)
			articulation.data.gpuDirtyFlag |= (Dy::ArticulationDirtyFlag::eDIRTY_ROOT_VELOCITIES | Dy::ArticulationDirtyFlag::eNEEDS_KINEMATIC_UPDATE);

		PxReal* dstData;

		switch (operation)
		{
			case PxArticulationGPUAPIWriteType::eROOT_LINEAR_VELOCITY:
			{
				dstData = reinterpret_cast<PxReal*>(articulation.motionVelocities) + 3;
				break;
			}
			case PxArticulationGPUAPIWriteType::eROOT_ANGULAR_VELOCITY:
			{
				dstData = reinterpret_cast<PxReal*>(articulation.motionVelocities);
				break;
			}
			default:
				// Unsupported operation. The assert is compiled out in release builds, so bail out
				// explicitly instead of writing through an uninitialized destination pointer.
				assert(0);
				return;
		}

		const PxReal* PX_RESTRICT srcData = reinterpret_cast<const PxReal*>(&data[artiIndex]);

		dstData[localIndex] = srcData[localIndex];
	}
}

/**
\brief Converts per-link forces into accelerations and stores them in the articulations' external accelerations.

One thread handles one float of one PxVec3; only the x dimension of the launch is used for indexing.
Input layout: nbElements rows of maxLinks PxVec3, row i belonging to articulation index[i].
Each force component is multiplied by the link's inverse mass (w of invInertiaXYZ_invMass) and
written to the first three floats of externalAccelerations[link].

\param[in] scDesc     Core descriptor providing the articulation array.
\param[in] data       Input buffer of forces.
\param[in] index      Articulation index per input row.
\param[in] nbElements Number of rows / entries in index.
\param[in] maxLinks   Row stride of the input buffer, in PxVec3.
*/
extern "C" __global__ void setArtiLinkForceState(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	const PxVec3* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT index,
	const PxU32 nbElements,
	const PxU32 maxLinks
)
{
	const PxU32 realsPerLink = 3u; // PxVec3
	const PxU32 realsPerRow = realsPerLink * maxLinks;

	const PxU32 tid = blockDim.x * blockIdx.x + threadIdx.x;
	const PxU32 row = tid / realsPerRow;
	const PxU32 offsetInRow = tid % realsPerRow;
	const PxU32 link = offsetInRow / realsPerLink;
	const PxU32 component = offsetInRow % realsPerLink;	// always < realsPerLink

	if (row >= nbElements)
		return;

	const PxgArticulation& articulation = scDesc->articulations[index[row]];

	if (link >= articulation.data.numLinks)
		return;

	const PxReal invMass = articulation.linkProps[link].invInertiaXYZ_invMass.w;

	// View the source force and the destination spatial vector as float arrays.
	const PxReal* force = reinterpret_cast<const PxReal*>(data + row * maxLinks + link);
	PxReal* accel = reinterpret_cast<PxReal*>(articulation.externalAccelerations + link);

	accel[component] = force[component] * invMass;
}

/**
\brief Converts per-link torques into angular accelerations and stores them in the articulations' external accelerations.

One thread handles one link; only the x dimension of the launch is used for indexing.
Input layout: nbElements rows of maxLinks PxVec3, row i belonging to articulation gpuIndices[i].
The torque is rotated into the link frame with the inverse of linkBody2Worlds[link].q, scaled
component-wise by the inverse inertia, rotated back, and written to externalAccelerations[link].bottom.

\param[in] scDesc     Core descriptor providing the articulation array.
\param[in] data       Input buffer of torques.
\param[in] gpuIndices Articulation index per input row.
\param[in] nbElements Number of rows / entries in gpuIndices.
\param[in] maxLinks   Row stride of the input buffer, in PxVec3.
*/
extern "C" __global__ void setArtiLinkTorqueState(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	const PxVec3* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
	const PxU32 nbElements,
	const PxU32 maxLinks
)
{
	const PxU32 tid = blockDim.x * blockIdx.x + threadIdx.x;
	const PxU32 row = tid / maxLinks;
	const PxU32 link = tid % maxLinks;

	if (row >= nbElements)
		return;

	PxgArticulation& articulation = scDesc->articulations[gpuIndices[row]];

	if (link >= articulation.data.numLinks)
		return;

	const PxQuat linkRotation = articulation.linkBody2Worlds[link].q;
	const PxVec3 torqueWorld = data[row * maxLinks + link];

	// Inverse inertia is applied in the link's local frame.
	const PxVec3 torqueLocal = linkRotation.rotateInv(torqueWorld);
	const PxVec3 invInertia = PxLoad3(articulation.linkProps[link].invInertiaXYZ_invMass);
	const PxVec3 accelLocal = invInertia.multiply(torqueLocal);

	// Back to world space for storage.
	const PxVec3 accelWorld = linkRotation.rotate(accelLocal);

	articulation.externalAccelerations[link].bottom = accelWorld;
}

// Copies user-provided tendon parameters into the selected articulations and
// raises the matching dirty flag on each of them.
//
// Thread mapping: one thread per tendon slot. The flat thread index is split
// into (entry in gpuIndices, tendon slot) using maxTendons as the stride.
// `data` is interpreted as PxGpuSpatialTendonData or PxGpuFixedTendonData,
// depending on `operation`, and read as blocks of maxTendons elements per entry.
// The thread handling tendon slot 0 sets the dirty flag, whether or not the
// articulation actually owns any tendon of that kind.
//
// AD: might make sense to split this to more threads!
extern "C" __global__ void setArtiTendonState(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	const void* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
	const PxU32 nbElements,
	const PxU32 maxTendons,
	const PxArticulationGPUAPIWriteType::Enum operation
)
{
	const PxU32 flatThreadIdx = threadIdx.x + blockDim.x * blockIdx.x;
	const PxU32 entryIdx = flatThreadIdx / maxTendons;
	const PxU32 slotIdx = flatThreadIdx % maxTendons;

	if (entryIdx >= nbElements)
		return;

	PxgArticulation& arti = scDesc->articulations[gpuIndices[entryIdx]];

	// Position of this thread's element inside the user buffer.
	const PxU32 userIdx = entryIdx * maxTendons + slotIdx;
	const bool flagOwner = (slotIdx == 0);

	if (operation == PxArticulationGPUAPIWriteType::eSPATIAL_TENDON)
	{
		// Only slots backed by an existing spatial tendon are copied.
		if (slotIdx < arti.data.numSpatialTendons)
		{
			const PxGpuSpatialTendonData* userParams = reinterpret_cast<const PxGpuSpatialTendonData*>(data);
			arti.spatialTendonParams[slotIdx] = userParams[userIdx];
		}

		if (flagOwner)
			arti.data.gpuDirtyFlag |= Dy::ArticulationDirtyFlag::eDIRTY_SPATIAL_TENDON;
	}
	else if (operation == PxArticulationGPUAPIWriteType::eFIXED_TENDON)
	{
		// Only slots backed by an existing fixed tendon are copied.
		if (slotIdx < arti.data.numFixedTendons)
		{
			const PxGpuFixedTendonData* userParams = reinterpret_cast<const PxGpuFixedTendonData*>(data);
			arti.fixedTendonParams[slotIdx] = userParams[userIdx];
		}

		if (flagOwner)
			arti.data.gpuDirtyFlag |= Dy::ArticulationDirtyFlag::eDIRTY_FIXED_TENDON;
	}
	else
	{
		// This kernel only supports the two tendon write types.
		assert(false);
	}
}

// Copies tendon parameters of the selected articulations into the user buffer.
//
// Thread mapping: one thread per tendon slot. The flat thread index is split
// into (entry in gpuIndices, tendon slot) using maxTendons as the stride.
// `data` is interpreted as PxGpuSpatialTendonData or PxGpuFixedTendonData,
// depending on `operation`, and written as blocks of maxTendons elements per
// entry. Slots beyond the articulation's tendon count are left untouched.
extern "C" __global__ void getArtiTendonState(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	void* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
	const PxU32 nbElements,
	const PxU32 maxTendons,
	const PxArticulationGPUAPIReadType::Enum operation
)
{
	const PxU32 flatThreadIdx = threadIdx.x + blockDim.x * blockIdx.x;
	const PxU32 entryIdx = flatThreadIdx / maxTendons;
	const PxU32 slotIdx = flatThreadIdx % maxTendons;

	if (entryIdx >= nbElements)
		return;

	const PxgArticulation& arti = scDesc->articulations[gpuIndices[entryIdx]];

	// Position of this thread's element inside the user buffer.
	const PxU32 userIdx = entryIdx * maxTendons + slotIdx;

	if (operation == PxArticulationGPUAPIReadType::eSPATIAL_TENDON)
	{
		if (slotIdx < arti.data.numSpatialTendons)
		{
			PxGpuSpatialTendonData* userParams = reinterpret_cast<PxGpuSpatialTendonData*>(data);
			userParams[userIdx] = arti.spatialTendonParams[slotIdx];
		}
	}
	else if (operation == PxArticulationGPUAPIReadType::eFIXED_TENDON)
	{
		if (slotIdx < arti.data.numFixedTendons)
		{
			PxGpuFixedTendonData* userParams = reinterpret_cast<PxGpuFixedTendonData*>(data);
			userParams[userIdx] = arti.fixedTendonParams[slotIdx];
		}
	}
	else
	{
		// This kernel only supports the two tendon read types.
		assert(false);
	}
}

// Copies user-provided spatial tendon attachment data into the selected
// articulations and marks them with eDIRTY_SPATIAL_TENDON_ATTACHMENT.
//
// Thread mapping: one thread per attachment slot. The flat thread index is
// split with maxTendons as the per-articulation stride, and the remainder is
// decomposed into (tendonIndex, attachmentIndex) using the maximum attachment
// count read from scDesc.
// NOTE(review): maxTendons is presumably maxSpatialTendons * maxTendonAttachments
// as set up by the launching code - confirm against the caller.
//
// `data` holds one block of maxSpatialTendons * maxTendonAttachments elements
// per entry in gpuIndices; inside a block, the attachments of tendon t start at
// offset t * maxTendonAttachments.
extern "C" __global__ void setArtiSpatialTendonAttachmentState(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	const PxGpuTendonAttachmentData* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
	const PxU32 nbElements,
	const PxU32 maxTendons
)
{
	const PxU32 maxTendonAttachments = scDesc->mMaxAttachmentPerArticulation;
	const PxU32 maxSpatialTendons = scDesc->mMaxSpatialTendonsPerArticulation;

	const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
	const PxU32 groupIndex = globalThreadIndex / maxTendons;
	const PxU32 elementIndex = globalThreadIndex % maxTendons;
	const PxU32 tendonIndex = elementIndex / maxTendonAttachments;
	const PxU32 attachmentIndex = elementIndex % maxTendonAttachments;

	if (groupIndex < nbElements)
	{
		const PxArticulationGPUIndex articulationIndex = gpuIndices[groupIndex];
		PxgArticulation& articulation = scDesc->articulations[articulationIndex];

		// A single thread per articulation raises the dirty flag.
		if (elementIndex == 0)
			articulation.data.gpuDirtyFlag |= Dy::ArticulationDirtyFlag::eDIRTY_SPATIAL_TENDON_ATTACHMENT;

		const PxU32 numSpatialTendons = articulation.data.numSpatialTendons;

		if (tendonIndex < numSpatialTendons)
		{
			PxgArticulationTendon& tendon = articulation.spatialTendons[tendonIndex];

			// Start of this tendon's attachments in the user buffer. The tendon offset
			// is required: without it every tendon would read the slots of tendon 0.
			const PxGpuTendonAttachmentData* srcData =
				&data[groupIndex * maxSpatialTendons * maxTendonAttachments + tendonIndex * maxTendonAttachments];
			PxGpuTendonAttachmentData* attachData = reinterpret_cast<PxGpuTendonAttachmentData*>(tendon.mModElements);
			const PxU32 numTendonAttachments = tendon.mNbElements;

			if (attachmentIndex < numTendonAttachments)
			{
				//PxGpuTendonAttachmentData is 32 byte, so it is copied as two uint4 values
				PX_COMPILE_TIME_ASSERT(sizeof(PxGpuTendonAttachmentData) == 32);
				const PxU32 numIteration = sizeof(PxGpuTendonAttachmentData) / sizeof(uint4);
				const uint4* tData = reinterpret_cast<const uint4*>(&srcData[attachmentIndex]);
				uint4* aData = reinterpret_cast<uint4*>(&attachData[attachmentIndex]);
				for (PxU32 i = 0; i < numIteration; ++i)
				{
					aData[i] = tData[i];
				}
			}
		}
	}
}

// Copies the spatial tendon attachment data of the selected articulations into
// the user buffer.
//
// Thread mapping: one thread per attachment slot. The flat thread index is
// split with maxTendons as the per-articulation stride, and the remainder is
// decomposed into (tendonIndex, attachmentIndex) using the maximum attachment
// count read from scDesc.
// NOTE(review): maxTendons is presumably maxSpatialTendons * maxTendonAttachments
// as set up by the launching code - confirm against the caller.
//
// `data` receives one block of maxSpatialTendons * maxTendonAttachments elements
// per entry in gpuIndices; inside a block, the attachments of tendon t start at
// offset t * maxTendonAttachments. Slots without a backing tendon/attachment are
// left untouched.
extern "C" __global__ void getArtiSpatialTendonAttachmentState(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	PxGpuTendonAttachmentData* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
	const PxU32 nbElements,
	const PxU32 maxTendons
)
{
	const PxU32 maxTendonAttachments = scDesc->mMaxAttachmentPerArticulation;
	const PxU32 maxSpatialTendons = scDesc->mMaxSpatialTendonsPerArticulation;

	const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
	const PxU32 groupIndex = globalThreadIndex / maxTendons;
	const PxU32 elementIndex = globalThreadIndex % maxTendons;
	const PxU32 tendonIndex = elementIndex / maxTendonAttachments;
	const PxU32 attachmentIndex = elementIndex % maxTendonAttachments;

	if (groupIndex < nbElements)
	{
		const PxArticulationGPUIndex articulationIndex = gpuIndices[groupIndex];
		const PxgArticulation& articulation = scDesc->articulations[articulationIndex];

		const PxU32 numSpatialTendons = articulation.data.numSpatialTendons;

		if (tendonIndex < numSpatialTendons)
		{
			const PxgArticulationTendon& tendon = articulation.spatialTendons[tendonIndex];

			// Start of this tendon's attachments in the user buffer. The tendon offset
			// is required: without it the threads of different tendons would all write
			// to the slots of tendon 0 and race with each other.
			PxGpuTendonAttachmentData* dstData =
				&data[groupIndex * maxSpatialTendons * maxTendonAttachments + tendonIndex * maxTendonAttachments];
			const PxGpuTendonAttachmentData* attachData = reinterpret_cast<const PxGpuTendonAttachmentData*>(tendon.mModElements);
			const PxU32 numTendonAttachments = tendon.mNbElements;

			if (attachmentIndex < numTendonAttachments)
			{
				//PxGpuTendonAttachmentData is 32 byte, so it is copied as two uint4 values
				PX_COMPILE_TIME_ASSERT(sizeof(PxGpuTendonAttachmentData) == 32);
				const PxU32 numIteration = sizeof(PxGpuTendonAttachmentData) / sizeof(uint4);
				uint4* tData = reinterpret_cast<uint4*>(&dstData[attachmentIndex]);
				const uint4* aData = reinterpret_cast<const uint4*>(&attachData[attachmentIndex]);
				for (PxU32 i = 0; i < numIteration; ++i)
				{
					tData[i] = aData[i];
				}
			}
		}
	}
}

// Copies user-provided fixed tendon joint coefficient data into the selected
// articulations and marks them with eDIRTY_FIXED_TENDON_JOINT.
//
// Thread mapping: one thread per tendon-joint slot. The flat thread index is
// split with maxFixedTendons * maxTendonJoints (both read from scDesc) as the
// per-articulation stride, and the remainder is decomposed into
// (tendonIndex, tendonJointIndex).
//
// `data` holds one block of maxFixedTendons * maxTendonJoints elements per
// entry in gpuIndices; inside a block, the joints of tendon t start at offset
// t * maxTendonJoints.
extern "C" __global__ void setArtiFixedTendonJointState(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	const PxGpuTendonJointCoefficientData* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
	const PxU32 nbElements
)
{
	const PxU32 maxFixedTendons = scDesc->mMaxFixedTendonsPerArticulation;
	const PxU32 maxTendonJoints = scDesc->mMaxTendonJointPerArticulation;
	const PxU32 maxTendons = maxFixedTendons * maxTendonJoints;

	const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
	const PxU32 groupIndex = globalThreadIndex / maxTendons;
	const PxU32 elementIndex = globalThreadIndex % maxTendons;
	const PxU32 tendonIndex = elementIndex / maxTendonJoints;
	const PxU32 tendonJointIndex = elementIndex % maxTendonJoints;

	if (groupIndex < nbElements)
	{
		const PxArticulationGPUIndex articulationIndex = gpuIndices[groupIndex];
		PxgArticulation& articulation = scDesc->articulations[articulationIndex];

		// A single thread per articulation raises the dirty flag.
		if (elementIndex == 0)
			articulation.data.gpuDirtyFlag |= Dy::ArticulationDirtyFlag::eDIRTY_FIXED_TENDON_JOINT;

		PxgArticulationTendon* fixedTendons = articulation.fixedTendons;
		const PxU32 numFixedTendons = articulation.data.numFixedTendons;

		if (tendonIndex < numFixedTendons)
		{
			PxgArticulationTendon& tendon = fixedTendons[tendonIndex];

			// Start of this tendon's joints in the user buffer. The tendon offset is
			// required: without it every tendon would read the slots of tendon 0.
			const PxGpuTendonJointCoefficientData* srcData =
				&data[groupIndex * maxFixedTendons * maxTendonJoints + tendonIndex * maxTendonJoints];

			PxGpuTendonJointCoefficientData* coefficientData = reinterpret_cast<PxGpuTendonJointCoefficientData*>(tendon.mModElements);
			const PxU32 numTendonJoints = tendon.mNbElements;

			if (tendonJointIndex < numTendonJoints)
			{
				//PxGpuTendonJointCoefficientData is 16 byte, so it is copied as a single uint4
				PX_COMPILE_TIME_ASSERT(sizeof(PxGpuTendonJointCoefficientData) == 16);
				const uint4 tData = reinterpret_cast<const uint4&>(srcData[tendonJointIndex]);
				uint4& coefData = reinterpret_cast<uint4&>(coefficientData[tendonJointIndex]);
				coefData = tData;
			}
		}
	}
}

// Copies the fixed tendon joint coefficient data of the selected articulations
// into the user buffer.
//
// Thread mapping: one thread per tendon-joint slot. The flat thread index is
// split with maxFixedTendons * maxTendonJoints (both read from scDesc) as the
// per-articulation stride, and the remainder is decomposed into
// (tendonIndex, tendonJointIndex).
//
// `data` receives one block of maxFixedTendons * maxTendonJoints elements per
// entry in gpuIndices; inside a block, the joints of tendon t start at offset
// t * maxTendonJoints. Slots without a backing tendon/joint are left untouched.
extern "C" __global__ void getArtiFixedTendonJointState(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	PxGpuTendonJointCoefficientData* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
	const PxU32 nbElements
)
{
	const PxU32 maxFixedTendons = scDesc->mMaxFixedTendonsPerArticulation;
	const PxU32 maxTendonJoints = scDesc->mMaxTendonJointPerArticulation;
	const PxU32 maxTendons = maxFixedTendons * maxTendonJoints;

	const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
	const PxU32 groupIndex = globalThreadIndex / maxTendons;
	const PxU32 elementIndex = globalThreadIndex % maxTendons;
	const PxU32 tendonIndex = elementIndex / maxTendonJoints;
	const PxU32 tendonJointIndex = elementIndex % maxTendonJoints;

	if (groupIndex < nbElements)
	{
		const PxArticulationGPUIndex articulationIndex = gpuIndices[groupIndex];
		const PxgArticulation& articulation = scDesc->articulations[articulationIndex];

		const PxgArticulationTendon* fixedTendons = articulation.fixedTendons;
		const PxU32 numFixedTendons = articulation.data.numFixedTendons;

		if (tendonIndex < numFixedTendons)
		{
			const PxgArticulationTendon& tendon = fixedTendons[tendonIndex];

			// Start of this tendon's joints in the user buffer. The tendon offset is
			// required: without it the threads of different tendons would all write to
			// the slots of tendon 0 and race with each other.
			PxGpuTendonJointCoefficientData* dstData =
				&data[groupIndex * maxFixedTendons * maxTendonJoints + tendonIndex * maxTendonJoints];

			const PxGpuTendonJointCoefficientData* coefficientData = reinterpret_cast<const PxGpuTendonJointCoefficientData*>(tendon.mModElements);
			const PxU32 numTendonJoints = tendon.mNbElements;

			if (tendonJointIndex < numTendonJoints)
			{
				//PxGpuTendonJointCoefficientData is 16 byte, so it is copied as a single uint4
				PX_COMPILE_TIME_ASSERT(sizeof(PxGpuTendonJointCoefficientData) == 16);
				uint4& tData = reinterpret_cast<uint4&>(dstData[tendonJointIndex]);
				const uint4& coefData = reinterpret_cast<const uint4&>(coefficientData[tendonJointIndex]);
				tData = coefData;
			}
		}
	}
}