Files
XCEngine/engine/third_party/physx/source/gpuarticulation/src/CUDA/articulationDirectGpuApi.cu

1111 lines
39 KiB
Plaintext

// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "PxDirectGPUAPI.h"
#include "foundation/PxSimpleTypes.h"
#include "foundation/PxVec3.h"
#include "foundation/PxTransform.h"
#include "foundation/PxMath.h"
#include "PxArticulationFlag.h"
#include "PxArticulationTendonData.h"
#include "PxgArticulation.h"
#include "PxgArticulationLink.h"
#include "PxgArticulationCoreDesc.h"
#include "PxgShapeSim.h"
#include "PxsTransformCache.h"
#include "DyArticulationCore.h"
#include "DyArticulationJointCore.h"
#include "DyFeatherstoneArticulation.h"
#include "DyFeatherstoneArticulationUtils.h"
#include "DyFeatherstoneArticulationJointData.h"
#include "CmSpatialVector.h"
#include "utils.cuh"
#include "reduction.cuh"
#include "updateCacheAndBound.cuh"
#include <assert.h>
using namespace physx;
using namespace Dy;
// Intentionally empty host-side entry point.
// NOTE(review): presumably referenced from host code so that the linker keeps the kernels of this
// translation unit - confirm against the caller.
extern "C" __host__ void initArticulationKernels3()
{
}
// Warp-cooperative forward-kinematics pass for a single articulation.
//
// The work is split into phases that are separated by __syncwarp():
//  1. The lanes stride over the joints (links 1..numLinks-1) and write, per joint, the parent-to-child
//     rotation into articulation.tempParentToChilds and the link offset into articulation.tempRs.
//  2. Lane 0 walks the links in index order and derives each link's body2World from its parent's
//     body2World. If zeroSimOutput is set it also recomputes the link's spatial velocity from the
//     parent's velocity and the joint velocities (no joint-velocity clamping is done here).
//  3. If zeroSimOutput is set, the lanes zero motionAccelerations, linkIncomingJointForces and
//     jointAccelerations.
//  4. Unless numShapes is 0, the lanes stride over the links and refresh the transform cache and the
//     bounds of every shape whose rigid node index matches the link.
//
// zeroSimOutput      compile-time switch for the velocity update (phase 2) and the zeroing (phase 3).
// articulation       articulation to update; its temp buffers, link poses and (optionally) velocities,
//                    accelerations and incoming joint forces are written.
// threadIndexInWarp  lane index of the calling thread, used as the start offset of all strided loops.
// gShapeSimPool      shape sims, indexed by the values stored in gShapeIndices.
// gConvexShapes      forwarded to updateCacheAndBound.
// gTransformCache    forwarded to updateCacheAndBound.
// gRigidNodeIndices  sorted rigid body node indices, numShapes entries.
// gShapeIndices      shape index belonging to each entry of gRigidNodeIndices; 0xFFffFFff entries are skipped.
// numShapes          number of entries in gRigidNodeIndices / gShapeIndices. 0 skips phase 4.
// bounds             forwarded to updateCacheAndBound.
//
// Must be called by all lanes of the warp with the same articulation because it uses __syncwarp().
template<bool zeroSimOutput>
static PX_FORCE_INLINE __device__ void updateKinematicInternal(
    const PxgArticulation& articulation,
    const PxU32 threadIndexInWarp,
    const PxgShapeSim* PX_RESTRICT gShapeSimPool,
    const PxgShape* PX_RESTRICT gConvexShapes,
    PxsCachedTransform* PX_RESTRICT gTransformCache,
    const PxNodeIndex* PX_RESTRICT gRigidNodeIndices,//sorted rigid body node index
    const PxU32* PX_RESTRICT gShapeIndices,//the corresponding shape index
    const PxU32 numShapes,
    PxBounds3* PX_RESTRICT bounds)
{
    const PxgArticulationData artiData = articulation.data;
    const PxU32 linkCount = artiData.numLinks;
    const PxU32 bodySimIndex = artiData.bodySimIndex;

    PxQuat* tempNewParentToChilds = articulation.tempParentToChilds;
    PxVec3* tempRs = articulation.tempRs;
    PxTransform* body2Worlds = articulation.linkBody2Worlds;
    ArticulationJointCore* joints = articulation.joints;
    ArticulationJointCoreData* jointData = articulation.jointData;
    const PxQuat* const PX_RESTRICT relativeQuats = articulation.relativeQuat;
    const PxU32* const PX_RESTRICT parents = articulation.parents;
    const SpatialSubspaceMatrix* const PX_RESTRICT motionMatrix = articulation.motionMatrix;

    // Phase 1: each thread deals with a joint. Link 0 (the root) has no incoming joint and is skipped.
    for (PxU32 linkID = 1 + threadIndexInWarp; linkID < linkCount; linkID += WARP_SIZE)
    {
        const ArticulationJointCoreData& jointDatum = jointData[linkID];
        const ArticulationJointCore& joint = joints[linkID];
        const PxReal* jPosition = &articulation.jointPositions[jointDatum.jointOffset];

        PxQuat& newParentToChild = tempNewParentToChilds[linkID];
        PxVec3& r = tempRs[linkID];

        const PxVec3 childOffset = -joint.childPose.p;
        const PxVec3 parentOffset = joint.parentPose.p;
        const PxQuat relativeQuat = relativeQuats[linkID];

        switch (joint.jointType)
        {
            case PxArticulationJointType::ePRISMATIC:
            {
                // rotation is fixed, the joint position translates along the joint axis
                newParentToChild = relativeQuat;
                const PxVec3& u = motionMatrix[linkID][0].bottom;
                const PxVec3 e = newParentToChild.rotate(parentOffset);
                const PxVec3 d = childOffset;
                r = e + d + u * jPosition[0];
                break;
            }
            case PxArticulationJointType::eREVOLUTE:
            case PxArticulationJointType::eREVOLUTE_UNWRAPPED:
            {
                const PxVec3& u = motionMatrix[linkID][0].top;
                PxQuat jointRotation = PxQuat(-jPosition[0], u);
                if (jointRotation.w < 0) //shortest angle.
                    jointRotation = -jointRotation;

                /*printf("LinkID %i: jointRotation = (%f, %f, %f, %f), joint->relativeQuat = (%f, %f, %f,%f)\n", linkID, jointRotation.x, jointRotation.y,
                jointRotation.z, jointRotation.w, joint.relativeQuat.x, joint.relativeQuat.y, joint.relativeQuat.z, joint.relativeQuat.w);*/

                newParentToChild = (jointRotation * relativeQuat).getNormalized();
                const PxVec3 e = newParentToChild.rotate(parentOffset);
                const PxVec3 d = childOffset;
                r = e + d;
                assert(r.isFinite());
                break;
            }
            case PxArticulationJointType::eSPHERICAL:
            {
                // accumulate the rotation vector over all dofs of the joint
                PxVec3 ang(0.f);
                for (PxU32 dof = 0; dof < jointDatum.nbDof; ++dof)
                {
                    ang += motionMatrix[linkID][dof].top * -jPosition[dof];
                }
                // normalize() returns the length, i.e. the rotation angle; tiny angles map to identity
                PxReal angle = ang.normalize();
                PxQuat jointRotation = angle < 1e-10f ? PxQuat(PxIdentity) : PxQuat(angle, ang);
                if (jointRotation.w < 0.f)
                    jointRotation = -jointRotation;

                newParentToChild = (jointRotation * relativeQuat).getNormalized();
                const PxVec3 e = newParentToChild.rotate(parentOffset);
                const PxVec3 d = childOffset;
                r = e + d;
                break;
            }
            case PxArticulationJointType::eFIX:
            {
                //this is fix joint so joint don't have velocity
                newParentToChild = relativeQuat;
                const PxVec3 e = newParentToChild.rotate(parentOffset);
                const PxVec3 d = childOffset;
                r = e + d;
                break;
            }
            default:
                // any other joint type leaves the temp entries of this link untouched
                break;
        }
    }

    // make the temp buffers written above visible to lane 0
    __syncwarp();

    // Phase 2: serial link walk. Processing linkID reads body2Worlds[parent], so this relies on the
    // parent having been written earlier in the walk.
    // NOTE(review): i.e. presumably parents[linkID] < linkID - confirm.
    if (threadIndexInWarp == 0)
    {
        for (PxU32 linkID = 1; linkID < linkCount; ++linkID)
        {
            const PxU32 parent = parents[linkID];
            const PxTransform pBody2World = body2Worlds[parent];

            PxQuat& newParentToChild = tempNewParentToChilds[linkID];
            PxVec3& r = tempRs[linkID];
            PxTransform& body2World = body2Worlds[linkID];

            body2World.q = (pBody2World.q * newParentToChild.getConjugate()).getNormalized();
            body2World.p = pBody2World.p + body2World.q.rotate(r);

            // we do NOT calculate updated link velocities if this is the automatic pre-sim updateKinematic call, because we will immediately do it again because we also need to clamp the joint velocities.
            // if this was a velocity-only update we already skip in the parent function, so we only end up here if positions are dirty.
            if (zeroSimOutput)
            {
                // link velocity update - unfortunately also dependent on parent position and velocity.
                Cm::UnAlignedSpatialVector parentVel = articulation.motionVelocities[parent];
                const PxVec3 c2p = body2World.p - pBody2World.p;
                Cm::UnAlignedSpatialVector linkVelocity = FeatherstoneArticulation::translateSpatialVector(-c2p, parentVel);

                // AD unfortunately this is more-or-less the same code as in computeLinkVelocities, minus the maxJointVel clamping.
                const ArticulationJointCoreData& jointDatum = jointData[linkID];
                const PxReal* jVelocity = &articulation.jointVelocities[jointDatum.jointOffset];
                for (PxU32 ind = 0; ind < jointDatum.nbDof; ++ind)
                {
                    const Cm::UnAlignedSpatialVector worldCol = motionMatrix[linkID][ind].rotate(body2World);
                    const PxReal jVel = jVelocity[ind];
                    linkVelocity += worldCol * jVel;
                }
                articulation.motionVelocities[linkID] = linkVelocity;
            }
        }
    }

    // make the link poses written by lane 0 visible to the whole warp
    __syncwarp();

    // Phase 3: zero the simulation outputs.
    if (zeroSimOutput) // AD: we could potentially ingest this into the link traversal++, but let's not get ahead of ourselves.
    {
        const PxU32 numDofs = articulation.data.numJointDofs;

        // the spatial-vector arrays are cleared one PxReal at a time
        PxReal* PX_RESTRICT linkAccelData = reinterpret_cast<PxReal*>(articulation.motionAccelerations);
        PxReal* PX_RESTRICT linkIncomingJointForceData = reinterpret_cast<PxReal*>(articulation.linkIncomingJointForces);

        const PxU32 numRealsForSpatialVector = sizeof(Cm::UnAlignedSpatialVector) / 4;
        const PxU32 linkCountReal = articulation.data.numLinks * numRealsForSpatialVector;

        // a single loop covers both ranges, so it runs up to the larger of the two
        const PxU32 zeroCount = PxMax(numDofs, linkCountReal);
        for (PxU32 index = threadIndexInWarp; index < zeroCount; index += WARP_SIZE)
        {
            if (index < linkCountReal)
            {
                linkAccelData[index] = 0.f;
                linkIncomingJointForceData[index] = 0.f;
            }
            if (index < numDofs)
            {
                articulation.jointAccelerations[index] = 0.f;
            }
        }
    }

    if (numShapes == 0)
    {
        // guard against no actors have shapes in scene, in which case gRigidNodeIndices, etc. would have zero elements
        return;
    }

    __syncwarp();

    // Phase 4: each thread deals with a link
    const PxTransform* PX_RESTRICT linkBody2Actors = articulation.linkBody2Actors;
    for (PxU32 linkID = threadIndexInWarp; linkID < linkCount; linkID += WARP_SIZE)
    {
        const PxNodeIndex linkNodeIndex(bodySimIndex, linkID);
        const PxTransform body2World = body2Worlds[linkID];
        const PxTransform body2Actor = linkBody2Actors[linkID];

        // locate an entry that matches the link's node index
        PxU32 pos = binarySearch<PxNodeIndex>(gRigidNodeIndices, numShapes, linkNodeIndex);

        // go backward through the sorted rigid node index array which has an entry for each
        // shape of the link, and update the shape if it belongs to the link, i.e. the rigid
        // node index matches the link node index.
        // pos-- at index 0 wraps to 0xFFffFFff, which also terminates the loop.
        while (pos != 0xFFffFFff && gRigidNodeIndices[pos] == linkNodeIndex)
        {
            const PxU32 shapeIndex = gShapeIndices[pos];
            if (shapeIndex != 0xFFffFFff)
            {
                const PxgShapeSim& shapeSim = gShapeSimPool[shapeIndex];
                const PxTransform absPos = getAbsPose(body2World, shapeSim.mTransform, body2Actor);

                //update broad phase bound, transform cache
                updateCacheAndBound(absPos, shapeSim, shapeIndex, gTransformCache, bounds, gConvexShapes, true);
            }
            pos--;
        }
    }
}
// Kinematic update kernel. Runs after the user wrote GPU buffers that set
// Dy::ArticulationDirtyFlag::eNEEDS_KINEMATIC_UPDATE (root transform / joint position / velocity writes).
//
// Launch layout: blocks of 32x2 threads, 1 warp (one threadIdx.y row) deals with 1 articulation at a time
// and the warps stride over the articulation list.
//
// scDesc         articulation core descriptor.
// gShapeSimPool, gConvexShapes, gTransformCache, gRigidNodeIndices, gShapeIndices, numShapes, bounds
//                forwarded unchanged to updateKinematicInternal.
// gpuIndices     articulation indices to process. NULL: process all scDesc->nbArticulations articulations.
// nbElements     number of entries in gpuIndices; can be 0 in combination with a NULL index buffer.
// zeroSimOutput  selects updateKinematicInternal<true> (also updates link velocities and zeroes the
//                simulation outputs). When false, articulations without dirty root transform / joint
//                positions are skipped.
//
// NOTE(review): all 32 lanes of a warp read and clear gpuDirtyFlag non-atomically. This relies on every lane
// having read the flag before any lane's write becomes visible - confirm this is acceptable on architectures
// with independent thread scheduling, since updateKinematicInternal uses __syncwarp().
extern "C" __global__ void artiUpdateKinematic(
    const PxgArticulationCoreDesc* const PX_RESTRICT scDesc,
    const PxgShapeSim* PX_RESTRICT gShapeSimPool,
    const PxgShape* PX_RESTRICT gConvexShapes,
    PxsCachedTransform* PX_RESTRICT gTransformCache,
    const PxNodeIndex* PX_RESTRICT gRigidNodeIndices,
    const PxU32* PX_RESTRICT gShapeIndices,
    const PxU32 numShapes,
    PxBounds3* PX_RESTRICT bounds,
    const PxArticulationGPUIndex* PX_RESTRICT gpuIndices, // NULL: process all the dirty articulations
    const PxU32 nbElements, // can be 0 in combination with NULL index buffer.
    bool zeroSimOutput)
{
    // we launch blocks of 32x2 threads. 1 warp deals with 1 articulation.
    assert(blockDim.x == 32);
    assert(blockDim.y == 2);

    const PxU32 lane = threadIdx.x & (WARP_SIZE - 1);
    const PxU32 firstWork = blockIdx.x * blockDim.y + threadIdx.y;
    const PxU32 workStride = gridDim.x * blockDim.y;

    const bool useIndexList = (gpuIndices != NULL);
    const PxU32 workCount = useIndexList ? nbElements : scDesc->nbArticulations;

    for (PxU32 work = firstWork; work < workCount; work += workStride)
    {
        const PxU32 artiId = useIndexList ? gpuIndices[work] : work;
        PxgArticulation& arti = scDesc->articulations[artiId];

        // nothing was written to this articulation since the last update
        if (!(arti.data.gpuDirtyFlag & ArticulationDirtyFlag::eNEEDS_KINEMATIC_UPDATE))
            continue;

        // reset while in cache.
        arti.data.gpuDirtyFlag &= ~(ArticulationDirtyFlag::eNEEDS_KINEMATIC_UPDATE);

        const PxU32 poseDirtyBits = (ArticulationDirtyFlag::eDIRTY_ROOT_TRANSFORM | ArticulationDirtyFlag::eDIRTY_POSITIONS);
        const bool posesChanged = arti.data.gpuDirtyFlag & poseDirtyBits;

        if (zeroSimOutput)
        {
            updateKinematicInternal<true>(arti, lane, gShapeSimPool, gConvexShapes,
                gTransformCache, gRigidNodeIndices, gShapeIndices, numShapes, bounds);
        }
        else if (posesChanged)
        {
            // if we only have dirty velocities and this is the pre-sim automatic call, we can skip that articulation.
            // The link velocities will be updated in computeLinkVelocities anyway.
            updateKinematicInternal<false>(arti, lane, gShapeSimPool, gConvexShapes,
                gTransformCache, gRigidNodeIndices, gShapeIndices, numShapes, bounds);
        }
    }
}
// Copies one per-dof joint quantity of the selected articulations into a dense output buffer.
//
// Thread mapping: 1 thread - 1 dof slot. The flat thread index is split into groupIndex (position in the
// `index` list) and dofIndex (slot inside that articulation's output block).
//
// scDesc      articulation core descriptor; scDesc->articulations is indexed with the values of `index`.
// data        output buffer. Block g starts at data[g * maxDofs]; only the first numJointDofs entries of a
//             block are written, the remaining slots are left untouched.
// index       articulation indices to read, nbElements entries.
// nbElements  number of entries in `index`.
// maxDofs     number of PxReal slots per articulation block in `data`. Used as a divisor, so it must not be 0.
// type        joint quantity to read: position, velocity, acceleration, force, target velocity or target position.
//
// Any other `type` asserts and returns without reading or writing anything.
extern "C" __global__ void getArtiDofStates(
    const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
    PxReal* PX_RESTRICT data,
    const PxArticulationGPUIndex* PX_RESTRICT index,
    const PxU32 nbElements,
    const PxU32 maxDofs,
    PxArticulationGPUAPIReadType::Enum type
)
{
    // 1 thread - 1 dof.
    // output has maxDofs * PxReal * nbElements size.
    const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
    const PxU32 groupIndex = globalThreadIndex / maxDofs;
    const PxU32 dofIndex = globalThreadIndex % maxDofs;

    if (groupIndex < nbElements)
    {
        const PxArticulationGPUIndex articulationIndex = index[groupIndex];
        const PxgArticulation& articulation = scDesc->articulations[articulationIndex];

        PxReal* PX_RESTRICT dstData = &data[groupIndex * maxDofs];
        const PxReal* PX_RESTRICT srcData;

        switch (type)
        {
            case PxArticulationGPUAPIReadType::eJOINT_POSITION:
            {
                srcData = articulation.jointPositions;
                break;
            }
            case PxArticulationGPUAPIReadType::eJOINT_VELOCITY:
            {
                srcData = articulation.jointVelocities;
                break;
            }
            case PxArticulationGPUAPIReadType::eJOINT_ACCELERATION:
            {
                srcData = articulation.jointAccelerations;
                break;
            }
            case PxArticulationGPUAPIReadType::eJOINT_FORCE:
            {
                srcData = articulation.jointForce;
                break;
            }
            case PxArticulationGPUAPIReadType::eJOINT_TARGET_VELOCITY:
            {
                srcData = articulation.jointTargetVelocities;
                break;
            }
            case PxArticulationGPUAPIReadType::eJOINT_TARGET_POSITION:
            {
                srcData = articulation.jointTargetPositions;
                break;
            }
            default:
            {
                // Return explicitly: when asserts are compiled out, falling through would
                // dereference the unset srcData pointer below.
                assert(0);
                return;
            }
        }

        const PxU32 artiDofs = articulation.data.numJointDofs;
        if (dofIndex < artiDofs)
        {
            dstData[dofIndex] = srcData[dofIndex];
        }
    }
}
// Writes link poses of the selected articulations into a dense output buffer.
//
// Thread mapping: 1 thread - 1 link - 1 transform.
//
// scDesc      articulation core descriptor; scDesc->articulations is indexed with the values of `index`.
// data        output buffer. Block g starts at data[g * maxLinks]; slots at or beyond the number of written
//             links are left untouched.
// index       articulation indices to read, nbElements entries.
// nbElements  number of entries in `index`.
// maxLinks    number of PxTransform slots per articulation block in `data`. Used as a divisor.
// type        eROOT_GLOBAL_POSE writes only link 0; every other value writes all links of the articulation.
//
// The value written is linkBody2Worlds[link] * inverse(linkBody2Actors[link]).
extern "C" __global__ void getArtiTransformStates(
    const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
    PxTransform* PX_RESTRICT data,
    const PxArticulationGPUIndex* PX_RESTRICT index,
    const PxU32 nbElements,
    const PxU32 maxLinks,
    PxArticulationGPUAPIReadType::Enum type
)
{
    const PxU32 flatThreadId = blockIdx.x * blockDim.x + threadIdx.x;
    const PxU32 batchSlot = flatThreadId / maxLinks;
    const PxU32 linkSlot = flatThreadId % maxLinks;

    // threads beyond the requested articulations have nothing to do
    if (batchSlot >= nbElements)
        return;

    const PxgArticulation& arti = scDesc->articulations[index[batchSlot]];

    // root-pose queries only touch the first link
    const bool rootOnly = (type == PxArticulationGPUAPIReadType::eROOT_GLOBAL_POSE);
    const PxU32 linksToWrite = rootOnly ? 1 : arti.data.numLinks;
    if (linkSlot >= linksToWrite)
        return;

    const PxTransform bodyToActor = arti.linkBody2Actors[linkSlot];
    const PxTransform bodyToWorld = arti.linkBody2Worlds[linkSlot];

    data[batchSlot * maxLinks + linkSlot] = bodyToWorld * bodyToActor.getInverse();
}
// Copies the linear or angular part of link velocities / accelerations of the selected articulations
// into a dense PxVec3 output buffer.
//
// Thread mapping: 1 thread - 1 float of a PxVec3, i.e. 3 threads per link and 3 * maxLinks threads per articulation.
//
// scDesc      articulation core descriptor; scDesc->articulations is indexed with the values of `index`.
// data        output buffer. Block g starts at data[g * maxLinks]; slots at or beyond the number of written
//             links are left untouched.
// index       articulation indices to read, nbElements entries.
// nbElements  number of entries in `index`.
// maxLinks    number of PxVec3 slots per articulation block in `data`. Used as a divisor.
// type        eROOT_* values read link 0 only, eLINK_* values read all links of the articulation.
//
// The source arrays hold Cm::UnAlignedSpatialVector (6 floats per link). The angular values are taken
// from the first three floats of an entry and the linear values from the last three.
// Any `type` not handled below asserts and returns without reading or writing anything.
extern "C" __global__ void getArtiVelocityStates(
    const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
    PxVec3* PX_RESTRICT data,
    const PxArticulationGPUIndex* PX_RESTRICT index,
    const PxU32 nbElements,
    const PxU32 maxLinks,
    PxArticulationGPUAPIReadType::Enum type
)
{
    // 1 thread - 1 vec3 element.
    const PxU32 threadPerGroup = 3u;
    const PxU32 threadPerArticulation = threadPerGroup * maxLinks;
    const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
    const PxU32 groupIndex = globalThreadIndex / threadPerArticulation;
    const PxU32 localIndex = globalThreadIndex % threadPerArticulation;
    const PxU32 linkIndex = localIndex / threadPerGroup;
    const PxU32 elementIndex = localIndex % threadPerGroup; // always < threadPerGroup

    if (groupIndex < nbElements)
    {
        const PxArticulationGPUIndex articulationIndex = index[groupIndex];
        const PxgArticulation& articulation = scDesc->articulations[articulationIndex];

        PxU32 numLinks;
        const PxReal* PX_RESTRICT srcData;

        switch (type)
        {
            case PxArticulationGPUAPIReadType::eROOT_LINEAR_VELOCITY:
            {
                srcData = reinterpret_cast<const PxReal*>(articulation.motionVelocities) + 3;
                numLinks = 1;
                break;
            }
            case PxArticulationGPUAPIReadType::eROOT_ANGULAR_VELOCITY:
            {
                srcData = reinterpret_cast<const PxReal*>(articulation.motionVelocities);
                numLinks = 1;
                break;
            }
            case PxArticulationGPUAPIReadType::eLINK_LINEAR_VELOCITY:
            {
                srcData = reinterpret_cast<const PxReal*>(articulation.motionVelocities) + 3;
                numLinks = articulation.data.numLinks;
                break;
            }
            case PxArticulationGPUAPIReadType::eLINK_ANGULAR_VELOCITY:
            {
                srcData = reinterpret_cast<const PxReal*>(articulation.motionVelocities);
                numLinks = articulation.data.numLinks;
                break;
            }
            case PxArticulationGPUAPIReadType::eLINK_LINEAR_ACCELERATION:
            {
                srcData = reinterpret_cast<const PxReal*>(articulation.motionAccelerations) + 3;
                numLinks = articulation.data.numLinks;
                break;
            }
            case PxArticulationGPUAPIReadType::eLINK_ANGULAR_ACCELERATION:
            {
                srcData = reinterpret_cast<const PxReal*>(articulation.motionAccelerations);
                numLinks = articulation.data.numLinks;
                break;
            }
            default:
            {
                // Return explicitly: when asserts are compiled out, falling through would use
                // the unset srcData / numLinks below.
                assert(0);
                return;
            }
        }

        if (linkIndex < numLinks)
        {
            PxReal* PX_RESTRICT dstData = reinterpret_cast<PxReal*>(&data[groupIndex * maxLinks + linkIndex]);
            // careful because source is Cm::UnAlignedSpatialVector: the per-link stride is 6 floats, not 3
            const PxReal* PX_RESTRICT srcDataU = &srcData[linkIndex * threadPerGroup * 2];

            dstData[elementIndex] = srcDataU[elementIndex];
        }
    }
}
// Copies the link incoming joint forces of the selected articulations into a dense float buffer.
//
// Thread mapping: 1 thread - 1 float of a Cm::UnAlignedSpatialVector, i.e. 6 threads per link and
// 6 * maxLinks threads per articulation.
//
// scDesc      articulation core descriptor; scDesc->articulations is indexed with the values of gpuIndices.
// data        output buffer of PxReal. Block g starts at data[g * 6 * maxLinks] and holds 6 PxReal per link;
//             slots of links at or beyond the articulation's numLinks are left untouched.
// gpuIndices  articulation indices to read, nbElements entries.
// nbElements  number of entries in gpuIndices.
// maxLinks    number of link slots per articulation block in `data`. Used as a divisor.
extern "C" __global__ void getArtiSpatialForceStates(
    const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
    PxReal* PX_RESTRICT data,
    const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
    const PxU32 nbElements,
    const PxU32 maxLinks
)
{
    // the float-wise copy below depends on a spatial vector being exactly 6 floats
    PX_COMPILE_TIME_ASSERT((sizeof(Cm::UnAlignedSpatialVector) / 4) == 6u);

    const PxU32 floatsPerLink = 6u;
    const PxU32 floatsPerArticulation = floatsPerLink * maxLinks;

    const PxU32 flatThreadId = blockIdx.x * blockDim.x + threadIdx.x;
    const PxU32 batchSlot = flatThreadId / floatsPerArticulation;
    const PxU32 floatSlot = flatThreadId % floatsPerArticulation;
    const PxU32 linkSlot = floatSlot / floatsPerLink;
    const PxU32 component = floatSlot % floatsPerLink;

    if (batchSlot >= nbElements)
        return;

    const PxgArticulation& arti = scDesc->articulations[gpuIndices[batchSlot]];
    const Cm::UnAlignedSpatialVector* PX_RESTRICT forces = arti.linkIncomingJointForces;

    if (linkSlot >= arti.data.numLinks)
        return;

    // view the link's spatial vector as 6 consecutive floats and copy this thread's component
    const PxReal* PX_RESTRICT linkForce = reinterpret_cast<const PxReal*>(&forces[linkSlot]);
    data[batchSlot * floatsPerArticulation + linkSlot * floatsPerLink + component] = linkForce[component];
}
// Writes one per-dof joint quantity from a dense input buffer into the selected articulations and
// raises the matching dirty flags.
//
// Thread mapping: 1 thread - 1 dof slot. The flat thread index is split into groupIndex (position in the
// gpuIndices list) and dofIndex (slot inside that articulation's input block).
//
// scDesc      articulation core descriptor; scDesc->articulations is indexed with the values of gpuIndices.
// data        input buffer. Block g starts at data[g * maxDofs]; only the first numJointDofs entries of a
//             block are consumed.
// gpuIndices  articulation indices to write, nbElements entries.
// nbElements  number of entries in gpuIndices.
// maxDofs     number of PxReal slots per articulation block in `data`. Used as a divisor, so it must not be 0.
// type        joint quantity to write: position, velocity, force, target position or target velocity.
//
// Dirty flags are set by the thread with dofIndex == 0 only, so each articulation's flag is updated by a
// single thread of this launch. Position and velocity writes additionally request a kinematic update.
// Any other `type` asserts and returns without modifying the articulation.
extern "C" __global__ void setArtiDofStates(
    const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
    const PxReal* PX_RESTRICT data,
    const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
    const PxU32 nbElements,
    const PxU32 maxDofs,
    PxArticulationGPUAPIWriteType::Enum type
)
{
    // 1 thread - 1 dof.
    // input has maxDofs * PxReal * nbElements size.
    const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
    const PxU32 groupIndex = globalThreadIndex / maxDofs;
    const PxU32 dofIndex = globalThreadIndex % maxDofs;

    if (groupIndex < nbElements)
    {
        const PxArticulationGPUIndex articulationIndex = gpuIndices[groupIndex];
        PxgArticulation& articulation = scDesc->articulations[articulationIndex];

        PxReal* PX_RESTRICT dstData;

        switch (type)
        {
            case (PxArticulationGPUAPIWriteType::eJOINT_POSITION):
            {
                dstData = articulation.jointPositions;
                if (dofIndex == 0)
                    articulation.data.gpuDirtyFlag |= (Dy::ArticulationDirtyFlag::eDIRTY_POSITIONS | Dy::ArticulationDirtyFlag::eNEEDS_KINEMATIC_UPDATE);
                break;
            }
            case (PxArticulationGPUAPIWriteType::eJOINT_VELOCITY):
            {
                dstData = articulation.jointVelocities;
                if (dofIndex == 0)
                    articulation.data.gpuDirtyFlag |= (Dy::ArticulationDirtyFlag::eDIRTY_VELOCITIES | Dy::ArticulationDirtyFlag::eNEEDS_KINEMATIC_UPDATE);
                break;
            }
            case (PxArticulationGPUAPIWriteType::eJOINT_FORCE):
            {
                dstData = articulation.jointForce;
                if (dofIndex == 0)
                    articulation.data.gpuDirtyFlag |= Dy::ArticulationDirtyFlag::eDIRTY_FORCES;
                break;
            }
            case (PxArticulationGPUAPIWriteType::eJOINT_TARGET_POSITION):
            {
                dstData = articulation.jointTargetPositions;
                if (dofIndex == 0)
                    articulation.data.gpuDirtyFlag |= Dy::ArticulationDirtyFlag::eDIRTY_JOINT_TARGET_POS;
                break;
            }
            case (PxArticulationGPUAPIWriteType::eJOINT_TARGET_VELOCITY):
            {
                dstData = articulation.jointTargetVelocities;
                if (dofIndex == 0)
                    articulation.data.gpuDirtyFlag |= Dy::ArticulationDirtyFlag::eDIRTY_JOINT_TARGET_VEL;
                break;
            }
            default:
            {
                // Return explicitly: when asserts are compiled out, falling through would
                // write through the unset dstData pointer below.
                assert(false);
                return;
            }
        }

        const PxU32 artiDofs = articulation.data.numJointDofs;
        const PxReal* PX_RESTRICT srcData = &data[groupIndex * maxDofs];
        if (dofIndex < artiDofs)
        {
            dstData[dofIndex] = srcData[dofIndex];
        }
    }
}
// Sets the root link pose of the selected articulations from user-provided transforms.
//
// Thread mapping: 1 thread - 1 articulation - 1 transform.
//
// scDesc      articulation core descriptor; scDesc->articulations is indexed with the values of `index`.
// data        input transforms, one per entry of `index`.
// index       articulation indices to write, nbElements entries.
// nbElements  number of entries in `index` and `data`.
//
// Stores data[i] * linkBody2Actors[0] into linkBody2Worlds[0] and marks the articulation with
// eDIRTY_ROOT_TRANSFORM and eNEEDS_KINEMATIC_UPDATE. Only the root link is written here.
extern "C" __global__ void setArtiRootGlobalPoseState(
    const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
    const PxTransform* PX_RESTRICT data,
    const PxArticulationGPUIndex* PX_RESTRICT index,
    const PxU32 nbElements
)
{
    const PxU32 entry = blockIdx.x * blockDim.x + threadIdx.x;

    // currently 1 thread does the new calc.
    if (entry >= nbElements)
        return;

    PxgArticulation& arti = scDesc->articulations[index[entry]];

    arti.data.gpuDirtyFlag |= (Dy::ArticulationDirtyFlag::eDIRTY_ROOT_TRANSFORM | Dy::ArticulationDirtyFlag::eNEEDS_KINEMATIC_UPDATE);

    const PxTransform requestedPose = data[entry];
    const PxTransform rootBody2Actor = arti.linkBody2Actors[0]; // TODO AD: this could be outdated - how do we resolve this?

    arti.linkBody2Worlds[0] = requestedPose * rootBody2Actor;
}
// Sets the linear or angular root velocity of the selected articulations.
//
// Thread mapping: 1 thread - 1 float of a PxVec3, i.e. 3 threads per articulation.
//
// scDesc      articulation core descriptor; scDesc->articulations is indexed with the values of `index`.
// data        input velocities, one PxVec3 per entry of `index`.
// index       articulation indices to write, nbElements entries.
// nbElements  number of entries in `index` and `data`.
// operation   eROOT_LINEAR_VELOCITY writes floats 3..5 of motionVelocities[0],
//             eROOT_ANGULAR_VELOCITY writes floats 0..2.
//
// The thread with localIndex == 0 marks the articulation with eDIRTY_ROOT_VELOCITIES and
// eNEEDS_KINEMATIC_UPDATE. Any other `operation` asserts and returns without modifying the articulation.
extern "C" __global__ void setArtiRootVelocityState(
    const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
    const PxVec3* PX_RESTRICT data,
    const PxArticulationGPUIndex* PX_RESTRICT index,
    const PxU32 nbElements,
    const PxArticulationGPUAPIWriteType::Enum operation
)
{
    // 1 thread - 1 vec3 element.
    const PxU32 threadPerArticulation = 3u;
    const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
    const PxU32 artiIndex = globalThreadIndex / threadPerArticulation;
    const PxU32 localIndex = globalThreadIndex % threadPerArticulation; // always < threadPerArticulation

    if (artiIndex < nbElements)
    {
        const PxArticulationGPUIndex articulationIndex = index[artiIndex];
        PxgArticulation& articulation = scDesc->articulations[articulationIndex];

        // Resolve the destination first so that an unsupported operation has no side effects.
        PxReal* dstData;
        switch (operation)
        {
            case PxArticulationGPUAPIWriteType::eROOT_LINEAR_VELOCITY:
            {
                dstData = reinterpret_cast<PxReal*>(articulation.motionVelocities) + 3;
                break;
            }
            case PxArticulationGPUAPIWriteType::eROOT_ANGULAR_VELOCITY:
            {
                dstData = reinterpret_cast<PxReal*>(articulation.motionVelocities);
                break;
            }
            default:
            {
                // Return explicitly: when asserts are compiled out, falling through would
                // write through the unset dstData pointer below.
                assert(0);
                return;
            }
        }

        if (localIndex == 0)
            articulation.data.gpuDirtyFlag |= (Dy::ArticulationDirtyFlag::eDIRTY_ROOT_VELOCITIES | Dy::ArticulationDirtyFlag::eNEEDS_KINEMATIC_UPDATE);

        const PxReal* PX_RESTRICT srcData = reinterpret_cast<const PxReal*>(&data[artiIndex]);
        dstData[localIndex] = srcData[localIndex];
    }
}
// Converts user-provided per-link forces into accelerations and stores them in the first three floats
// of the articulation's externalAccelerations entries.
//
// Thread mapping: 1 thread - 1 float of a PxVec3, i.e. 3 threads per link and 3 * maxLinks threads per articulation.
//
// scDesc      articulation core descriptor; scDesc->articulations is indexed with the values of `index`.
// data        input forces. Block g starts at data[g * maxLinks] with one PxVec3 per link slot.
// index       articulation indices to write, nbElements entries.
// nbElements  number of entries in `index`.
// maxLinks    number of PxVec3 slots per articulation block in `data`. Used as a divisor.
//
// Each force component is multiplied by the .w component of the link's invInertiaXYZ_invMass
// (the inverse mass, going by the field name). No dirty flag is touched by this kernel.
extern "C" __global__ void setArtiLinkForceState(
    const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
    const PxVec3* PX_RESTRICT data,
    const PxArticulationGPUIndex* PX_RESTRICT index,
    const PxU32 nbElements,
    const PxU32 maxLinks
)
{
    const PxU32 floatsPerLink = 3u; // PxVec3
    const PxU32 floatsPerArticulation = floatsPerLink * maxLinks;

    const PxU32 flatThreadId = blockIdx.x * blockDim.x + threadIdx.x;
    const PxU32 batchSlot = flatThreadId / floatsPerArticulation;
    const PxU32 floatSlot = flatThreadId % floatsPerArticulation;
    const PxU32 linkSlot = floatSlot / floatsPerLink;
    const PxU32 component = floatSlot % floatsPerLink;

    if (batchSlot >= nbElements)
        return;

    const PxgArticulation& arti = scDesc->articulations[index[batchSlot]];
    if (linkSlot >= arti.data.numLinks)
        return;

    const PxgArticulationLinkProp* const PX_RESTRICT props = arti.linkProps;
    Cm::UnAlignedSpatialVector* accelerations = arti.externalAccelerations;

    // this link's force and acceleration, both viewed as plain float arrays
    const PxReal* linkForce = reinterpret_cast<const PxReal*>(&data[batchSlot * maxLinks + linkSlot]);
    PxReal* linkAccel = reinterpret_cast<PxReal*>(&accelerations[linkSlot]);

    const PxReal invMass = props[linkSlot].invInertiaXYZ_invMass.w;
    linkAccel[component] = linkForce[component] * invMass;
}
// Converts user-provided per-link torques into accelerations and stores them in the `bottom` part of
// the articulation's externalAccelerations entries.
//
// Thread mapping: 1 thread - 1 link.
//
// scDesc      articulation core descriptor; scDesc->articulations is indexed with the values of gpuIndices.
// data        input torques. Block g starts at data[g * maxLinks] with one PxVec3 per link slot.
// gpuIndices  articulation indices to write, nbElements entries.
// nbElements  number of entries in gpuIndices.
// maxLinks    number of PxVec3 slots per articulation block in `data`. Used as a divisor.
//
// The torque is rotated by the inverse of the link's body2World rotation, scaled component-wise by the
// xyz part of invInertiaXYZ_invMass and rotated back. No dirty flag is touched by this kernel.
extern "C" __global__ void setArtiLinkTorqueState(
    const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
    const PxVec3* PX_RESTRICT data,
    const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
    const PxU32 nbElements,
    const PxU32 maxLinks
)
{
    const PxU32 flatThreadId = blockIdx.x * blockDim.x + threadIdx.x;
    const PxU32 batchSlot = flatThreadId / maxLinks;
    const PxU32 linkSlot = flatThreadId % maxLinks;

    if (batchSlot >= nbElements)
        return;

    PxgArticulation& arti = scDesc->articulations[gpuIndices[batchSlot]];
    if (linkSlot >= arti.data.numLinks)
        return;

    const PxgArticulationLinkProp* const PX_RESTRICT props = arti.linkProps;
    const PxTransform* PX_RESTRICT linkPoses = arti.linkBody2Worlds;
    Cm::UnAlignedSpatialVector* accelerations = arti.externalAccelerations;

    const PxQuat& linkRotation = linkPoses[linkSlot].q;
    const PxVec3& worldTorque = data[batchSlot * maxLinks + linkSlot];

    // world torque -> link-local torque
    const PxVec3 localTorque = linkRotation.rotateInv(worldTorque);

    // link-local torque -> link-local acceleration
    const PxVec3 invInertiaDiag = PxLoad3(props[linkSlot].invInertiaXYZ_invMass);
    const PxVec3 localAcceleration = invInertiaDiag.multiply(localTorque);

    // link-local acceleration -> world acceleration
    accelerations[linkSlot].bottom = linkRotation.rotate(localAcceleration);
}
// Writes spatial or fixed tendon parameters from a dense input buffer into the selected articulations.
//
// Thread mapping: 1 thread - 1 tendon slot.
// AD: might make sense to split this to more threads!
//
// scDesc      articulation core descriptor; scDesc->articulations is indexed with the values of gpuIndices.
// data        input buffer, interpreted as PxGpuSpatialTendonData or PxGpuFixedTendonData depending on
//             `operation`. Block g starts at element g * maxTendons.
// gpuIndices  articulation indices to write, nbElements entries.
// nbElements  number of entries in gpuIndices.
// maxTendons  number of tendon slots per articulation block in `data`. Used as a divisor.
// operation   eSPATIAL_TENDON or eFIXED_TENDON; anything else asserts and does nothing.
//
// The thread with tendon slot 0 raises the matching dirty flag, also when the articulation has no
// tendon of that kind.
extern "C" __global__ void setArtiTendonState(
    const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
    const void* PX_RESTRICT data,
    const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
    const PxU32 nbElements,
    const PxU32 maxTendons,
    const PxArticulationGPUAPIWriteType::Enum operation
)
{
    const PxU32 flatThreadId = blockIdx.x * blockDim.x + threadIdx.x;
    const PxU32 batchSlot = flatThreadId / maxTendons;
    const PxU32 tendonSlot = flatThreadId % maxTendons;

    if (batchSlot >= nbElements)
        return;

    PxgArticulation& arti = scDesc->articulations[gpuIndices[batchSlot]];
    const bool firstThreadOfArticulation = (tendonSlot == 0);

    if (operation == PxArticulationGPUAPIWriteType::eSPATIAL_TENDON)
    {
        if (tendonSlot < arti.data.numSpatialTendons)
        {
            const PxGpuSpatialTendonData* block = reinterpret_cast<const PxGpuSpatialTendonData*>(data) + batchSlot * maxTendons;
            arti.spatialTendonParams[tendonSlot] = block[tendonSlot];
        }

        if (firstThreadOfArticulation)
            arti.data.gpuDirtyFlag |= Dy::ArticulationDirtyFlag::eDIRTY_SPATIAL_TENDON;
    }
    else if (operation == PxArticulationGPUAPIWriteType::eFIXED_TENDON)
    {
        if (tendonSlot < arti.data.numFixedTendons)
        {
            const PxGpuFixedTendonData* block = reinterpret_cast<const PxGpuFixedTendonData*>(data) + batchSlot * maxTendons;
            arti.fixedTendonParams[tendonSlot] = block[tendonSlot];
        }

        if (firstThreadOfArticulation)
            arti.data.gpuDirtyFlag |= Dy::ArticulationDirtyFlag::eDIRTY_FIXED_TENDON;
    }
    else
    {
        assert(false);
    }
}
// Copies spatial or fixed tendon parameters of the selected articulations into a dense output buffer.
//
// Thread mapping: 1 thread - 1 tendon slot.
//
// scDesc      articulation core descriptor; scDesc->articulations is indexed with the values of gpuIndices.
// data        output buffer, interpreted as PxGpuSpatialTendonData or PxGpuFixedTendonData depending on
//             `operation`. Block g starts at element g * maxTendons; slots at or beyond the articulation's
//             tendon count are left untouched.
// gpuIndices  articulation indices to read, nbElements entries.
// nbElements  number of entries in gpuIndices.
// maxTendons  number of tendon slots per articulation block in `data`. Used as a divisor.
// operation   eSPATIAL_TENDON or eFIXED_TENDON; anything else asserts and does nothing.
extern "C" __global__ void getArtiTendonState(
    const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
    void* PX_RESTRICT data,
    const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
    const PxU32 nbElements,
    const PxU32 maxTendons,
    const PxArticulationGPUAPIReadType::Enum operation
)
{
    const PxU32 flatThreadId = blockIdx.x * blockDim.x + threadIdx.x;
    const PxU32 batchSlot = flatThreadId / maxTendons;
    const PxU32 tendonSlot = flatThreadId % maxTendons;

    if (batchSlot >= nbElements)
        return;

    const PxgArticulation& arti = scDesc->articulations[gpuIndices[batchSlot]];

    if (operation == PxArticulationGPUAPIReadType::eSPATIAL_TENDON)
    {
        if (tendonSlot < arti.data.numSpatialTendons)
        {
            PxGpuSpatialTendonData* block = reinterpret_cast<PxGpuSpatialTendonData*>(data) + batchSlot * maxTendons;
            block[tendonSlot] = arti.spatialTendonParams[tendonSlot];
        }
    }
    else if (operation == PxArticulationGPUAPIReadType::eFIXED_TENDON)
    {
        if (tendonSlot < arti.data.numFixedTendons)
        {
            PxGpuFixedTendonData* block = reinterpret_cast<PxGpuFixedTendonData*>(data) + batchSlot * maxTendons;
            block[tendonSlot] = arti.fixedTendonParams[tendonSlot];
        }
    }
    else
    {
        assert(false);
    }
}
// Writes spatial tendon attachment data from a dense input buffer into the selected articulations.
//
// Thread mapping: 1 thread - 1 attachment slot. Within an articulation the slot index is split into
// tendonIndex (slot / maxTendonAttachments) and attachmentIndex (slot % maxTendonAttachments).
//
// scDesc      articulation core descriptor. Provides the articulations as well as
//             mMaxAttachmentPerArticulation (used as the number of attachment slots per tendon) and
//             mMaxSpatialTendonsPerArticulation.
// data        input buffer. Block g starts at data[g * maxSpatialTendons * maxTendonAttachments]; inside a
//             block, tendon t occupies the maxTendonAttachments entries starting at t * maxTendonAttachments.
// gpuIndices  articulation indices to write, nbElements entries.
// nbElements  number of entries in gpuIndices.
// maxTendons  number of threads (attachment slots) per articulation. Used as a divisor.
//             NOTE(review): presumably equals maxSpatialTendons * maxTendonAttachments - confirm at the launch site.
//
// The thread with slot 0 raises eDIRTY_SPATIAL_TENDON_ATTACHMENT.
//
// The source entry is selected with both the tendon and the attachment index. Indexing with the
// attachment index alone would make every tendon read the entries of tendon 0.
extern "C" __global__ void setArtiSpatialTendonAttachmentState(
    const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
    const PxGpuTendonAttachmentData* PX_RESTRICT data,
    const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
    const PxU32 nbElements,
    const PxU32 maxTendons
)
{
    const PxU32 maxTendonAttachments = scDesc->mMaxAttachmentPerArticulation;
    const PxU32 maxSpatialTendons = scDesc->mMaxSpatialTendonsPerArticulation;

    const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
    const PxU32 groupIndex = globalThreadIndex / maxTendons;
    const PxU32 elementIndex = globalThreadIndex % maxTendons;
    const PxU32 tendonIndex = elementIndex / maxTendonAttachments;
    const PxU32 attachmentIndex = elementIndex % maxTendonAttachments;

    if (groupIndex < nbElements)
    {
        const PxArticulationGPUIndex articulationIndex = gpuIndices[groupIndex];
        PxgArticulation& articulation = scDesc->articulations[articulationIndex];

        if (elementIndex == 0)
            articulation.data.gpuDirtyFlag |= Dy::ArticulationDirtyFlag::eDIRTY_SPATIAL_TENDON_ATTACHMENT;

        const PxU32 numSpatialTendons = articulation.data.numSpatialTendons;
        if (tendonIndex < numSpatialTendons)
        {
            PxgArticulationTendon& tendon = articulation.spatialTendons[tendonIndex];

            // start of this articulation's block, then of this tendon's sub-block inside it
            const PxGpuTendonAttachmentData* articulationSrc = &data[groupIndex * maxSpatialTendons * maxTendonAttachments];
            const PxGpuTendonAttachmentData* srcData = &articulationSrc[tendonIndex * maxTendonAttachments];

            PxGpuTendonAttachmentData* attachData = reinterpret_cast<PxGpuTendonAttachmentData*>(tendon.mModElements);

            const PxU32 numTendonAttachments = tendon.mNbElements;
            if (attachmentIndex < numTendonAttachments)
            {
                //PxGpuTendonAttachmentData is 32 byte and is copied as two uint4
                PX_COMPILE_TIME_ASSERT(sizeof(PxGpuTendonAttachmentData) == 32);
                const PxU32 numIteration = sizeof(PxGpuTendonAttachmentData) / sizeof(uint4);

                const uint4* tData = reinterpret_cast<const uint4*>(&srcData[attachmentIndex]);
                uint4* aData = reinterpret_cast<uint4*>(&attachData[attachmentIndex]);
                for (PxU32 i = 0; i < numIteration; ++i)
                {
                    aData[i] = tData[i];
                }
            }
        }
    }
}
// Copies spatial tendon attachment data of the selected articulations into a dense output buffer.
//
// Thread mapping: 1 thread - 1 attachment slot. Within an articulation the slot index is split into
// tendonIndex (slot / maxTendonAttachments) and attachmentIndex (slot % maxTendonAttachments).
//
// scDesc      articulation core descriptor. Provides the articulations as well as
//             mMaxAttachmentPerArticulation (used as the number of attachment slots per tendon) and
//             mMaxSpatialTendonsPerArticulation.
// data        output buffer. Block g starts at data[g * maxSpatialTendons * maxTendonAttachments]; inside a
//             block, tendon t occupies the maxTendonAttachments entries starting at t * maxTendonAttachments.
//             Slots without a matching tendon / attachment are left untouched.
// gpuIndices  articulation indices to read, nbElements entries.
// nbElements  number of entries in gpuIndices.
// maxTendons  number of threads (attachment slots) per articulation. Used as a divisor.
//             NOTE(review): presumably equals maxSpatialTendons * maxTendonAttachments - confirm at the launch site.
//
// The destination entry is selected with both the tendon and the attachment index. Indexing with the
// attachment index alone would make the threads of all tendons write into the slots of tendon 0.
extern "C" __global__ void getArtiSpatialTendonAttachmentState(
    const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
    PxGpuTendonAttachmentData* PX_RESTRICT data,
    const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
    const PxU32 nbElements,
    const PxU32 maxTendons
)
{
    const PxU32 maxTendonAttachments = scDesc->mMaxAttachmentPerArticulation;
    const PxU32 maxSpatialTendons = scDesc->mMaxSpatialTendonsPerArticulation;

    const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
    const PxU32 groupIndex = globalThreadIndex / maxTendons;
    const PxU32 elementIndex = globalThreadIndex % maxTendons;
    const PxU32 tendonIndex = elementIndex / maxTendonAttachments;
    const PxU32 attachmentIndex = elementIndex % maxTendonAttachments;

    if (groupIndex < nbElements)
    {
        const PxArticulationGPUIndex articulationIndex = gpuIndices[groupIndex];
        const PxgArticulation& articulation = scDesc->articulations[articulationIndex];

        const PxU32 numSpatialTendons = articulation.data.numSpatialTendons;
        if (tendonIndex < numSpatialTendons)
        {
            const PxgArticulationTendon& tendon = articulation.spatialTendons[tendonIndex];

            // start of this articulation's block, then of this tendon's sub-block inside it
            PxGpuTendonAttachmentData* articulationDst = &data[groupIndex * maxSpatialTendons * maxTendonAttachments];
            PxGpuTendonAttachmentData* dstData = &articulationDst[tendonIndex * maxTendonAttachments];

            const PxGpuTendonAttachmentData* attachData = reinterpret_cast<const PxGpuTendonAttachmentData*>(tendon.mModElements);

            const PxU32 numTendonAttachments = tendon.mNbElements;
            if (attachmentIndex < numTendonAttachments)
            {
                //PxGpuTendonAttachmentData is 32 byte and is copied as two uint4
                PX_COMPILE_TIME_ASSERT(sizeof(PxGpuTendonAttachmentData) == 32);
                const PxU32 numIteration = sizeof(PxGpuTendonAttachmentData) / sizeof(uint4);

                uint4* tData = reinterpret_cast<uint4*>(&dstData[attachmentIndex]);
                const uint4* aData = reinterpret_cast<const uint4*>(&attachData[attachmentIndex]);
                for (PxU32 i = 0; i < numIteration; ++i)
                {
                    tData[i] = aData[i];
                }
            }
        }
    }
}
// Writes user-provided fixed-tendon joint coefficient data into the addressed articulations.
//
// Thread mapping: each thread handles one (slot, tendon, tendon joint) triple, where
//   slot         = globalThreadIndex / (maxFixedTendons * maxTendonJoints) -> entry of gpuIndices
//   tendon       = (globalThreadIndex % (maxFixedTendons * maxTendonJoints)) / maxTendonJoints
//   tendon joint = (globalThreadIndex % (maxFixedTendons * maxTendonJoints)) % maxTendonJoints
//
// Layout of `data`: one block of (maxFixedTendons * maxTendonJoints) entries per slot;
// inside a block, each tendon owns a contiguous run of maxTendonJoints entries.
// Entries beyond an articulation's actual tendon/joint counts are ignored.
//
// scDesc      articulation core descriptor (per-articulation maxima and articulation array)
// data        source buffer, read as described above
// gpuIndices  articulation index for every slot
// nbElements  number of slots in gpuIndices
extern "C" __global__ void setArtiFixedTendonJointState(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	const PxGpuTendonJointCoefficientData* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
	const PxU32 nbElements
)
{
	const PxU32 maxFixedTendons = scDesc->mMaxFixedTendonsPerArticulation;
	const PxU32 maxTendonJoints = scDesc->mMaxTendonJointPerArticulation;
	const PxU32 maxTendons = maxFixedTendons * maxTendonJoints;

	const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
	const PxU32 groupIndex = globalThreadIndex / maxTendons;
	const PxU32 elementIndex = globalThreadIndex % maxTendons;
	const PxU32 tendonIndex = elementIndex / maxTendonJoints;
	const PxU32 tendonJointIndex = elementIndex % maxTendonJoints;

	// Threads past the last requested slot have nothing to do.
	if (groupIndex >= nbElements)
		return;

	const PxArticulationGPUIndex articulationIndex = gpuIndices[groupIndex];
	PxgArticulation& articulation = scDesc->articulations[articulationIndex];

	// Only the first thread of each slot raises the dirty flag, so the
	// read-modify-write is performed once per slot by this kernel.
	if (elementIndex == 0)
		articulation.data.gpuDirtyFlag |= Dy::ArticulationDirtyFlag::eDIRTY_FIXED_TENDON_JOINT;

	// This articulation may own fewer fixed tendons than the per-articulation maximum.
	const PxU32 numFixedTendons = articulation.data.numFixedTendons;
	if (tendonIndex >= numFixedTendons)
		return;

	PxgArticulationTendon& tendon = articulation.fixedTendons[tendonIndex];

	// This tendon may own fewer joints than the per-articulation maximum.
	if (tendonJointIndex >= tendon.mNbElements)
		return;

	//PxGpuTendonJointCoefficientData is 16 byte
	PX_COMPILE_TIME_ASSERT(sizeof(PxGpuTendonJointCoefficientData) == 16);

	// Start of this tendon's run inside the slot's block. The tendon offset is required:
	// without it every tendon of the articulation would be fed the first tendon's entries.
	const PxGpuTendonJointCoefficientData* srcData = &data[(groupIndex * maxFixedTendons + tendonIndex) * maxTendonJoints];
	PxGpuTendonJointCoefficientData* coefficientData = reinterpret_cast<PxGpuTendonJointCoefficientData*>(tendon.mModElements);

	// Move the whole 16-byte entry as a single uint4.
	const uint4 tData = reinterpret_cast<const uint4&>(srcData[tendonJointIndex]);
	uint4& coefData = reinterpret_cast<uint4&>(coefficientData[tendonJointIndex]);
	coefData = tData;
}
// Copies fixed-tendon joint coefficient data out of the addressed articulations into `data`.
//
// Thread mapping: each thread handles one (slot, tendon, tendon joint) triple, where
//   slot         = globalThreadIndex / (maxFixedTendons * maxTendonJoints) -> entry of gpuIndices
//   tendon       = (globalThreadIndex % (maxFixedTendons * maxTendonJoints)) / maxTendonJoints
//   tendon joint = (globalThreadIndex % (maxFixedTendons * maxTendonJoints)) % maxTendonJoints
//
// Layout of `data`: one block of (maxFixedTendons * maxTendonJoints) entries per slot;
// inside a block, each tendon owns a contiguous run of maxTendonJoints entries.
// Entries beyond an articulation's actual tendon/joint counts are left untouched.
//
// scDesc      articulation core descriptor (per-articulation maxima and articulation array)
// data        destination buffer, written as described above
// gpuIndices  articulation index for every slot
// nbElements  number of slots in gpuIndices
extern "C" __global__ void getArtiFixedTendonJointState(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	PxGpuTendonJointCoefficientData* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
	const PxU32 nbElements
)
{
	const PxU32 maxFixedTendons = scDesc->mMaxFixedTendonsPerArticulation;
	const PxU32 maxTendonJoints = scDesc->mMaxTendonJointPerArticulation;
	const PxU32 maxTendons = maxFixedTendons * maxTendonJoints;

	const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
	const PxU32 groupIndex = globalThreadIndex / maxTendons;
	const PxU32 elementIndex = globalThreadIndex % maxTendons;
	const PxU32 tendonIndex = elementIndex / maxTendonJoints;
	const PxU32 tendonJointIndex = elementIndex % maxTendonJoints;

	// Threads past the last requested slot have nothing to do.
	if (groupIndex >= nbElements)
		return;

	const PxArticulationGPUIndex articulationIndex = gpuIndices[groupIndex];
	const PxgArticulation& articulation = scDesc->articulations[articulationIndex];

	// This articulation may own fewer fixed tendons than the per-articulation maximum.
	const PxU32 numFixedTendons = articulation.data.numFixedTendons;
	if (tendonIndex >= numFixedTendons)
		return;

	const PxgArticulationTendon& tendon = articulation.fixedTendons[tendonIndex];

	// This tendon may own fewer joints than the per-articulation maximum.
	if (tendonJointIndex >= tendon.mNbElements)
		return;

	//PxGpuTendonJointCoefficientData is 16 byte
	PX_COMPILE_TIME_ASSERT(sizeof(PxGpuTendonJointCoefficientData) == 16);

	// Start of this tendon's run inside the slot's block. The tendon offset is required:
	// without it every tendon of the articulation would write to the first tendon's entries.
	PxGpuTendonJointCoefficientData* dstData = &data[(groupIndex * maxFixedTendons + tendonIndex) * maxTendonJoints];
	const PxGpuTendonJointCoefficientData* coefficientData = reinterpret_cast<const PxGpuTendonJointCoefficientData*>(tendon.mModElements);

	// Move the whole 16-byte entry as a single uint4.
	uint4& tData = reinterpret_cast<uint4&>(dstData[tendonJointIndex]);
	const uint4& coefData = reinterpret_cast<const uint4&>(coefficientData[tendonJointIndex]);
	tData = coefData;
}