// Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved. // Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. // Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
#include "PxDirectGPUAPI.h"
#include "foundation/PxSimpleTypes.h"
#include "foundation/PxVec3.h"
#include "foundation/PxTransform.h"
#include "foundation/PxMath.h"
#include "PxArticulationFlag.h"
#include "PxArticulationTendonData.h"
#include "PxgArticulation.h"
#include "PxgArticulationLink.h"
#include "PxgArticulationCoreDesc.h"
#include "PxgShapeSim.h"
#include "PxsTransformCache.h"
#include "DyArticulationCore.h"
#include "DyArticulationJointCore.h"
#include "DyFeatherstoneArticulation.h"
#include "DyFeatherstoneArticulationUtils.h"
#include "DyFeatherstoneArticulationJointData.h"
#include "CmSpatialVector.h"
#include "utils.cuh"
#include "reduction.cuh"
#include "updateCacheAndBound.cuh"
// NOTE(review): the original directive here lost its <...> argument during extraction.
// <assert.h> restored because device-side assert() is used below — confirm against upstream.
#include <assert.h>

using namespace physx;
using namespace Dy;

extern "C" __host__ void initArticulationKernels3() {}

// Recomputes link body2World poses from the current joint positions (and, when
// zeroSimOutput is true, also recomputes link velocities and zeros sim outputs),
// then refreshes transform cache entries and broad-phase bounds for every shape
// belonging to this articulation.
// Runs with one warp per articulation; threadIndexInWarp is the lane id [0, 32).
// NOTE(review): the template parameter list was stripped by extraction; restored as
// <bool zeroSimOutput> — this matches the if (zeroSimOutput) uses below and the
// <true>/<false> call sites in artiUpdateKinematic.
template <bool zeroSimOutput>
static PX_FORCE_INLINE __device__ void updateKinematicInternal(
	const PxgArticulation& articulation,
	const PxU32 threadIndexInWarp,
	const PxgShapeSim* PX_RESTRICT gShapeSimPool,
	const PxgShape* PX_RESTRICT gConvexShapes,
	PxsCachedTransform* PX_RESTRICT gTransformCache,
	const PxNodeIndex* PX_RESTRICT gRigidNodeIndices,	//sorted rigid body node index
	const PxU32* PX_RESTRICT gShapeIndices,				//the corresponding shape index
	const PxU32 numShapes,
	PxBounds3* PX_RESTRICT bounds)
{
	const PxgArticulationData artiData = articulation.data;
	const PxU32 linkCount = artiData.numLinks;
	const PxU32 bodySimIndex = artiData.bodySimIndex;

	PxQuat* tempNewParentToChilds = articulation.tempParentToChilds;
	PxVec3* tempRs = articulation.tempRs;
	PxTransform* body2Worlds = articulation.linkBody2Worlds;
	ArticulationJointCore* joints = articulation.joints;
	ArticulationJointCoreData* jointData = articulation.jointData;
	const PxQuat* const PX_RESTRICT relativeQuats = articulation.relativeQuat;
	const PxU32* const PX_RESTRICT parents = articulation.parents;
	const SpatialSubspaceMatrix* const PX_RESTRICT motionMatrix = articulation.motionMatrix;

	//each thread deals with a joint
	// Phase 1 (parallel over links): compute per-joint parent->child rotation and
	// offset vector into the temp buffers. Link 0 is the root and has no inbound joint.
	for (PxU32 linkID = 1 + threadIndexInWarp; linkID < linkCount; linkID += WARP_SIZE)
	{
		const ArticulationJointCoreData& jointDatum = jointData[linkID];
		const ArticulationJointCore& joint = joints[linkID];
		const PxReal* jPosition = &articulation.jointPositions[jointDatum.jointOffset];

		PxQuat& newParentToChild = tempNewParentToChilds[linkID];
		PxVec3& r = tempRs[linkID];

		const PxVec3 childOffset = -joint.childPose.p;
		const PxVec3 parentOffset = joint.parentPose.p;
		const PxQuat relativeQuat = relativeQuats[linkID];

		PxVec3 e, d;
		switch (joint.jointType)
		{
		case PxArticulationJointType::ePRISMATIC:
		{
			newParentToChild = relativeQuat;
			const PxVec3& u = motionMatrix[linkID][0].bottom;

			const PxVec3 e = newParentToChild.rotate(parentOffset);
			const PxVec3 d = childOffset;
			r = e + d + u * jPosition[0];
			break;
		}
		case PxArticulationJointType::eREVOLUTE:
		case PxArticulationJointType::eREVOLUTE_UNWRAPPED:
		{
			const PxVec3& u = motionMatrix[linkID][0].top;

			PxQuat jointRotation = PxQuat(-jPosition[0], u);
			if (jointRotation.w < 0)	//shortest angle.
				jointRotation = -jointRotation;

			newParentToChild = (jointRotation * relativeQuat).getNormalized();

			const PxVec3 e = newParentToChild.rotate(parentOffset);
			const PxVec3 d = childOffset;
			r = e + d;

			assert(r.isFinite());
			break;
		}
		case PxArticulationJointType::eSPHERICAL:
		{
			PxQuat jointRotation(PxIdentity);
			// Accumulate the axis-angle contribution of each DOF, then convert to a quat.
			PxVec3 ang(0.f);
			for (PxU32 d = 0; d < jointDatum.nbDof; ++d)
			{
				ang += motionMatrix[linkID][d].top * -jPosition[d];
			}
			PxReal angle = ang.normalize();
			jointRotation = angle < 1e-10f ? PxQuat(PxIdentity) : PxQuat(angle, ang);
			if (jointRotation.w < 0.f)
				jointRotation = -jointRotation;

			newParentToChild = (jointRotation * relativeQuat).getNormalized();

			const PxVec3 e = newParentToChild.rotate(parentOffset);
			const PxVec3 d = childOffset;
			r = e + d;
			break;
		}
		case PxArticulationJointType::eFIX:
		{
			//this is fix joint so joint don't have velocity
			newParentToChild = relativeQuat;

			const PxVec3 e = newParentToChild.rotate(parentOffset);
			const PxVec3 d = childOffset;
			r = e + d;
			break;
		}
		default:
			break;
		}
	}

	__syncwarp();

	// Phase 2 (serial on lane 0): propagate poses root->leaves. This traversal is
	// order-dependent (a child needs its parent's updated pose), hence single-threaded.
	if (threadIndexInWarp == 0)
	{
		for (PxU32 linkID = 1; linkID < linkCount; ++linkID)
		{
			const PxU32 parent = parents[linkID];
			const PxTransform pBody2World = body2Worlds[parent];
			PxQuat& newParentToChild = tempNewParentToChilds[linkID];
			PxVec3& r = tempRs[linkID];
			PxTransform& body2World = body2Worlds[linkID];
			body2World.q = (pBody2World.q * newParentToChild.getConjugate()).getNormalized();
			body2World.p = pBody2World.p + body2World.q.rotate(r);

			// we do NOT calculate updated link velocities if this is the automatic pre-sim updateKinematic call,
			// because we will immediately do it again because we also need to clamp the joint velocities.
			// if this was a velocity-only update we already skip in the parent function, so we only end up here if positions are dirty.
			if (zeroSimOutput)
			{
				// link velocity update - unfortunately also dependent on parent position and velocity.
				Cm::UnAlignedSpatialVector parentVel = articulation.motionVelocities[parent];
				const PxVec3 c2p = body2World.p - pBody2World.p;
				Cm::UnAlignedSpatialVector linkVelocity = FeatherstoneArticulation::translateSpatialVector(-c2p, parentVel);

				// AD unfortunately this is more-or-less the same code as in computeLinkVelocities, minus the maxJointVel clamping.
				const ArticulationJointCoreData& jointDatum = jointData[linkID];
				const PxReal* jVelocity = &articulation.jointVelocities[jointDatum.jointOffset];
				for (PxU32 ind = 0; ind < jointDatum.nbDof; ++ind)
				{
					const Cm::UnAlignedSpatialVector worldCol = motionMatrix[linkID][ind].rotate(body2World);
					const PxReal jVel = jVelocity[ind];
					linkVelocity += worldCol * jVel;
				}
				articulation.motionVelocities[linkID] = linkVelocity;
			}
		}
	}

	__syncwarp();

	if (zeroSimOutput) // AD: we could potentially ingest this into the link traversal++, but let's not get ahead of ourselves.
	{
		// Zero the sim outputs (accelerations / incoming joint forces) element-wise,
		// treating the spatial-vector arrays as flat PxReal arrays.
		const PxU32 numLinks = articulation.data.numLinks;
		const PxU32 numDofs = articulation.data.numJointDofs;

		PxReal* PX_RESTRICT linkAccelData = reinterpret_cast<PxReal*>(articulation.motionAccelerations);
		PxReal* PX_RESTRICT linkIncomingJointForceData = reinterpret_cast<PxReal*>(articulation.linkIncomingJointForces);

		const PxU32 numRealsForSpatialVector = sizeof(Cm::UnAlignedSpatialVector) / 4;
		const PxU32 maxLinksReal = articulation.data.numLinks * numRealsForSpatialVector;
		const PxU32 linkCountReal = numLinks * numRealsForSpatialVector;

		for (PxU32 index = threadIndexInWarp; index < PxMax(numDofs, maxLinksReal); index += WARP_SIZE)
		{
			if (index < linkCountReal)
			{
				linkAccelData[index] = 0.f;
				linkIncomingJointForceData[index] = 0.f;
			}

			if (index < numDofs)
			{
				articulation.jointAccelerations[index] = 0.f;
			}
		}
	}

	if (numShapes == 0)
	{
		// guard against no actors have shapes in scene, in which case gRigidNodeIndices, etc. would have zero elements
		return;
	}

	__syncwarp();

	const PxTransform* PX_RESTRICT linkBody2Actors = articulation.linkBody2Actors;

	//each thread deals with a link
	// Phase 3 (parallel over links): refresh transform cache + bounds for each shape of each link.
	for (PxU32 linkID = threadIndexInWarp; linkID < linkCount; linkID += WARP_SIZE)
	{
		const PxNodeIndex linkNodeIndex(bodySimIndex, linkID);
		const PxTransform body2World = body2Worlds[linkID];
		const PxTransform body2Actor = linkBody2Actors[linkID];

		//this will search for the first pos for the matched node index
		PxU32 pos = binarySearch(gRigidNodeIndices, numShapes, linkNodeIndex);

		// go backward through the sorted rigid node index array which has an entry for each
		// shape of the link, and update the shape if it belongs to the link, i.e. the rigid
		// node index matches the link node index
		while (pos != 0xFFffFFff && gRigidNodeIndices[pos] == linkNodeIndex)
		{
			const PxU32 shapeIndex = gShapeIndices[pos];
			if (shapeIndex != 0xFFffFFff)
			{
				const PxgShapeSim& shapeSim = gShapeSimPool[shapeIndex];
				const PxTransform absPos = getAbsPose(body2World, shapeSim.mTransform, body2Actor);
				//update broad phase bound, transform cache
				updateCacheAndBound(absPos, shapeSim, shapeIndex, gTransformCache, bounds, gConvexShapes, true);
			}
			pos--;
		}
	}
}

//This function is called after user update gpu buffer(Dy::ArticulationDirtyFlag::eDIRTY_ROOT || Dy::ArticulationDirtyFlag::eDIRTY_POSITIONS)
extern "C" __global__ void artiUpdateKinematic(
	const PxgArticulationCoreDesc* const PX_RESTRICT scDesc,
	const PxgShapeSim* PX_RESTRICT gShapeSimPool,
	const PxgShape* PX_RESTRICT gConvexShapes,
	PxsCachedTransform* PX_RESTRICT gTransformCache,
	const PxNodeIndex* PX_RESTRICT gRigidNodeIndices,
	const PxU32* PX_RESTRICT gShapeIndices,
	const PxU32 numShapes,
	PxBounds3* PX_RESTRICT bounds,
	const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,	// NULL: process all the dirty articulations
	const PxU32 nbElements,									// can be 0 in combination with NULL index buffer.
	bool zeroSimOutput)
{
	// we launch blocks of 32x2 threads. 1 warp deals with 1 articulation.
	assert(blockDim.x == 32);
	assert(blockDim.y == 2);

	const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE - 1);
	const PxU32 globalWarpIndex = blockIdx.x * blockDim.y + threadIdx.y;
	const PxU32 warpsPerGrid = gridDim.x * blockDim.y;

	const PxU32 nbArticulations = gpuIndices ? nbElements : scDesc->nbArticulations;

	// Warp-stride loop over the articulations to process.
	for (PxU32 i = globalWarpIndex; i < nbArticulations; i += warpsPerGrid)
	{
		const PxU32 articulationIndex = gpuIndices ? gpuIndices[i] : i;
		PxgArticulation& articulation = scDesc->articulations[articulationIndex];

		if (articulation.data.gpuDirtyFlag & ArticulationDirtyFlag::eNEEDS_KINEMATIC_UPDATE)
		{
			// reset while in cache.
			articulation.data.gpuDirtyFlag &= ~(ArticulationDirtyFlag::eNEEDS_KINEMATIC_UPDATE);

			// if we only have dirty velocities and this is the pre-sim automatic call, we can skip that articulation.
			// The link velocities will be updated in computeLinkVelocities anyway.
			PxU32 mask = (ArticulationDirtyFlag::eDIRTY_ROOT_TRANSFORM | ArticulationDirtyFlag::eDIRTY_POSITIONS);
			bool positionsDirty = articulation.data.gpuDirtyFlag & mask;
			if (!zeroSimOutput && !positionsDirty)
				continue;

			// NOTE(review): template arguments restored (stripped by extraction) — the
			// two branches clearly instantiate the compile-time variants.
			if (zeroSimOutput)
			{
				updateKinematicInternal<true>(articulation, threadIndexInWarp, gShapeSimPool, gConvexShapes, gTransformCache, gRigidNodeIndices, gShapeIndices, numShapes, bounds);
			}
			else
			{
				updateKinematicInternal<false>(articulation, threadIndexInWarp, gShapeSimPool, gConvexShapes, gTransformCache, gRigidNodeIndices, gShapeIndices, numShapes, bounds);
			}
		}
	}
}

// Copies one joint-space state array (positions/velocities/...) per articulation into
// the user's dense output buffer (stride maxDofs reals per articulation).
extern "C" __global__ void getArtiDofStates(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	PxReal* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT index,
	const PxU32 nbElements,
	const PxU32 maxDofs,
	PxArticulationGPUAPIReadType::Enum type
)
{
	// 1 thread - 1 dof.
	// input has maxDofs * PxReal * nbArticulations size.
	const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
	const PxU32 groupIndex = globalThreadIndex / maxDofs;
	const PxU32 dofIndex = globalThreadIndex % maxDofs;

	if (groupIndex < nbElements)
	{
		const PxArticulationGPUIndex articulationIndex = index[groupIndex];
		const PxgArticulation& articulation = scDesc->articulations[articulationIndex];

		PxReal* PX_RESTRICT dstData = &data[groupIndex * maxDofs];
		const PxReal* PX_RESTRICT srcData;
		switch (type)
		{
			case PxArticulationGPUAPIReadType::eJOINT_POSITION:
			{
				srcData = articulation.jointPositions;
				break;
			}
			case PxArticulationGPUAPIReadType::eJOINT_VELOCITY:
			{
				srcData = articulation.jointVelocities;
				break;
			}
			case PxArticulationGPUAPIReadType::eJOINT_ACCELERATION:
			{
				srcData = articulation.jointAccelerations;
				break;
			}
			case PxArticulationGPUAPIReadType::eJOINT_FORCE:
			{
				srcData = articulation.jointForce;
				break;
			}
			case PxArticulationGPUAPIReadType::eJOINT_TARGET_VELOCITY:
			{
				srcData = articulation.jointTargetVelocities;
				break;
			}
			case PxArticulationGPUAPIReadType::eJOINT_TARGET_POSITION:
			{
				srcData = articulation.jointTargetPositions;
				break;
			}
			default:
				assert(0);
		}

		const PxU32 artiDofs = articulation.data.numJointDofs;

		if (dofIndex < artiDofs)
		{
			dstData[dofIndex] = srcData[dofIndex];
		}
	}
}

// Copies link poses (actor frame, i.e. body2World composed with inverse body2Actor)
// into the user's dense output buffer. For eROOT_GLOBAL_POSE only link 0 is written.
extern "C" __global__ void getArtiTransformStates(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	PxTransform* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT index,
	const PxU32 nbElements,
	const PxU32 maxLinks,
	PxArticulationGPUAPIReadType::Enum type
)
{
	// 1 thread - 1 link, 1 transform
	const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
	const PxU32 groupIndex = globalThreadIndex / maxLinks;
	const PxU32 linkIndex = globalThreadIndex % maxLinks;

	if (groupIndex < nbElements)
	{
		const PxArticulationGPUIndex articulationIndex = index[groupIndex];
		const PxgArticulation& articulation = scDesc->articulations[articulationIndex];

		PxU32 numLinks = (type == PxArticulationGPUAPIReadType::eROOT_GLOBAL_POSE) ? 1 : articulation.data.numLinks;

		PxTransform* dstData = &data[groupIndex * maxLinks];

		if (linkIndex < numLinks)
		{
			const PxTransform body2Actor = articulation.linkBody2Actors[linkIndex];
			const PxTransform body2World = articulation.linkBody2Worlds[linkIndex];
			dstData[linkIndex] = body2World * body2Actor.getInverse();
		}
	}
}

// Copies the linear or angular half of each link's spatial velocity/acceleration into
// the user's dense PxVec3 output buffer. The source arrays store 6 reals per link
// (Cm::UnAlignedSpatialVector: top = angular at offset 0, bottom = linear at offset 3,
// as selected by the +3 offsets below).
extern "C" __global__ void getArtiVelocityStates(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	PxVec3* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT index,
	const PxU32 nbElements,
	const PxU32 maxLinks,
	PxArticulationGPUAPIReadType::Enum type
)
{
	// 1 thread - 1 vec3 element.
	const PxU32 threadPerGroup = 3u;
	const PxU32 threadPerArticulation = threadPerGroup * maxLinks;

	const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
	const PxU32 groupIndex = globalThreadIndex / threadPerArticulation;
	const PxU32 localIndex = globalThreadIndex % threadPerArticulation;
	const PxU32 linkIndex = localIndex / threadPerGroup;
	const PxU32 elementIndex = localIndex % threadPerGroup;

	if (groupIndex < nbElements)
	{
		const PxArticulationGPUIndex articulationIndex = index[groupIndex];
		const PxgArticulation& articulation = scDesc->articulations[articulationIndex];

		PxU32 numLinks;
		const PxReal* PX_RESTRICT srcData;
		switch (type)
		{
			case PxArticulationGPUAPIReadType::eROOT_LINEAR_VELOCITY:
			{
				srcData = reinterpret_cast<const PxReal*>(articulation.motionVelocities) + 3;
				numLinks = 1;
				break;
			}
			case PxArticulationGPUAPIReadType::eROOT_ANGULAR_VELOCITY:
			{
				srcData = reinterpret_cast<const PxReal*>(articulation.motionVelocities);
				numLinks = 1;
				break;
			}
			case PxArticulationGPUAPIReadType::eLINK_LINEAR_VELOCITY:
			{
				srcData = reinterpret_cast<const PxReal*>(articulation.motionVelocities) + 3;
				numLinks = articulation.data.numLinks;
				break;
			}
			case PxArticulationGPUAPIReadType::eLINK_ANGULAR_VELOCITY:
			{
				srcData = reinterpret_cast<const PxReal*>(articulation.motionVelocities);
				numLinks = articulation.data.numLinks;
				break;
			}
			case PxArticulationGPUAPIReadType::eLINK_LINEAR_ACCELERATION:
			{
				srcData = reinterpret_cast<const PxReal*>(articulation.motionAccelerations) + 3;
				numLinks = articulation.data.numLinks;
				break;
			}
			case PxArticulationGPUAPIReadType::eLINK_ANGULAR_ACCELERATION:
			{
				srcData = reinterpret_cast<const PxReal*>(articulation.motionAccelerations);
				numLinks = articulation.data.numLinks;
				break;
			}
			default:
				assert(0);
		}

		if (linkIndex < numLinks)
		{
			PxReal* PX_RESTRICT dstData = reinterpret_cast<PxReal*>(&data[groupIndex * maxLinks + linkIndex]);
			const PxReal* PX_RESTRICT srcDataU = &srcData[linkIndex * threadPerGroup * 2]; // careful because source is Cm::UnAlignedSpatialVector

			if (elementIndex < threadPerGroup)
			{
				dstData[elementIndex] = srcDataU[elementIndex];
			}
		}
	}
}

// Copies each link's incoming joint force (full 6-real spatial vector) into the
// user's dense output buffer (stride maxLinks * 6 reals per articulation).
extern "C" __global__ void getArtiSpatialForceStates(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	PxReal* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
	const PxU32 nbElements,
	const PxU32 maxLinks
)
{
	// 1 thread - 1 element of Cm::UnAlignedSpatialVector.
	// AD sizeof is probably evaluated at compile time, so just use sizeof for the constant?
	PX_COMPILE_TIME_ASSERT((sizeof(Cm::UnAlignedSpatialVector) / 4) == 6u);

	//we need 6 threads for the velocities - for each link
	const PxU32 threadPerGroup = 6u;
	const PxU32 threadPerArticulation = threadPerGroup * maxLinks;

	const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
	const PxU32 groupIndex = globalThreadIndex / threadPerArticulation;
	const PxU32 localIndex = globalThreadIndex % threadPerArticulation;
	const PxU32 linkIndex = localIndex / threadPerGroup;
	const PxU32 elementIndex = localIndex % threadPerGroup;

	if (groupIndex < nbElements)
	{
		const PxArticulationGPUIndex articulationIndex = gpuIndices[groupIndex];
		const PxgArticulation& articulation = scDesc->articulations[articulationIndex];

		const Cm::UnAlignedSpatialVector* PX_RESTRICT srcData = articulation.linkIncomingJointForces;
		const PxU32 numLinks = articulation.data.numLinks;

		if (linkIndex < numLinks)
		{
			// 6 PxReal per link
			PxReal* PX_RESTRICT dstData = reinterpret_cast<PxReal*>(&data[groupIndex * threadPerArticulation + linkIndex * threadPerGroup]);
			const PxReal* PX_RESTRICT srcDataU = reinterpret_cast<const PxReal*>(&srcData[linkIndex]);

			if (elementIndex < 6u)
			{
				dstData[elementIndex] = srcDataU[elementIndex];
			}
		}
	}
}

// Writes one joint-space state array per articulation from the user's dense input
// buffer and raises the matching dirty flags (once per articulation, on dofIndex 0).
extern "C" __global__ void setArtiDofStates(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	const PxReal* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
	const PxU32 nbElements,
	const PxU32 maxDofs,
	PxArticulationGPUAPIWriteType::Enum type
)
{
	// 1 thread - 1 dof.
	// input has maxDofs * PxReal * nbElements size.
	const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
	const PxU32 groupIndex = globalThreadIndex / maxDofs;
	const PxU32 dofIndex = globalThreadIndex % maxDofs;

	if (groupIndex < nbElements)
	{
		const PxArticulationGPUIndex articulationIndex = gpuIndices[groupIndex];
		PxgArticulation& articulation = scDesc->articulations[articulationIndex];

		PxReal* PX_RESTRICT dstData;
		switch (type)
		{
			case (PxArticulationGPUAPIWriteType::eJOINT_POSITION):
			{
				dstData = articulation.jointPositions;
				if (dofIndex == 0)
					articulation.data.gpuDirtyFlag |= (Dy::ArticulationDirtyFlag::eDIRTY_POSITIONS | Dy::ArticulationDirtyFlag::eNEEDS_KINEMATIC_UPDATE);
				break;
			}
			case (PxArticulationGPUAPIWriteType::eJOINT_VELOCITY):
			{
				dstData = articulation.jointVelocities;
				if (dofIndex == 0)
					articulation.data.gpuDirtyFlag |= (Dy::ArticulationDirtyFlag::eDIRTY_VELOCITIES | Dy::ArticulationDirtyFlag::eNEEDS_KINEMATIC_UPDATE);
				break;
			}
			case (PxArticulationGPUAPIWriteType::eJOINT_FORCE):
			{
				dstData = articulation.jointForce;
				if (dofIndex == 0)
					articulation.data.gpuDirtyFlag |= Dy::ArticulationDirtyFlag::eDIRTY_FORCES;
				break;
			}
			case (PxArticulationGPUAPIWriteType::eJOINT_TARGET_POSITION):
			{
				dstData = articulation.jointTargetPositions;
				if (dofIndex == 0)
					articulation.data.gpuDirtyFlag |= Dy::ArticulationDirtyFlag::eDIRTY_JOINT_TARGET_POS;
				break;
			}
			case (PxArticulationGPUAPIWriteType::eJOINT_TARGET_VELOCITY):
			{
				dstData = articulation.jointTargetVelocities;
				if (dofIndex == 0)
					articulation.data.gpuDirtyFlag |= Dy::ArticulationDirtyFlag::eDIRTY_JOINT_TARGET_VEL;
				break;
			}
			default:
				assert(false);
		}

		const PxU32 artiDofs = articulation.data.numJointDofs;
		const PxReal* PX_RESTRICT srcData = &data[groupIndex * maxDofs];

		if (dofIndex < artiDofs)
		{
			dstData[dofIndex] = srcData[dofIndex];
		}
	}
}

// Sets the root link's body2World from a user-provided actor pose and marks the
// root transform dirty so the kinematic update propagates it to the child links.
extern "C" __global__ void setArtiRootGlobalPoseState(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	const PxTransform* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT index,
	const PxU32 nbElements
)
{
	// 1 thread - 1 transform value.
	// input has PxTransform * nbArticulations size.
	const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;

	// currently 1 thread does the new calc.
	if (globalThreadIndex < nbElements)
	{
		const PxArticulationGPUIndex articulationIndex = index[globalThreadIndex];
		PxgArticulation& articulation = scDesc->articulations[articulationIndex];

		articulation.data.gpuDirtyFlag |= (Dy::ArticulationDirtyFlag::eDIRTY_ROOT_TRANSFORM | Dy::ArticulationDirtyFlag::eNEEDS_KINEMATIC_UPDATE);

		const PxTransform actorPose = data[globalThreadIndex];
		const PxTransform body2Actor = articulation.linkBody2Actors[0]; // TODO AD: this could be outdated - how do we resolve this?

		const PxTransform pose = actorPose * body2Actor;
		articulation.linkBody2Worlds[0] = pose;
	}
}

// Writes the root link's linear or angular velocity (3 reals) into the spatial
// velocity array and marks the root velocities dirty.
extern "C" __global__ void setArtiRootVelocityState(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	const PxVec3* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT index,
	const PxU32 nbElements,
	const PxArticulationGPUAPIWriteType::Enum operation
)
{
	// 1 thread - 1 vec3 element.
	const PxU32 threadPerArticulation = 3u;

	const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
	const PxU32 artiIndex = globalThreadIndex / threadPerArticulation;
	const PxU32 localIndex = globalThreadIndex % threadPerArticulation;

	if (artiIndex < nbElements)
	{
		const PxArticulationGPUIndex articulationIndex = index[artiIndex];
		PxgArticulation& articulation = scDesc->articulations[articulationIndex];

		if (localIndex == 0)
			articulation.data.gpuDirtyFlag |= (Dy::ArticulationDirtyFlag::eDIRTY_ROOT_VELOCITIES | Dy::ArticulationDirtyFlag::eNEEDS_KINEMATIC_UPDATE);

		PxReal* dstData;
		switch (operation)
		{
			case PxArticulationGPUAPIWriteType::eROOT_LINEAR_VELOCITY:
			{
				// linear part (bottom) lives at real offset 3 within the spatial vector.
				dstData = reinterpret_cast<PxReal*>(articulation.motionVelocities) + 3;
				break;
			}
			case PxArticulationGPUAPIWriteType::eROOT_ANGULAR_VELOCITY:
			{
				dstData = reinterpret_cast<PxReal*>(articulation.motionVelocities);
				break;
			}
			default:
				assert(0);
		}

		const PxReal* PX_RESTRICT srcData = reinterpret_cast<const PxReal*>(&data[artiIndex]);

		if (localIndex < threadPerArticulation)
		{
			dstData[localIndex] = srcData[localIndex];
		}
	}
}

// Converts per-link forces into linear accelerations (force * invMass) and stores
// them in the external acceleration buffer.
extern "C" __global__ void setArtiLinkForceState(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	const PxVec3* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT index,
	const PxU32 nbElements,
	const PxU32 maxLinks
)
{
	// 1 thread - 1 float element
	const PxU32 threadPerGroup = 3u; // PxVec3
	const PxU32 threadPerArticulation = threadPerGroup * maxLinks;

	const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
	const PxU32 artiSourceIndex = globalThreadIndex / threadPerArticulation;
	const PxU32 localIndex = globalThreadIndex % threadPerArticulation;
	const PxU32 linkIndex = localIndex / threadPerGroup;
	const PxU32 elementIndex = localIndex % threadPerGroup;

	if (artiSourceIndex < nbElements)
	{
		const PxArticulationGPUIndex articulationIndex = index[artiSourceIndex];
		const PxgArticulation& articulation = scDesc->articulations[articulationIndex];

		const PxgArticulationLinkProp* const PX_RESTRICT linkProps = articulation.linkProps;
		Cm::UnAlignedSpatialVector* externalAccel = articulation.externalAccelerations;

		const PxU32 artiNumLinks = articulation.data.numLinks;
		const PxVec3* PX_RESTRICT srcData = &data[artiSourceIndex * maxLinks];

		if (linkIndex < artiNumLinks)
		{
			if (elementIndex < threadPerGroup)
			{
				const float4 invInertiaXYZ_invMass = linkProps[linkIndex].invInertiaXYZ_invMass;
				const PxReal* forces = reinterpret_cast<const PxReal*>(&srcData[linkIndex]);
				PxReal* dst = reinterpret_cast<PxReal*>(&externalAccel[linkIndex]);
				dst[elementIndex] = forces[elementIndex] * invInertiaXYZ_invMass.w;
			}
		}
	}
}

// Converts per-link world-space torques into world-space angular accelerations
// (rotate into the body frame, scale by the diagonal inverse inertia, rotate back)
// and stores them in the external acceleration buffer.
extern "C" __global__ void setArtiLinkTorqueState(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	const PxVec3* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
	const PxU32 nbElements,
	const PxU32 maxLinks
)
{
	// 1 thread - 1 link.
	const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
	const PxU32 groupIndex = globalThreadIndex / maxLinks;
	const PxU32 linkIndex = globalThreadIndex % maxLinks;

	if (groupIndex < nbElements)
	{
		const PxArticulationGPUIndex articulationIndex = gpuIndices[groupIndex];
		PxgArticulation& articulation = scDesc->articulations[articulationIndex];

		const PxgArticulationLinkProp* const PX_RESTRICT linkProps = articulation.linkProps;
		Cm::UnAlignedSpatialVector* externalAccel = articulation.externalAccelerations;
		const PxTransform* PX_RESTRICT body2Worlds = articulation.linkBody2Worlds;

		const PxVec3* PX_RESTRICT srcData = &data[groupIndex * maxLinks];
		const PxU32 artiNumLinks = articulation.data.numLinks;

		if (linkIndex < artiNumLinks)
		{
			const PxQuat& q = body2Worlds[linkIndex].q;
			const PxVec3& linkTorque = srcData[linkIndex];
			const PxVec3 localLinkTorque = q.rotateInv(linkTorque);

			//turn localLinkTorque into acceleration in local frame
			const PxVec3 invInertia = PxLoad3(linkProps[linkIndex].invInertiaXYZ_invMass);
			const PxVec3 localAccel = invInertia.multiply(localLinkTorque);

			//turn the localAccel into world space
			const PxVec3 worldAccel = q.rotate(localAccel);

			externalAccel[linkIndex].bottom = PxVec3(worldAccel.x, worldAccel.y, worldAccel.z);
		}
	}
}

// AD: might make sense to split this to more threads!
// Writes spatial- or fixed-tendon parameter blocks from the user buffer and raises
// the corresponding dirty flag (once per articulation, on tendonIndex 0).
extern "C" __global__ void setArtiTendonState(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	const void* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
	const PxU32 nbElements,
	const PxU32 maxTendons,
	const PxArticulationGPUAPIWriteType::Enum operation
)
{
	const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
	const PxU32 groupIndex = globalThreadIndex / maxTendons;
	const PxU32 tendonIndex = globalThreadIndex % maxTendons;

	if (groupIndex < nbElements)
	{
		const PxArticulationGPUIndex articulationIndex = gpuIndices[groupIndex];
		PxgArticulation& articulation = scDesc->articulations[articulationIndex];

		switch (operation)
		{
			case (PxArticulationGPUAPIWriteType::eSPATIAL_TENDON):
			{
				const PxU32 numSpatialTendons = articulation.data.numSpatialTendons;
				if (tendonIndex < numSpatialTendons)
				{
					const PxGpuSpatialTendonData* srcData = &(reinterpret_cast<const PxGpuSpatialTendonData*>(data)[groupIndex * maxTendons]);
					articulation.spatialTendonParams[tendonIndex] = srcData[tendonIndex];
				}

				if (tendonIndex == 0)
					articulation.data.gpuDirtyFlag |= Dy::ArticulationDirtyFlag::eDIRTY_SPATIAL_TENDON;
				break;
			}
			case (PxArticulationGPUAPIWriteType::eFIXED_TENDON):
			{
				const PxU32 numFixedTendons = articulation.data.numFixedTendons;
				if (tendonIndex < numFixedTendons)
				{
					const PxGpuFixedTendonData* srcData = &(reinterpret_cast<const PxGpuFixedTendonData*>(data)[groupIndex * maxTendons]);
					articulation.fixedTendonParams[tendonIndex] = srcData[tendonIndex];
				}

				if (tendonIndex == 0)
					articulation.data.gpuDirtyFlag |= Dy::ArticulationDirtyFlag::eDIRTY_FIXED_TENDON;
				break;
			}
			default:
				assert(false);
		}
	}
}

// Reads spatial- or fixed-tendon parameter blocks into the user buffer.
extern "C" __global__ void getArtiTendonState(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	void* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
	const PxU32 nbElements,
	const PxU32 maxTendons,
	const PxArticulationGPUAPIReadType::Enum operation
)
{
	const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
	const PxU32 groupIndex = globalThreadIndex / maxTendons;
	const PxU32 tendonIndex = globalThreadIndex % maxTendons;

	if (groupIndex < nbElements)
	{
		const PxArticulationGPUIndex articulationIndex = gpuIndices[groupIndex];
		const PxgArticulation& articulation = scDesc->articulations[articulationIndex];

		switch (operation)
		{
			case (PxArticulationGPUAPIReadType::eSPATIAL_TENDON):
			{
				const PxU32 numSpatialTendons = articulation.data.numSpatialTendons;
				if (tendonIndex < numSpatialTendons)
				{
					PxGpuSpatialTendonData* dstData = &(reinterpret_cast<PxGpuSpatialTendonData*>(data)[groupIndex * maxTendons]);
					dstData[tendonIndex] = articulation.spatialTendonParams[tendonIndex];
				}
				break;
			}
			case (PxArticulationGPUAPIReadType::eFIXED_TENDON):
			{
				const PxU32 numFixedTendons = articulation.data.numFixedTendons;
				if (tendonIndex < numFixedTendons)
				{
					PxGpuFixedTendonData* dstData = &(reinterpret_cast<PxGpuFixedTendonData*>(data)[groupIndex * maxTendons]);
					dstData[tendonIndex] = articulation.fixedTendonParams[tendonIndex];
				}
				break;
			}
			default:
				assert(false);
		}
	}
}

// Writes spatial-tendon attachment data (one 32-byte record per attachment, copied
// as two uint4s) from the user buffer into the tendon's modifiable element array.
extern "C" __global__ void setArtiSpatialTendonAttachmentState(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	const PxGpuTendonAttachmentData* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
	const PxU32 nbElements,
	const PxU32 maxTendons
)
{
	const PxU32 maxTendonAttachments = scDesc->mMaxAttachmentPerArticulation;
	const PxU32 maxSpatialTendons = scDesc->mMaxSpatialTendonsPerArticulation;

	const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
	const PxU32 groupIndex = globalThreadIndex / maxTendons;
	const PxU32 elementIndex = globalThreadIndex % maxTendons;
	const PxU32 tendonIndex = elementIndex / maxTendonAttachments;
	const PxU32 attachmentIndex = elementIndex % maxTendonAttachments;

	if (groupIndex < nbElements)
	{
		const PxArticulationGPUIndex articulationIndex = gpuIndices[groupIndex];
		PxgArticulation& articulation = scDesc->articulations[articulationIndex];

		if (elementIndex == 0)
			articulation.data.gpuDirtyFlag |= Dy::ArticulationDirtyFlag::eDIRTY_SPATIAL_TENDON_ATTACHMENT;

		const PxU32 numSpatialTendons = articulation.data.numSpatialTendons;
		if (tendonIndex < numSpatialTendons)
		{
			PxgArticulationTendon& tendon = articulation.spatialTendons[tendonIndex];

			// NOTE(review): srcData is indexed by attachmentIndex only (not by
			// tendonIndex * maxTendonAttachments + attachmentIndex) — verify the
			// intended user-buffer layout; preserved as-is from the original.
			const PxGpuTendonAttachmentData* srcData = &data[groupIndex * maxSpatialTendons * maxTendonAttachments];
			PxGpuTendonAttachmentData* attachData = reinterpret_cast<PxGpuTendonAttachmentData*>(tendon.mModElements);

			const PxU32 numTendonAttachments = tendon.mNbElements;
			if (attachmentIndex < numTendonAttachments)
			{
				//PxGpuTendonAttachmentData is 32 byte
				PX_COMPILE_TIME_ASSERT(sizeof(PxGpuTendonAttachmentData) == 32);
				const PxU32 numIteration = sizeof(PxGpuTendonAttachmentData) / sizeof(uint4);

				const uint4* tData = reinterpret_cast<const uint4*>(&srcData[attachmentIndex]);
				uint4* aData = reinterpret_cast<uint4*>(&attachData[attachmentIndex]);

				for (PxU32 i = 0; i < numIteration; ++i)
				{
					aData[i] = tData[i];
				}
			}
		}
	}
}

// Reads spatial-tendon attachment data from the tendon's modifiable element array
// into the user buffer (mirror of setArtiSpatialTendonAttachmentState).
extern "C" __global__ void getArtiSpatialTendonAttachmentState(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	PxGpuTendonAttachmentData* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
	const PxU32 nbElements,
	const PxU32 maxTendons
)
{
	const PxU32 maxTendonAttachments = scDesc->mMaxAttachmentPerArticulation;
	const PxU32 maxSpatialTendons = scDesc->mMaxSpatialTendonsPerArticulation;

	const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
	const PxU32 groupIndex = globalThreadIndex / maxTendons;
	const PxU32 elementIndex = globalThreadIndex % maxTendons;
	const PxU32 tendonIndex = elementIndex / maxTendonAttachments;
	const PxU32 attachmentIndex = elementIndex % maxTendonAttachments;

	if (groupIndex < nbElements)
	{
		const PxArticulationGPUIndex articulationIndex = gpuIndices[groupIndex];
		const PxgArticulation& articulation = scDesc->articulations[articulationIndex];

		const PxU32 numSpatialTendons = articulation.data.numSpatialTendons;
		if (tendonIndex < numSpatialTendons)
		{
			const PxgArticulationTendon& tendon = articulation.spatialTendons[tendonIndex];

			PxGpuTendonAttachmentData* dstData = &data[groupIndex * maxSpatialTendons * maxTendonAttachments];
			const PxGpuTendonAttachmentData* attachData = reinterpret_cast<const PxGpuTendonAttachmentData*>(tendon.mModElements);

			const PxU32 numTendonAttachments = tendon.mNbElements;
			if (attachmentIndex < numTendonAttachments)
			{
				//PxGpuTendonAttachmentData is 32 byte
				PX_COMPILE_TIME_ASSERT(sizeof(PxGpuTendonAttachmentData) == 32);
				const PxU32 numIteration = sizeof(PxGpuTendonAttachmentData) / sizeof(uint4);

				uint4* tData = reinterpret_cast<uint4*>(&dstData[attachmentIndex]);
				const uint4* aData = reinterpret_cast<const uint4*>(&attachData[attachmentIndex]);

				for (PxU32 i = 0; i < numIteration; ++i)
				{
					tData[i] = aData[i];
				}
			}
		}
	}
}

// Writes fixed-tendon joint coefficient data (one 16-byte record per tendon joint,
// copied as a single uint4) from the user buffer into the tendon's modifiable
// element array, and raises the dirty flag once per articulation.
extern "C" __global__ void setArtiFixedTendonJointState(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	const PxGpuTendonJointCoefficientData* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
	const PxU32 nbElements
)
{
	const PxU32 maxFixedTendons = scDesc->mMaxFixedTendonsPerArticulation;
	const PxU32 maxTendonJoints = scDesc->mMaxTendonJointPerArticulation;
	const PxU32 maxTendons = maxFixedTendons * maxTendonJoints;

	const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
	const PxU32 groupIndex = globalThreadIndex / maxTendons;
	const PxU32 elementIndex = globalThreadIndex % maxTendons;
	const PxU32 tendonIndex = elementIndex / maxTendonJoints;
	const PxU32 tendonJointIndex = elementIndex % maxTendonJoints;

	if (groupIndex < nbElements)
	{
		const PxArticulationGPUIndex articulationIndex = gpuIndices[groupIndex];
		PxgArticulation& articulation = scDesc->articulations[articulationIndex];

		if (elementIndex == 0)
			articulation.data.gpuDirtyFlag |= Dy::ArticulationDirtyFlag::eDIRTY_FIXED_TENDON_JOINT;

		PxgArticulationTendon* fixedTendons = articulation.fixedTendons;
		const PxU32 numFixedTendonJoints = articulation.data.numFixedTendons;
		if (tendonIndex < numFixedTendonJoints)
		{
			PxgArticulationTendon& tendon = fixedTendons[tendonIndex];

			// NOTE(review): srcData is indexed by tendonJointIndex only (tendonIndex is
			// not folded into the offset) — verify the intended user-buffer layout;
			// preserved as-is from the original.
			const PxGpuTendonJointCoefficientData* srcData = &data[groupIndex * maxFixedTendons * maxTendonJoints];
			PxGpuTendonJointCoefficientData* coefficientData = reinterpret_cast<PxGpuTendonJointCoefficientData*>(tendon.mModElements);

			const PxU32 numTendonJoints = tendon.mNbElements;
			if (tendonJointIndex < numTendonJoints)
			{
				//PxGpuTendonJointCoefficientData is 16 byte
				PX_COMPILE_TIME_ASSERT(sizeof(PxGpuTendonJointCoefficientData) == 16);
				const uint4 tData = reinterpret_cast<const uint4&>(srcData[tendonJointIndex]);
				uint4& coefData = reinterpret_cast<uint4&>(coefficientData[tendonJointIndex]);
				coefData = tData;
			}
		}
	}
}

// Reads fixed-tendon joint coefficient data into the user buffer (mirror of
// setArtiFixedTendonJointState). NOTE: definition continues beyond this chunk.
extern "C" __global__ void getArtiFixedTendonJointState(
	const PxgArticulationCoreDesc* PX_RESTRICT scDesc,
	PxGpuTendonJointCoefficientData* PX_RESTRICT data,
	const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
	const PxU32 nbElements
)
{
	const PxU32 maxFixedTendons = scDesc->mMaxFixedTendonsPerArticulation;
	const PxU32 maxTendonJoints = scDesc->mMaxTendonJointPerArticulation;
	const PxU32 maxTendons = maxFixedTendons * maxTendonJoints;

	const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
	const PxU32 groupIndex = globalThreadIndex / maxTendons;
	const PxU32 elementIndex = globalThreadIndex % maxTendons;
	const PxU32 tendonIndex = elementIndex / maxTendonJoints;
	const PxU32 tendonJointIndex = elementIndex % maxTendonJoints;

	if (groupIndex < nbElements)
	{
		const PxArticulationGPUIndex articulationIndex = gpuIndices[groupIndex];
		const PxgArticulation& articulation = scDesc->articulations[articulationIndex];

		const PxgArticulationTendon* fixedTendons = articulation.fixedTendons;
		const PxU32 numFixedTendonJoints = articulation.data.numFixedTendons;
		if (tendonIndex < numFixedTendonJoints)
		{
			const PxgArticulationTendon& tendon = fixedTendons[tendonIndex];
			PxGpuTendonJointCoefficientData* dstData = &data[groupIndex * maxFixedTendons * maxTendonJoints];
			const PxGpuTendonJointCoefficientData* coefficientData =
reinterpret_cast(tendon.mModElements); const PxU32 numTendonJoints = tendon.mNbElements; if (tendonJointIndex < numTendonJoints) { //PxGpuTendonJointCoefficientData is 16 byte PX_COMPILE_TIME_ASSERT(sizeof(PxGpuTendonJointCoefficientData) == 16); uint4& tData = reinterpret_cast(dstData[tendonJointIndex]); const uint4& coefData = reinterpret_cast(coefficientData[tendonJointIndex]); tData = coefData; } } } }