// Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved. // Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. // Copyright (c) 2001-2004 NovodeX AG. All rights reserved. #include "PxgArticulationLink.h" #include "DyArticulationJointCore.h" #include "DyFeatherstoneArticulationUtils.h" #include "DyFeatherstoneArticulationJointData.h" #include "DyArticulationCore.h" #include "PxgArticulationCoreKernelIndices.h" #include "PxgBodySim.h" #include "reduction.cuh" #include "copy.cuh" #include "articulationDynamic.cuh" #include "articulationImpulseResponse.cuh" #include "MemoryAllocator.cuh" #include "PxSpatialMatrix.h" #include "foundation/PxMath.h" #include "DyFeatherstoneArticulation.h" #include "PxConstraintDesc.h" #include "DyConstraintPrep.h" #include "PxsRigidBody.h" #include using namespace physx; using namespace Dy; extern "C" __host__ void initArticulationKernels4() {} // VR: Old version for reference #if 0 // This is a simple port of FeatherstoneArticulation::getDenseJacobian. // We process one articulation per thread, which is not very efficient. extern "C" __global__ void computeArtiDenseJacobians( const PxIndexDataPair* indexMap, const PxU32 nbIndices, const PxgArticulation* articulations) { const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x; if (globalThreadIndex < nbIndices) { PxU32 artiIndex = indexMap[globalThreadIndex].index; const PxgArticulation& arti = articulations[artiIndex]; PxU32 linkCount = arti.data.numLinks; //PxU32 jointCount = linkCount - 1; PxU32 dofCount = arti.data.numJointDofs; bool fixBase = arti.data.flags & PxArticulationFlag::eFIX_BASE; PxU32 nCols = (fixBase ? 0 : 6) + dofCount; //PxU32 nRows = (fixBase ? 0 : 6) + jointCount * 6; //printf("~!~!~! processing arti %u (%u): %u links, %u dofs, fixed? %u, %u rows, %u cols\n", // globalThreadIndex, artiIndex, linkCount, dofCount, PxU32(fixBase), nRows, nCols); float* dataPtr = static_cast(indexMap[globalThreadIndex].data); #define jacobian(row, col) dataPtr[nCols * (row) + (col)] PxU32 destRow = 0; PxU32 destCol = 0; if (!fixBase) { jacobian(0, 0) = 1.0f; jacobian(0, 1) = 0.0f; jacobian(0, 2) = 0.0f; jacobian(0, 3) = 0.0f; jacobian(0, 4) = 0.0f; jacobian(0, 5) = 0.0f; jacobian(1, 0) = 0.0f; jacobian(1, 1) = 1.0f; jacobian(1, 2) = 0.0f; jacobian(1, 3) = 0.0f; jacobian(1, 4) = 0.0f; jacobian(1, 5) = 0.0f; jacobian(2, 0) = 0.0f; jacobian(2, 1) = 0.0f; jacobian(2, 2) = 1.0f; jacobian(2, 3) = 0.0f; jacobian(2, 4) = 0.0f; jacobian(2, 5) = 0.0f; jacobian(3, 0) = 0.0f; jacobian(3, 1) = 0.0f; jacobian(3, 2) = 0.0f; jacobian(3, 3) = 1.0f; jacobian(3, 4) = 0.0f; jacobian(3, 5) = 0.0f; jacobian(4, 0) = 0.0f; jacobian(4, 1) = 0.0f; jacobian(4, 2) = 0.0f; jacobian(4, 3) = 0.0f; jacobian(4, 4) = 1.0f; jacobian(4, 5) = 0.0f; jacobian(5, 0) = 0.0f; jacobian(5, 1) = 0.0f; jacobian(5, 2) = 0.0f; jacobian(5, 3) = 0.0f; jacobian(5, 4) = 0.0f; jacobian(5, 5) = 1.0f; destRow += 6; destCol += 6; } for (PxU32 linkID = 1; linkID < linkCount; ++linkID)//each iteration of this writes 6 rows in the matrix { //const PxgArticulationLink& link = arti.links[linkID]; const PxTransform& body2World = arti.linkBody2Worlds[linkID]; const ArticulationJointCoreData& jointDatum = arti.jointData[linkID]; const PxU32 parentLinkID = arti.parents[linkID]; if (parentLinkID || !fixBase) { // VR: It arti.jointData[0] isn't initialized //const PxU32 parentsFirstDestCol = arti.jointData[parentLinkID].jointOffset + (fixBase ? 0 : 6); const PxU32 parentsLastDestCol = parentLinkID ? arti.jointData[parentLinkID].jointOffset + (fixBase ? 0 : 6) + arti.jointData[parentLinkID].dof : 6; // VR: For parentLinkID == 0, it's two unsigned it overflows, though the result is still correct. const PxU32 parentsDestRow = (fixBase ? 0 : 6) + (parentLinkID - 1) * 6; // no commonInit, so need to compute the rw vector here const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLinkID]; const PxVec3 rw = body2World.p - parentBody2World.p; // VR: parentsLastDestCol is rather column count than the last column index //for (PxU32 col = 0; col <= parentsLastDestCol; col++) for (PxU32 col = 0; col < parentsLastDestCol; col++) { //copy downward the 6 cols from parent const PxVec3 parentAng(jacobian(parentsDestRow + 3, col), jacobian(parentsDestRow + 4, col), jacobian(parentsDestRow + 5, col)); const PxVec3 parentAngxRw = parentAng.cross(rw); jacobian(destRow + 0, col) = jacobian(parentsDestRow + 0, col) + parentAngxRw.x; jacobian(destRow + 1, col) = jacobian(parentsDestRow + 1, col) + parentAngxRw.y; jacobian(destRow + 2, col) = jacobian(parentsDestRow + 2, col) + parentAngxRw.z; jacobian(destRow + 3, col) = parentAng.x; jacobian(destRow + 4, col) = parentAng.y; jacobian(destRow + 5, col) = parentAng.z; } // VR: parentsLastDestCol is rather column count than the last column index //for (PxU32 col = parentsLastDestCol + 1; col < destCol; col++) for (PxU32 col = parentsLastDestCol; col < destCol; col++) { //fill with zeros. jacobian(destRow + 0, col) = 0.0f; jacobian(destRow + 1, col) = 0.0f; jacobian(destRow + 2, col) = 0.0f; jacobian(destRow + 3, col) = 0.0f; jacobian(destRow + 4, col) = 0.0f; jacobian(destRow + 5, col) = 0.0f; } } //diagonal block: for (PxU32 ind = 0; ind < jointDatum.dof; ++ind) { const Cm::UnAlignedSpatialVector& v = arti.motionMatrix[linkID][ind]; const PxVec3 ang = body2World.rotate(v.top); const PxVec3 lin = body2World.rotate(v.bottom); jacobian(destRow + 0, destCol) = lin.x; jacobian(destRow + 1, destCol) = lin.y; jacobian(destRow + 2, destCol) = lin.z; jacobian(destRow + 3, destCol) = ang.x; jacobian(destRow + 4, destCol) = ang.y; jacobian(destRow + 5, destCol) = ang.z; destCol++; } //above diagonal block: always zero for (PxU32 col = destCol; col < nCols; col++) { jacobian(destRow + 0, col) = 0.0f; jacobian(destRow + 1, col) = 0.0f; jacobian(destRow + 2, col) = 0.0f; jacobian(destRow + 3, col) = 0.0f; jacobian(destRow + 4, col) = 0.0f; jacobian(destRow + 5, col) = 0.0f; } destRow += 6; } #undef jacobian } } #endif extern "C" __global__ void computeArtiDenseJacobians( float* PX_RESTRICT data, const PxArticulationGPUIndex* PX_RESTRICT gpuIndices, const PxU32 nbIndices, const PxU32 maxLinks, const PxU32 maxDofs, const PxgArticulation* PX_RESTRICT articulations) { const PxU32 jobIndex = threadIdx.y + blockIdx.x * blockDim.y; const PxU32 threadIndex = threadIdx.x; if (jobIndex < nbIndices) { const PxArticulationGPUIndex artiIndex = gpuIndices[jobIndex]; const PxgArticulation& arti = articulations[artiIndex]; const PxU32 linkCount = arti.data.numLinks; const PxU32 dofCount = arti.data.numJointDofs; const bool fixedBase = arti.data.flags & PxArticulationFlag::eFIX_BASE; const PxU32 baseDofs = fixedBase ? 0 : 6; const PxU32 colCount = baseDofs + dofCount; const PxU32 maxCols = 6 + maxDofs; const PxU32 maxRows = 6 + (maxLinks - 1) * 6; float* PX_RESTRICT jacobian = &data[jobIndex * maxCols * maxRows]; if (!fixedBase) { for (PxU32 i = threadIndex; i < 6 * colCount; i += WARP_SIZE) jacobian[i] = (i / colCount == i % colCount) ? 1.0f : 0.0f; __syncwarp(); } for (PxU32 link = 1; link < linkCount; ++link) { const PxU32 linkOffset = arti.jointData[link].jointOffset + baseDofs; const PxU32 linkDofs = arti.jointData[link].nbDof; const PxU32 linkRow0 = (link - 1) * 6 + baseDofs; const PxTransform& body2World = arti.linkBody2Worlds[link]; const PxU32 parentLink = arti.parents[link]; const PxU32 parentDofs = parentLink ? arti.jointData[parentLink].jointOffset + arti.jointData[parentLink].nbDof + baseDofs : baseDofs; for (PxU32 dof = threadIndex; dof < colCount; dof += WARP_SIZE) { PxVec3 linkLin(0), linkAng(0); if (dof < parentDofs) { const PxU32 parentRow0 = parentLink ? (parentLink - 1) * 6 + baseDofs : 0; const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink]; const PxVec3 rw = body2World.p - parentBody2World.p; const PxVec3 parentLin(jacobian[(parentRow0 + 0) * colCount + dof], jacobian[(parentRow0 + 1) * colCount + dof], jacobian[(parentRow0 + 2) * colCount + dof]); const PxVec3 parentAng(jacobian[(parentRow0 + 3) * colCount + dof], jacobian[(parentRow0 + 4) * colCount + dof], jacobian[(parentRow0 + 5) * colCount + dof]); const PxVec3 parentAngxRw = parentAng.cross(rw); linkLin = parentLin + parentAngxRw; linkAng = parentAng; } else if (dof >= linkOffset && dof < linkOffset + linkDofs) { const Cm::UnAlignedSpatialVector& v = arti.motionMatrix[link][dof - linkOffset]; linkLin = body2World.rotate(v.bottom); linkAng = body2World.rotate(v.top); } __syncwarp(); jacobian[(linkRow0 + 0) * colCount + dof] = linkLin.x; jacobian[(linkRow0 + 1) * colCount + dof] = linkLin.y; jacobian[(linkRow0 + 2) * colCount + dof] = linkLin.z; jacobian[(linkRow0 + 3) * colCount + dof] = linkAng.x; jacobian[(linkRow0 + 4) * colCount + dof] = linkAng.y; jacobian[(linkRow0 + 5) * colCount + dof] = linkAng.z; } } } } __device__ static void translateInertia(const PxMat33& sTod, Dy::SpatialMatrix& inertia) { const PxMat33 dTos = sTod.getTranspose(); PxMat33 bl = sTod * inertia.topLeft + inertia.bottomLeft; PxMat33 br = sTod * inertia.topRight + inertia.getBottomRight(); inertia.topLeft = inertia.topLeft + inertia.topRight * dTos; inertia.bottomLeft = bl + br * dTos; //aligned inertia - make it symmetrical! OPTIONAL!!!! inertia.bottomLeft = (inertia.bottomLeft + inertia.bottomLeft.getTranspose()) * 0.5f; } // This is a simple port of FeatherstoneArticulation::getGeneralizedMassMatrixCRB. // I process one articulation per warp (32 threads). extern "C" __global__ void computeArtiMassMatrices( const PxU32 nbIndices, float* PX_RESTRICT data, const PxArticulationGPUIndex* PX_RESTRICT gpuIndices, const PxU32 maxDofs, const bool rootMotion, const PxgArticulation* PX_RESTRICT articulations) { const PxU32 jobIndex = threadIdx.y + blockIdx.x * blockDim.y; const PxU32 threadIndex = threadIdx.x; if (jobIndex < nbIndices) { const PxU32 artiIndex = gpuIndices[jobIndex]; const PxgArticulation& arti = articulations[artiIndex]; // I use this as a tmp buffer to store world space spatial inertias of all links Dy::SpatialMatrix* PX_RESTRICT spatialInertias = reinterpret_cast(arti.worldSpatialArticulatedInertia); // ??? // I use this as a tmp buffer to store world space motiom vectors multiplied by their links inertias Cm::UnAlignedSpatialVector* PX_RESTRICT worldMotionMatrices = reinterpret_cast(arti.worldMotionMatrix); // ??? const PxU32 linkCount = arti.data.numLinks; const PxU32 dofCount = arti.data.numJointDofs; const bool fixBase = arti.data.flags & PxArticulationFlag::eFIX_BASE; const PxU32 rootDof = (rootMotion && !fixBase) ? 6 : 0; // Add the DoF of the root in the floating base case const PxU32 bufferDof = rootMotion ? 6 : 0; const PxU32 matSize = dofCount + rootDof; float* massMatrix = &data[jobIndex * (maxDofs + bufferDof) * (maxDofs + bufferDof)]; for (int i = threadIndex; i < matSize * matSize; i += WARP_SIZE) massMatrix[i] = 0; __syncwarp(); for (PxU32 link = threadIndex; link < linkCount; link += WARP_SIZE) { const float4 invIM = arti.linkProps[link].invInertiaXYZ_invMass; const PxReal mass = invIM.w == 0.f ? 0.f : (1.f / invIM.w); const PxVec3 localInertia = PxVec3(invIM.x == 0.f ? 0.f : (1.f / invIM.x), invIM.y == 0.f ? 0.f : (1.f / invIM.y), invIM.z == 0.f ? 0.f : (1.f / invIM.z)); Dy::SpatialMatrix spatialInertia; spatialInertia.topLeft = PxMat33(PxZero); spatialInertia.topRight = PxMat33::createDiagonal(PxVec3(mass)); spatialInertia.padding = 0; // VR: I use this to store the order to compute combined inertias Cm::transformInertiaTensor(localInertia, PxMat33(arti.linkBody2Worlds[link].q), spatialInertia.bottomLeft); spatialInertias[link] = spatialInertia; } __syncwarp(); #if 1 // VR: This optimization allows computing some of combined inertias in parallel. It's a bit faster for humanoids, but not much. PxU32 maxStep = 0; if (threadIndex == 0) { for (PxU32 link = linkCount - 1; link > 0; --link) { PxU32& s = spatialInertias[link].padding; const PxU32 parentLink = arti.parents[link]; PxU32& ps = spatialInertias[parentLink].padding; if (s < ps) s = ps; ps = s + 1; if (maxStep < ps) maxStep = ps; } } __syncwarp(); maxStep = __shfl_sync(0xffffffff, maxStep, 0); for (PxU32 step = 0; step <= maxStep; ++step) { for (PxU32 link = threadIndex + 1; link < linkCount; link += WARP_SIZE) { const PxU32& s = spatialInertias[link].padding; if (s == step) { const PxTransform& body2World = arti.linkBody2Worlds[link]; const PxU32 parentLink = arti.parents[link]; const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink]; const PxVec3 rw = body2World.p - parentBody2World.p; PxMat33 skewSymmetric(PxVec3(0, rw.z, -rw.y), PxVec3(-rw.z, 0, rw.x), PxVec3(rw.y, -rw.x, 0)); Dy::SpatialMatrix parentSpaceSpatialInertia = spatialInertias[link]; translateInertia(skewSymmetric, parentSpaceSpatialInertia); spatialInertias[parentLink] += parentSpaceSpatialInertia; } } __syncwarp(); } #else if (threadIndex == 0) // @@@ ??? Can I optimize it somehow? { for (PxU32 link = linkCount - 1; link > 0; --link) { const PxTransform& body2World = arti.linkBody2Worlds[link]; const PxU32 parentLink = arti.parents[link]; const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink]; const PxVec3 rw = body2World.p - parentBody2World.p; PxMat33 skewSymmetric(PxVec3(0, rw.z, -rw.y), PxVec3(-rw.z, 0, rw.x), PxVec3(rw.y, -rw.x, 0)); Dy::SpatialMatrix parentSpaceSpatialInertia = spatialInertias[link]; translateInertia(skewSymmetric, parentSpaceSpatialInertia); spatialInertias[parentLink] += parentSpaceSpatialInertia; } } __syncwarp(); #endif // Loop is for the case where there are more than 32 links in one articulation for (PxU32 link = threadIndex + 1; link < linkCount; link += WARP_SIZE) { const PxTransform& body2World = arti.linkBody2Worlds[link]; const ArticulationJointCoreData& jointData = arti.jointData[link]; const Dy::SpatialMatrix& spatialInertia = spatialInertias[link]; Cm::UnAlignedSpatialVector worldMotionMatrix[3]; Cm::UnAlignedSpatialVector* spatialInertia_worldMotionMatrix = &worldMotionMatrices[jointData.jointOffset]; for (PxU32 dof = 0; dof < jointData.nbDof; ++dof) { const Cm::UnAlignedSpatialVector& m = arti.motionMatrix[link][dof]; worldMotionMatrix[dof] = m.rotate(body2World); spatialInertia_worldMotionMatrix[dof] = spatialInertia * worldMotionMatrix[dof]; } // A unit joint acceleration is assumed for each dof of the considered link // This calculates the diagonal terms using the spatial articulated inertia for obtaining // the force applied to the considered dof due to the applied unit joint acceleration for (PxU32 dof0 = 0; dof0 < jointData.nbDof; ++dof0) { const Cm::UnAlignedSpatialVector& Im0 = spatialInertia_worldMotionMatrix[dof0]; const PxU32 row = jointData.jointOffset + dof0 + rootDof; for (PxU32 dof1 = 0; dof1 < jointData.nbDof; ++dof1) { const Cm::UnAlignedSpatialVector& m1 = worldMotionMatrix[dof1]; const PxU32 col = jointData.jointOffset + dof1 + rootDof; massMatrix[row * matSize + col] = m1.innerProduct(Im0); } } // The calculated force is then propagated inward to calculate the above diagonal terms of the mass matrix // This is sufficient as the mass matrix is symmetric PxTransform childBody2World = body2World; PxU32 parentLink = arti.parents[link]; while (true) { const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink]; // The zero acceleration force is propagated inward // This is correct because all links inward the considered link have no joint acceleration const PxVec3 rw = childBody2World.p - parentBody2World.p; for (PxU32 dof = 0; dof < jointData.nbDof; ++dof) { spatialInertia_worldMotionMatrix[dof] = FeatherstoneArticulation::translateSpatialVector(rw, spatialInertia_worldMotionMatrix[dof]); } childBody2World = parentBody2World; if (parentLink == 0) { // For floating base, calculate the resulting force on the root if (!fixBase && rootMotion) { for (PxU32 dof0 = 0; dof0 < jointData.nbDof; ++dof0) { const Cm::UnAlignedSpatialVector& Im0 = spatialInertia_worldMotionMatrix[dof0]; const PxU32 col = jointData.jointOffset + dof0 + rootDof; for (PxU32 row = 0; row < 6; ++row) { massMatrix[col * matSize + row] += Im0[row]; massMatrix[col + row * matSize] += Im0[row]; } } } break; } const ArticulationJointCoreData& parentJointData = arti.jointData[parentLink]; Cm::UnAlignedSpatialVector parentWorldMotionMatrix[3]; for (PxU32 dof = 0; dof < parentJointData.nbDof; ++dof) { const Cm::UnAlignedSpatialVector& m = arti.motionMatrix[parentLink][dof]; parentWorldMotionMatrix[dof].top = parentBody2World.rotate(m.top); parentWorldMotionMatrix[dof].bottom = parentBody2World.rotate(m.bottom); } // The joint force is calculated from the zero acceleration force for (PxU32 dof0 = 0; dof0 < jointData.nbDof; ++dof0) { const Cm::UnAlignedSpatialVector& Im0 = spatialInertia_worldMotionMatrix[dof0]; const PxU32 row = jointData.jointOffset + dof0 + rootDof; for (PxU32 dof1 = 0; dof1 < parentJointData.nbDof; ++dof1) { const Cm::UnAlignedSpatialVector& m1 = parentWorldMotionMatrix[dof1]; const PxU32 col = parentJointData.jointOffset + dof1 + rootDof; float m = m1.innerProduct(Im0); massMatrix[row * matSize + col] = m; massMatrix[row + col * matSize] = m; } } parentLink = arti.parents[parentLink]; } } // Adding the spatial articulated inertia of the root if (threadIndex == 0 && !fixBase && rootMotion) { // Note that the spatial articulated inertia assumes that the root angular acceleration comes first, // while the mass matrix assumes that the root linear acceleration comes first // We have therefore to invert the angular and linear component of the spatial articulated inertia // This also ensures that the mass matrix is symmetric const PxReal* rootSpatialInertia = reinterpret_cast(&spatialInertias[0]); for (PxU32 row = 0; row < 6; ++row) { const PxU32 rowSpatialInertia = (row < 3) ? row : row - 3; // Convert to the index of a 3 x 3 matrix // Only process elements above the diagonal as the matrix is symmetric for (PxU32 col = row; col < 6; ++col) { // This offset is due to how the spatial matrix is indexed and how the linear amd angular components // of the acceleration should be inverted, the index is as follows // 0 3 6 9 12 15 9 12 15 0 3 6 // 1 4 7 10 13 16 10 23 16 1 4 7 // 2 5 8 11 14 17 inversion linear/angular 11 24 17 2 5 8 // 18 21 24 0 1 2 ------------------------> 0 1 2 18 21 24 // 19 22 25 3 4 5 3 4 5 19 22 25 // 20 23 26 6 7 8 6 7 8 20 23 26 const PxU32 offset = (row > 2) ? 18 : (col < 3) * 9; const PxU32 colSpatialInertia = (col < 3) ? col : col - 3; // Convert to the index of a 3 x 3 matrix const PxU32 index = offset + colSpatialInertia * 3 + rowSpatialInertia; massMatrix[row * matSize + col] = rootSpatialInertia[index]; massMatrix[col * matSize + row] = rootSpatialInertia[index]; } } } else if (!fixBase && !rootMotion) { __syncwarp(); Dy::SpatialMatrix baseInvI = spatialInertias[0].invertInertia(); for (PxU32 row = threadIndex; row < dofCount; row += WARP_SIZE) { const Cm::UnAlignedSpatialVector& m0 = worldMotionMatrices[row]; for (PxU32 col = 0; col < dofCount; ++col) { const Cm::UnAlignedSpatialVector& m1 = worldMotionMatrices[col]; massMatrix[row * dofCount + col] -= m0.innerProduct(baseInvI * m1); } } } __syncwarp(); } } __device__ static Cm::UnAlignedSpatialVector translateSpatialVector(const PxVec3& offset, const Cm::UnAlignedSpatialVector& vec) { return Cm::UnAlignedSpatialVector(vec.top, vec.bottom + offset.cross(vec.top)); } // This is an optimized port of FeatherstoneArticulation::getGeneralizedGravityForce. // We process one articulation per warp (32 threads). extern "C" __global__ void computeArtiGravityForces( const PxU32 nbIndices, float* PX_RESTRICT data, const PxArticulationGPUIndex* PX_RESTRICT gpuIndices, const PxU32 maxDofs, const bool rootMotion, const PxgArticulation* articulations, const PxVec3 gravity) { const PxU32 jobIndex = threadIdx.y + blockIdx.x * blockDim.y; const PxU32 threadIndex = threadIdx.x; if (jobIndex < nbIndices) { const PxU32 artiIndex = gpuIndices[jobIndex]; const PxgArticulation& arti = articulations[artiIndex]; const PxU32 linkCount = arti.data.numLinks; const PxU32 dofCount = arti.data.numJointDofs; const bool fixBase = arti.data.flags & PxArticulationFlag::eFIX_BASE; const PxU32 rootDof = (rootMotion && !fixBase) ? 6 : 0; // Add the DoF of the root in the floating base case const PxU32 bufferDof = rootMotion ? 6 : 0; const PxVec3 tGravity = -gravity; float* PX_RESTRICT gravityCompensationForces = &data[jobIndex * (maxDofs + bufferDof)]; if (rootMotion || fixBase) { // I use this as a tmp buffer to store ZAForce vectors of all links Cm::UnAlignedSpatialVector* PX_RESTRICT zAForces = reinterpret_cast(arti.zAForces); // ??? for (PxU32 link = threadIndex; link < linkCount; link += WARP_SIZE) { const float4 invIM = arti.linkProps[link].invInertiaXYZ_invMass; const PxReal mass = invIM.w == 0.f ? 0.f : (1.f / invIM.w); zAForces[link].top = tGravity * mass; zAForces[link].bottom = PxVec3(0); } __syncwarp(); for (PxU32 link = (linkCount - 1); link > 0; --link) { const PxTransform& body2World = arti.linkBody2Worlds[link]; const PxU32 parentLink = arti.parents[link]; const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink]; if (threadIndex == 0) zAForces[parentLink] += translateSpatialVector(body2World.p - parentBody2World.p, zAForces[link]); const ArticulationJointCoreData& jointData = arti.jointData[link]; PxReal* force = &gravityCompensationForces[jointData.jointOffset]; if (threadIndex < jointData.nbDof) force[threadIndex + rootDof] = link; // I'll just store dof's link index here and use it in the next loop } __syncwarp(); for (PxU32 dof = threadIndex; dof < dofCount; dof += WARP_SIZE) { PxU32 link = (PxU32)gravityCompensationForces[dof + rootDof]; const ArticulationJointCoreData& jointData = arti.jointData[link]; const PxTransform& body2World = arti.linkBody2Worlds[link]; Cm::UnAlignedSpatialVector dofMotion = arti.motionMatrix[link][dof - jointData.jointOffset].rotate(body2World); gravityCompensationForces[dof + rootDof] = dofMotion.innerProduct(zAForces[link]); } // Add root DoFs contribution if (threadIndex == 0 && rootDof == 6) { gravityCompensationForces[0] = zAForces[0].top.x; gravityCompensationForces[1] = zAForces[0].top.y; gravityCompensationForces[2] = zAForces[0].top.z; gravityCompensationForces[3] = zAForces[0].bottom.x; gravityCompensationForces[4] = zAForces[0].bottom.y; gravityCompensationForces[5] = zAForces[0].bottom.z; } __syncwarp(); } else { // I use this as a tmp buffer to store world space spatial inertias of all links Dy::SpatialMatrix* PX_RESTRICT spatialInertias = reinterpret_cast(arti.worldSpatialArticulatedInertia); // ??? // I use this as a tmp buffer to store motion velocities of all links ??? Or not. The velocities seem valid. Maybe I don't need to compute them ??? Cm::UnAlignedSpatialVector* PX_RESTRICT motionVelocities = reinterpret_cast(arti.motionVelocities); // ??? // I use this as a tmp buffer to store motion accelerations of all links Cm::UnAlignedSpatialVector* PX_RESTRICT motionAccelerations = reinterpret_cast(arti.motionAccelerations); // ??? // I use this as a tmp buffer to store ZAForce vectors of all links Cm::UnAlignedSpatialVector* PX_RESTRICT zAForces = reinterpret_cast(arti.zAForces); // ??? // Compute links inertias for (PxU32 link = threadIndex; link < linkCount; link += WARP_SIZE) { const float4 invIM = arti.linkProps[link].invInertiaXYZ_invMass; const PxReal mass = invIM.w == 0.f ? 0.f : (1.f / invIM.w); const PxVec3 localInertia = PxVec3(invIM.x == 0.f ? 0.f : (1.f / invIM.x), invIM.y == 0.f ? 0.f : (1.f / invIM.y), invIM.z == 0.f ? 0.f : (1.f / invIM.z)); Dy::SpatialMatrix spatialInertia; spatialInertia.topLeft = PxMat33(PxZero); spatialInertia.topRight = PxMat33::createDiagonal(PxVec3(mass)); Cm::transformInertiaTensor(localInertia, PxMat33(arti.linkBody2Worlds[link].q), spatialInertia.bottomLeft); spatialInertias[link] = spatialInertia; } __syncwarp(); if (threadIndex == 0) // @@@ ??? Can I optimize it somehow? { for (PxU32 link = linkCount - 1; link > 0; --link) { const PxTransform& body2World = arti.linkBody2Worlds[link]; const PxU32 parentLink = arti.parents[link]; const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink]; const PxVec3 rw = body2World.p - parentBody2World.p; PxMat33 skewSymmetric(PxVec3(0, rw.z, -rw.y), PxVec3(-rw.z, 0, rw.x), PxVec3(rw.y, -rw.x, 0)); Dy::SpatialMatrix parentSpaceSpatialInertia = spatialInertias[link]; translateInertia(skewSymmetric, parentSpaceSpatialInertia); spatialInertias[parentLink] += parentSpaceSpatialInertia; } } __syncwarp(); // Compute motion velocities ??? Should I even do this? if (threadIndex == 0) { for (PxU32 link = 1; link < linkCount; ++link) { const PxTransform& body2World = arti.linkBody2Worlds[link]; const PxU32 parentLink = arti.parents[link]; const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink]; motionVelocities[link] = translateSpatialVector(parentBody2World.p - body2World.p, motionVelocities[parentLink]); } } __syncwarp(); // Init za-forces for (PxU32 link = threadIndex; link < linkCount; link += WARP_SIZE) { const float4 invIM = arti.linkProps[link].invInertiaXYZ_invMass; const PxReal mass = invIM.w == 0.f ? 0.f : (1.f / invIM.w); const PxMat33& I = spatialInertias[link].bottomLeft; const PxVec3& vA = motionVelocities[link].top; zAForces[link].top = -tGravity * mass; // ??? @@@ Yea, fix-base version has no minus here. zAForces[link].bottom = vA.cross(I * vA); } __syncwarp(); for (PxU32 link = (linkCount - 1); link > 0; --link) { const PxTransform& body2World = arti.linkBody2Worlds[link]; const PxU32 parentLink = arti.parents[link]; const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink]; if (threadIndex == 0) zAForces[parentLink] += translateSpatialVector(body2World.p - parentBody2World.p, zAForces[link]); } __syncwarp(); // Compute motion acceleration if (threadIndex == 0) { Dy::SpatialMatrix invInertia = spatialInertias[0].invertInertia(); motionAccelerations[0] = -(invInertia * zAForces[0]); } for (PxU32 link = 1; link < linkCount; ++link) { const PxTransform& body2World = arti.linkBody2Worlds[link]; const PxU32 parentLink = arti.parents[link]; const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink]; if (threadIndex == 0) { motionAccelerations[link] = translateSpatialVector(parentBody2World.p - body2World.p, motionAccelerations[parentLink]); zAForces[link] = spatialInertias[link] * motionAccelerations[link] + zAForces[link]; } const ArticulationJointCoreData& jointData = arti.jointData[link]; PxReal* force = &gravityCompensationForces[jointData.jointOffset]; if (threadIndex < jointData.nbDof) force[threadIndex] = link; // I'll just store dof's link index here and use it in the next loop } __syncwarp(); // Compute joint forces for (PxU32 dof = threadIndex; dof < dofCount; dof += WARP_SIZE) { PxU32 link = (PxU32)gravityCompensationForces[dof]; const ArticulationJointCoreData& jointData = arti.jointData[link]; const PxTransform& body2World = arti.linkBody2Worlds[link]; Cm::UnAlignedSpatialVector dofMotion = arti.motionMatrix[link][dof - jointData.jointOffset].rotate(body2World); gravityCompensationForces[dof] = dofMotion.innerProduct(zAForces[link]); } __syncwarp(); } } } // This is an optimized port of FeatherstoneArticulation::getCoriolisAndCentrifugalForce. // We process one articulation per warp (32 threads). extern "C" __global__ void computeArtiCentrifugalForces( const PxU32 nbIndices, float* PX_RESTRICT data, const PxArticulationGPUIndex* PX_RESTRICT gpuIndices, const PxU32 maxDofs, const bool rootMotion, const PxgArticulation* articulations) { const PxU32 jobIndex = threadIdx.y + blockIdx.x * blockDim.y; const PxU32 threadIndex = threadIdx.x; if (jobIndex < nbIndices) { const PxU32 artiIndex = gpuIndices[jobIndex]; const PxgArticulation& arti = articulations[artiIndex]; Dy::SpatialMatrix* PX_RESTRICT spatialInertias = reinterpret_cast(arti.worldSpatialArticulatedInertia); PxReal* PX_RESTRICT jointVelocities = reinterpret_cast(arti.jointVelocities); Cm::UnAlignedSpatialVector* PX_RESTRICT motionVelocities = reinterpret_cast(arti.motionVelocities); Cm::UnAlignedSpatialVector* PX_RESTRICT corioliseVectors = reinterpret_cast(arti.corioliseVectors); // corioliseVectors is used as a tmp buffer to store motion accelerations of all links // arti.motionAccelerations cannot be used because it stores important information that should not be erased Cm::UnAlignedSpatialVector* PX_RESTRICT motionAccelerations = reinterpret_cast(arti.corioliseVectors); Cm::UnAlignedSpatialVector* PX_RESTRICT zAForces = reinterpret_cast(arti.zAForces); const PxU32 linkCount = arti.data.numLinks; const PxU32 dofCount = arti.data.numJointDofs; const bool fixBase = arti.data.flags & PxArticulationFlag::eFIX_BASE; const PxU32 rootDof = (rootMotion && !fixBase) ? 6 : 0; // Add the DoF of the root in the floating base case const PxU32 bufferDof = rootMotion ? 6 : 0; float* PX_RESTRICT coriolisForces = &data[jobIndex * (maxDofs + bufferDof)]; // Velocities // It seems to be unnecessary to recalculate the motion velocities as we always call // the update kinematic function before calling this function if (threadIndex == 0) { for (PxU32 link = 1; link < linkCount; ++link) { const PxTransform& body2World = arti.linkBody2Worlds[link]; const PxU32 parentLink = arti.parents[link]; const ArticulationJointCoreData& jointData = arti.jointData[link]; const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink]; const PxVec3 rw = body2World.p - parentBody2World.p; const Cm::UnAlignedSpatialVector pVel = motionVelocities[parentLink]; Cm::UnAlignedSpatialVector vel = translateSpatialVector(-rw, pVel); Cm::UnAlignedSpatialVector deltaV = Cm::UnAlignedSpatialVector::Zero(); for (PxU32 dof = 0; dof < jointData.nbDof; ++dof) { const PxReal maxJointVelocity = arti.joints[link].maxJointVelocity[dof]; const PxReal jointVelocity = jointVelocities[jointData.jointOffset + dof]; PxReal jVel = PxMin(jointVelocity, maxJointVelocity); Cm::UnAlignedSpatialVector dofMotion = arti.motionMatrix[link][dof].rotate(body2World); deltaV += dofMotion * jVel; } vel.top += deltaV.top; vel.bottom += deltaV.bottom; const PxVec3 aVec = deltaV.top; const PxVec3 force = pVel.top.cross(aVec); const PxVec3 lVel = deltaV.bottom; const PxVec3 torque = pVel.top.cross(pVel.top.cross(rw)) + 2.f * pVel.top.cross(lVel) + aVec.cross(lVel); corioliseVectors[link] = Cm::SpatialVectorF(force, torque); motionVelocities[link] = vel; } } __syncwarp(); if (threadIndex == 0) { motionAccelerations[0] = Cm::SpatialVectorF::Zero(); for (PxU32 link = 1; link < linkCount; ++link) { const PxTransform& body2World = arti.linkBody2Worlds[link]; const PxU32 parentLink = arti.parents[link]; const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink]; const PxVec3 rw = body2World.p - parentBody2World.p; Cm::UnAlignedSpatialVector pMotionAcceleration = translateSpatialVector(-rw, motionAccelerations[parentLink]); motionAccelerations[link] = pMotionAcceleration + corioliseVectors[link]; } } __syncwarp(); for (PxU32 link = threadIndex; link < linkCount; link += WARP_SIZE) { const PxTransform& body2World = arti.linkBody2Worlds[link]; const float4 invIM = arti.linkProps[link].invInertiaXYZ_invMass; const PxReal mass = invIM.w == 0.f ? 0.f : (1.f / invIM.w); const PxVec3 localInertia = PxVec3(invIM.x == 0.f ? 0.f : (1.f / invIM.x), invIM.y == 0.f ? 0.f : (1.f / invIM.y), invIM.z == 0.f ? 0.f : (1.f / invIM.z)); PxMat33 I; Cm::transformInertiaTensor(localInertia, PxMat33(body2World.q), I); const PxVec3& vA = motionVelocities[link].top; zAForces[link].top = motionAccelerations[link].bottom * mass; zAForces[link].bottom = vA.cross(I * vA) + I * motionAccelerations[link].top; } __syncwarp(); if (threadIndex == 0) { for (PxU32 link = (linkCount - 1); link > 0; --link) { const PxTransform& body2World = arti.linkBody2Worlds[link]; const PxU32 parentLink = arti.parents[link]; const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink]; zAForces[parentLink] += translateSpatialVector(body2World.p - parentBody2World.p, zAForces[link]); const ArticulationJointCoreData& jointData = arti.jointData[link]; PxReal* force = &coriolisForces[jointData.jointOffset]; for (PxU32 dof = 0; dof < jointData.nbDof; ++dof) force[dof + rootDof] = link; // I'll just store dof's link index here and use it in the last loop } } __syncwarp(); if (!fixBase) { // Compute links inertias for (PxU32 link = threadIndex; link < linkCount; link += WARP_SIZE) { const float4 invIM = arti.linkProps[link].invInertiaXYZ_invMass; const PxReal mass = invIM.w == 0.f ? 0.f : (1.f / invIM.w); const PxVec3 localInertia = PxVec3(invIM.x == 0.f ? 0.f : (1.f / invIM.x), invIM.y == 0.f ? 0.f : (1.f / invIM.y), invIM.z == 0.f ? 0.f : (1.f / invIM.z)); Dy::SpatialMatrix spatialInertia; spatialInertia.topLeft = PxMat33(PxZero); spatialInertia.topRight = PxMat33::createDiagonal(PxVec3(mass)); Cm::transformInertiaTensor(localInertia, PxMat33(arti.linkBody2Worlds[link].q), spatialInertia.bottomLeft); spatialInertias[link] = spatialInertia; } __syncwarp(); if (threadIndex == 0) { for (PxU32 link = linkCount - 1; link > 0; --link) { const PxTransform& body2World = arti.linkBody2Worlds[link]; const PxU32 parentLink = arti.parents[link]; const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink]; const PxVec3 rw = body2World.p - parentBody2World.p; PxMat33 skewSymmetric(PxVec3(0, rw.z, -rw.y), PxVec3(-rw.z, 0, rw.x), PxVec3(rw.y, -rw.x, 0)); Dy::SpatialMatrix parentSpaceSpatialInertia = spatialInertias[link]; translateInertia(skewSymmetric, parentSpaceSpatialInertia); spatialInertias[parentLink] += parentSpaceSpatialInertia; } } __syncwarp(); // Compute motion acceleration if (threadIndex == 0) { if (!rootMotion) { Dy::SpatialMatrix invInertia = spatialInertias[0].invertInertia(); motionAccelerations[0] = -(invInertia * zAForces[0]); } for (PxU32 link = 1; link < linkCount; ++link) { const PxTransform& body2World = arti.linkBody2Worlds[link]; const PxU32 parentLink = arti.parents[link]; const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink]; motionAccelerations[link] = translateSpatialVector(parentBody2World.p - body2World.p, motionAccelerations[parentLink]); zAForces[link] = spatialInertias[link] * motionAccelerations[link] + zAForces[link]; } } __syncwarp(); } // Compute joint forces for (PxU32 dof = threadIndex; dof < dofCount; dof += WARP_SIZE) { PxU32 link = (PxU32)coriolisForces[dof + rootDof]; const ArticulationJointCoreData& jointData = arti.jointData[link]; const PxTransform& body2World = arti.linkBody2Worlds[link]; Cm::UnAlignedSpatialVector dofMotion = arti.motionMatrix[link][dof - jointData.jointOffset].rotate(body2World); coriolisForces[dof + rootDof] = dofMotion.innerProduct(zAForces[link]); } // Add root contribution if (rootMotion && threadIndex == 0 && !fixBase) { coriolisForces[0] = zAForces[0].top.x; coriolisForces[1] = zAForces[0].top.y; coriolisForces[2] = zAForces[0].top.z; coriolisForces[3] = zAForces[0].bottom.x; coriolisForces[4] = zAForces[0].bottom.y; coriolisForces[5] = zAForces[0].bottom.z; } __syncwarp(); } } // This is an optimized port of FeatherstoneArticulation::getArticulationCOM // We process one articulation per warp (32 threads) and one link per thread. // We have to separate the calculation of the total mass from the calculation // of the sum of COM in case there are more than 32 links in one articulation. extern "C" __global__ void computeArtiCOM( PxVec3* PX_RESTRICT data, const PxArticulationGPUIndex* PX_RESTRICT gpuIndices, const bool rootFrame, const PxgArticulation* PX_RESTRICT articulations) { const PxU32 jobIndex = threadIdx.y + blockIdx.x * blockDim.y; const PxU32 threadIndex = threadIdx.x; const PxU32 artiIndex = gpuIndices[jobIndex]; const PxgArticulation& arti = articulations[artiIndex]; PxVec3* articulationCOM = &data[jobIndex]; const PxU32 linkCount = arti.data.numLinks; // calculate the total mass of the articulation and store it temporarily in articulationCOM->x for(PxU32 link = threadIndex; link < linkCount; link += WARP_SIZE) { const PxReal mass = 1.0f / arti.linkProps[link].invInertiaXYZ_invMass.w; // calculate the total mass and store it temporarily in articulationCOM->x PxRedAddGlobal(&articulationCOM->x, mass); } __syncwarp(); PxReal totalMass = articulationCOM->x; // calculate the COM as the sum of body COM for(PxU32 link = threadIndex; link < linkCount; link += WARP_SIZE) { const PxVec3 childPose = arti.linkBody2Worlds[link].p; const PxReal mass = 1.0f / arti.linkProps[link].invInertiaXYZ_invMass.w; const PxVec3 COM = childPose * mass; // calculate the COM as the sum of body COM PxRedAddGlobal(&articulationCOM->x, COM[0]); PxRedAddGlobal(&articulationCOM->y, COM[1]); PxRedAddGlobal(&articulationCOM->z, COM[2]); } if(threadIndex == 0) { // remove the total mass from articulationCOM->x articulationCOM->x -= totalMass; *articulationCOM /= totalMass; // move to the root frame if(rootFrame) { *articulationCOM = arti.linkBody2Worlds[0].getInverse().transform(*articulationCOM); } } __syncwarp(); } // This is an optimized port of FeatherstoneArticulation::getCentroidalMomentumMatrix // We process one articulation per warp (32 threads) and one link per thread. extern "C" __global__ void computeArtiCentroidalMomentumMatrices( float* PX_RESTRICT data, const PxArticulationGPUIndex* PX_RESTRICT gpuIndices, const PxU32 nbIndices, const PxU32 maxDofs, const PxgArticulation* PX_RESTRICT articulations) { const PxU32 jobIndex = threadIdx.y + blockIdx.x * blockDim.y; const PxU32 threadIndex = threadIdx.x; if(jobIndex < nbIndices) { const PxU32 artiIndex = gpuIndices[jobIndex]; const PxgArticulation& arti = articulations[artiIndex]; const PxU32 dofCount = arti.data.numJointDofs; const bool fixBase = arti.data.flags & PxArticulationFlag::eFIX_BASE; if(!fixBase) { const PxU32 startCoriolisForces = (maxDofs + 6) * (maxDofs + 6) * nbIndices; const PxU32 startCentroidalMomentumMatrices = startCoriolisForces + (maxDofs + 6) * nbIndices; const PxU32 startCentroidalMomentumBias = startCentroidalMomentumMatrices + 6 * (maxDofs + 6) * nbIndices; float* PX_RESTRICT massMatrices = &data[jobIndex * (maxDofs + 6) * (maxDofs + 6)]; float* PX_RESTRICT coriolisForces = &data[startCoriolisForces + jobIndex * (maxDofs + 6)]; float* PX_RESTRICT centroidalMomentumMatrix = &data[startCentroidalMomentumMatrices + jobIndex * 6 * (maxDofs + 6)]; float* PX_RESTRICT centroidalMomentumBias = &data[startCentroidalMomentumBias + jobIndex * 6]; // adding mass matrix terms corresponding to the root DoFs for(PxU32 j = threadIndex; j < 6 * (dofCount + 6); j += WARP_SIZE) { centroidalMomentumMatrix[j] = massMatrices[j]; } // calculating the COM momentum from the spatial articulated inertia of the root in the mass matrix const PxReal totalMass = massMatrices[2 * (dofCount + 6) + 2]; const PxVec3 pg = PxVec3(massMatrices[2 * (dofCount + 6) + 4], massMatrices[5], massMatrices[(dofCount + 6) + 3]) / totalMass; const PxMat33 mat(PxVec3(0, pg.z, -pg.y), PxVec3(-pg.z, 0, pg.x), PxVec3(pg.y, -pg.x, 0)); // adding remaining contributions for(PxU32 col = threadIndex; col < dofCount + 6; col += WARP_SIZE) { const PxVec3 linMassMatrices = PxVec3(massMatrices[0 * (dofCount + 6) + col], massMatrices[1 * (dofCount + 6) + col], massMatrices[2 * (dofCount + 6) + col]); const PxVec3 crossTermsCentroidalMomentumMatrix = mat * linMassMatrices; for(PxU32 row = 3; row < 6; ++row) { centroidalMomentumMatrix[col + row * (dofCount + 6)] += crossTermsCentroidalMomentumMatrix[row - 3]; } } if(threadIndex == 0) { // adding coriolis force terms corresponding to the root DoFs for(PxU32 row = 0; row < 6; ++row) { centroidalMomentumBias[row] = coriolisForces[row]; } // adding remaining contributions const PxVec3 linCoriolisForces = PxVec3(coriolisForces[0], coriolisForces[1], coriolisForces[2]); const PxVec3 crossTermsCentroidalMomentumBias = mat * linCoriolisForces; for(PxU32 row = 3; row < 6; ++row) { centroidalMomentumBias[row] += crossTermsCentroidalMomentumBias[row - 3]; } } } } }