Files
XCEngine/engine/third_party/physx/source/gpuarticulation/src/CUDA/inverseDynamic.cu

1150 lines
44 KiB
Plaintext

// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "PxgArticulationLink.h"
#include "DyArticulationJointCore.h"
#include "DyFeatherstoneArticulationUtils.h"
#include "DyFeatherstoneArticulationJointData.h"
#include "DyArticulationCore.h"
#include "PxgArticulationCoreKernelIndices.h"
#include "PxgBodySim.h"
#include "reduction.cuh"
#include "copy.cuh"
#include "articulationDynamic.cuh"
#include "articulationImpulseResponse.cuh"
#include "MemoryAllocator.cuh"
#include "PxSpatialMatrix.h"
#include "foundation/PxMath.h"
#include "DyFeatherstoneArticulation.h"
#include "PxConstraintDesc.h"
#include "DyConstraintPrep.h"
#include "PxsRigidBody.h"
#include <stdio.h>
using namespace physx;
using namespace Dy;
extern "C" __host__ void initArticulationKernels4() {}
// VR: Old version for reference
#if 0
// This is a simple port of FeatherstoneArticulation::getDenseJacobian.
// We process one articulation per thread, which is not very efficient.
extern "C" __global__ void computeArtiDenseJacobians(
const PxIndexDataPair* indexMap,
const PxU32 nbIndices,
const PxgArticulation* articulations)
{
const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
if (globalThreadIndex < nbIndices)
{
PxU32 artiIndex = indexMap[globalThreadIndex].index;
const PxgArticulation& arti = articulations[artiIndex];
PxU32 linkCount = arti.data.numLinks;
//PxU32 jointCount = linkCount - 1;
PxU32 dofCount = arti.data.numJointDofs;
bool fixBase = arti.data.flags & PxArticulationFlag::eFIX_BASE;
PxU32 nCols = (fixBase ? 0 : 6) + dofCount;
//PxU32 nRows = (fixBase ? 0 : 6) + jointCount * 6;
//printf("~!~!~! processing arti %u (%u): %u links, %u dofs, fixed? %u, %u rows, %u cols\n",
// globalThreadIndex, artiIndex, linkCount, dofCount, PxU32(fixBase), nRows, nCols);
float* dataPtr = static_cast<float*>(indexMap[globalThreadIndex].data);
#define jacobian(row, col) dataPtr[nCols * (row) + (col)]
PxU32 destRow = 0;
PxU32 destCol = 0;
if (!fixBase)
{
jacobian(0, 0) = 1.0f;
jacobian(0, 1) = 0.0f;
jacobian(0, 2) = 0.0f;
jacobian(0, 3) = 0.0f;
jacobian(0, 4) = 0.0f;
jacobian(0, 5) = 0.0f;
jacobian(1, 0) = 0.0f;
jacobian(1, 1) = 1.0f;
jacobian(1, 2) = 0.0f;
jacobian(1, 3) = 0.0f;
jacobian(1, 4) = 0.0f;
jacobian(1, 5) = 0.0f;
jacobian(2, 0) = 0.0f;
jacobian(2, 1) = 0.0f;
jacobian(2, 2) = 1.0f;
jacobian(2, 3) = 0.0f;
jacobian(2, 4) = 0.0f;
jacobian(2, 5) = 0.0f;
jacobian(3, 0) = 0.0f;
jacobian(3, 1) = 0.0f;
jacobian(3, 2) = 0.0f;
jacobian(3, 3) = 1.0f;
jacobian(3, 4) = 0.0f;
jacobian(3, 5) = 0.0f;
jacobian(4, 0) = 0.0f;
jacobian(4, 1) = 0.0f;
jacobian(4, 2) = 0.0f;
jacobian(4, 3) = 0.0f;
jacobian(4, 4) = 1.0f;
jacobian(4, 5) = 0.0f;
jacobian(5, 0) = 0.0f;
jacobian(5, 1) = 0.0f;
jacobian(5, 2) = 0.0f;
jacobian(5, 3) = 0.0f;
jacobian(5, 4) = 0.0f;
jacobian(5, 5) = 1.0f;
destRow += 6;
destCol += 6;
}
for (PxU32 linkID = 1; linkID < linkCount; ++linkID)//each iteration of this writes 6 rows in the matrix
{
//const PxgArticulationLink& link = arti.links[linkID];
const PxTransform& body2World = arti.linkBody2Worlds[linkID];
const ArticulationJointCoreData& jointDatum = arti.jointData[linkID];
const PxU32 parentLinkID = arti.parents[linkID];
if (parentLinkID || !fixBase)
{
// VR: It arti.jointData[0] isn't initialized
//const PxU32 parentsFirstDestCol = arti.jointData[parentLinkID].jointOffset + (fixBase ? 0 : 6);
const PxU32 parentsLastDestCol = parentLinkID ? arti.jointData[parentLinkID].jointOffset + (fixBase ? 0 : 6) + arti.jointData[parentLinkID].dof : 6;
// VR: For parentLinkID == 0, it's two unsigned it overflows, though the result is still correct.
const PxU32 parentsDestRow = (fixBase ? 0 : 6) + (parentLinkID - 1) * 6;
// no commonInit, so need to compute the rw vector here
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLinkID];
const PxVec3 rw = body2World.p - parentBody2World.p;
// VR: parentsLastDestCol is rather column count than the last column index
//for (PxU32 col = 0; col <= parentsLastDestCol; col++)
for (PxU32 col = 0; col < parentsLastDestCol; col++)
{
//copy downward the 6 cols from parent
const PxVec3 parentAng(jacobian(parentsDestRow + 3, col),
jacobian(parentsDestRow + 4, col),
jacobian(parentsDestRow + 5, col));
const PxVec3 parentAngxRw = parentAng.cross(rw);
jacobian(destRow + 0, col) = jacobian(parentsDestRow + 0, col) + parentAngxRw.x;
jacobian(destRow + 1, col) = jacobian(parentsDestRow + 1, col) + parentAngxRw.y;
jacobian(destRow + 2, col) = jacobian(parentsDestRow + 2, col) + parentAngxRw.z;
jacobian(destRow + 3, col) = parentAng.x;
jacobian(destRow + 4, col) = parentAng.y;
jacobian(destRow + 5, col) = parentAng.z;
}
// VR: parentsLastDestCol is rather column count than the last column index
//for (PxU32 col = parentsLastDestCol + 1; col < destCol; col++)
for (PxU32 col = parentsLastDestCol; col < destCol; col++)
{
//fill with zeros.
jacobian(destRow + 0, col) = 0.0f;
jacobian(destRow + 1, col) = 0.0f;
jacobian(destRow + 2, col) = 0.0f;
jacobian(destRow + 3, col) = 0.0f;
jacobian(destRow + 4, col) = 0.0f;
jacobian(destRow + 5, col) = 0.0f;
}
}
//diagonal block:
for (PxU32 ind = 0; ind < jointDatum.dof; ++ind)
{
const Cm::UnAlignedSpatialVector& v = arti.motionMatrix[linkID][ind];
const PxVec3 ang = body2World.rotate(v.top);
const PxVec3 lin = body2World.rotate(v.bottom);
jacobian(destRow + 0, destCol) = lin.x;
jacobian(destRow + 1, destCol) = lin.y;
jacobian(destRow + 2, destCol) = lin.z;
jacobian(destRow + 3, destCol) = ang.x;
jacobian(destRow + 4, destCol) = ang.y;
jacobian(destRow + 5, destCol) = ang.z;
destCol++;
}
//above diagonal block: always zero
for (PxU32 col = destCol; col < nCols; col++)
{
jacobian(destRow + 0, col) = 0.0f;
jacobian(destRow + 1, col) = 0.0f;
jacobian(destRow + 2, col) = 0.0f;
jacobian(destRow + 3, col) = 0.0f;
jacobian(destRow + 4, col) = 0.0f;
jacobian(destRow + 5, col) = 0.0f;
}
destRow += 6;
}
#undef jacobian
}
}
#endif
extern "C" __global__ void computeArtiDenseJacobians(
float* PX_RESTRICT data,
const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
const PxU32 nbIndices,
const PxU32 maxLinks,
const PxU32 maxDofs,
const PxgArticulation* PX_RESTRICT articulations)
{
const PxU32 jobIndex = threadIdx.y + blockIdx.x * blockDim.y;
const PxU32 threadIndex = threadIdx.x;
if (jobIndex < nbIndices)
{
const PxArticulationGPUIndex artiIndex = gpuIndices[jobIndex];
const PxgArticulation& arti = articulations[artiIndex];
const PxU32 linkCount = arti.data.numLinks;
const PxU32 dofCount = arti.data.numJointDofs;
const bool fixedBase = arti.data.flags & PxArticulationFlag::eFIX_BASE;
const PxU32 baseDofs = fixedBase ? 0 : 6;
const PxU32 colCount = baseDofs + dofCount;
const PxU32 maxCols = 6 + maxDofs;
const PxU32 maxRows = 6 + (maxLinks - 1) * 6;
float* PX_RESTRICT jacobian = &data[jobIndex * maxCols * maxRows];
if (!fixedBase)
{
for (PxU32 i = threadIndex; i < 6 * colCount; i += WARP_SIZE)
jacobian[i] = (i / colCount == i % colCount) ? 1.0f : 0.0f;
__syncwarp();
}
for (PxU32 link = 1; link < linkCount; ++link)
{
const PxU32 linkOffset = arti.jointData[link].jointOffset + baseDofs;
const PxU32 linkDofs = arti.jointData[link].nbDof;
const PxU32 linkRow0 = (link - 1) * 6 + baseDofs;
const PxTransform& body2World = arti.linkBody2Worlds[link];
const PxU32 parentLink = arti.parents[link];
const PxU32 parentDofs = parentLink ? arti.jointData[parentLink].jointOffset + arti.jointData[parentLink].nbDof + baseDofs : baseDofs;
for (PxU32 dof = threadIndex; dof < colCount; dof += WARP_SIZE)
{
PxVec3 linkLin(0), linkAng(0);
if (dof < parentDofs)
{
const PxU32 parentRow0 = parentLink ? (parentLink - 1) * 6 + baseDofs : 0;
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink];
const PxVec3 rw = body2World.p - parentBody2World.p;
const PxVec3 parentLin(jacobian[(parentRow0 + 0) * colCount + dof],
jacobian[(parentRow0 + 1) * colCount + dof],
jacobian[(parentRow0 + 2) * colCount + dof]);
const PxVec3 parentAng(jacobian[(parentRow0 + 3) * colCount + dof],
jacobian[(parentRow0 + 4) * colCount + dof],
jacobian[(parentRow0 + 5) * colCount + dof]);
const PxVec3 parentAngxRw = parentAng.cross(rw);
linkLin = parentLin + parentAngxRw;
linkAng = parentAng;
}
else if (dof >= linkOffset && dof < linkOffset + linkDofs)
{
const Cm::UnAlignedSpatialVector& v = arti.motionMatrix[link][dof - linkOffset];
linkLin = body2World.rotate(v.bottom);
linkAng = body2World.rotate(v.top);
}
__syncwarp();
jacobian[(linkRow0 + 0) * colCount + dof] = linkLin.x;
jacobian[(linkRow0 + 1) * colCount + dof] = linkLin.y;
jacobian[(linkRow0 + 2) * colCount + dof] = linkLin.z;
jacobian[(linkRow0 + 3) * colCount + dof] = linkAng.x;
jacobian[(linkRow0 + 4) * colCount + dof] = linkAng.y;
jacobian[(linkRow0 + 5) * colCount + dof] = linkAng.z;
}
}
}
}
__device__ static void translateInertia(const PxMat33& sTod, Dy::SpatialMatrix& inertia)
{
const PxMat33 dTos = sTod.getTranspose();
PxMat33 bl = sTod * inertia.topLeft + inertia.bottomLeft;
PxMat33 br = sTod * inertia.topRight + inertia.getBottomRight();
inertia.topLeft = inertia.topLeft + inertia.topRight * dTos;
inertia.bottomLeft = bl + br * dTos;
//aligned inertia - make it symmetrical! OPTIONAL!!!!
inertia.bottomLeft = (inertia.bottomLeft + inertia.bottomLeft.getTranspose()) * 0.5f;
}
// This is a simple port of FeatherstoneArticulation::getGeneralizedMassMatrixCRB.
// I process one articulation per warp (32 threads).
extern "C" __global__ void computeArtiMassMatrices(
const PxU32 nbIndices,
float* PX_RESTRICT data,
const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
const PxU32 maxDofs,
const bool rootMotion,
const PxgArticulation* PX_RESTRICT articulations)
{
const PxU32 jobIndex = threadIdx.y + blockIdx.x * blockDim.y;
const PxU32 threadIndex = threadIdx.x;
if (jobIndex < nbIndices)
{
const PxU32 artiIndex = gpuIndices[jobIndex];
const PxgArticulation& arti = articulations[artiIndex];
// I use this as a tmp buffer to store world space spatial inertias of all links
Dy::SpatialMatrix* PX_RESTRICT spatialInertias = reinterpret_cast<Dy::SpatialMatrix*>(arti.worldSpatialArticulatedInertia); // ???
// I use this as a tmp buffer to store world space motiom vectors multiplied by their links inertias
Cm::UnAlignedSpatialVector* PX_RESTRICT worldMotionMatrices = reinterpret_cast<Cm::UnAlignedSpatialVector*>(arti.worldMotionMatrix); // ???
const PxU32 linkCount = arti.data.numLinks;
const PxU32 dofCount = arti.data.numJointDofs;
const bool fixBase = arti.data.flags & PxArticulationFlag::eFIX_BASE;
const PxU32 rootDof = (rootMotion && !fixBase) ? 6 : 0; // Add the DoF of the root in the floating base case
const PxU32 bufferDof = rootMotion ? 6 : 0;
const PxU32 matSize = dofCount + rootDof;
float* massMatrix = &data[jobIndex * (maxDofs + bufferDof) * (maxDofs + bufferDof)];
for (int i = threadIndex; i < matSize * matSize; i += WARP_SIZE)
massMatrix[i] = 0;
__syncwarp();
for (PxU32 link = threadIndex; link < linkCount; link += WARP_SIZE)
{
const float4 invIM = arti.linkProps[link].invInertiaXYZ_invMass;
const PxReal mass = invIM.w == 0.f ? 0.f : (1.f / invIM.w);
const PxVec3 localInertia = PxVec3(invIM.x == 0.f ? 0.f : (1.f / invIM.x),
invIM.y == 0.f ? 0.f : (1.f / invIM.y),
invIM.z == 0.f ? 0.f : (1.f / invIM.z));
Dy::SpatialMatrix spatialInertia;
spatialInertia.topLeft = PxMat33(PxZero);
spatialInertia.topRight = PxMat33::createDiagonal(PxVec3(mass));
spatialInertia.padding = 0; // VR: I use this to store the order to compute combined inertias
Cm::transformInertiaTensor(localInertia, PxMat33(arti.linkBody2Worlds[link].q), spatialInertia.bottomLeft);
spatialInertias[link] = spatialInertia;
}
__syncwarp();
#if 1 // VR: This optimization allows computing some of combined inertias in parallel. It's a bit faster for humanoids, but not much.
PxU32 maxStep = 0;
if (threadIndex == 0)
{
for (PxU32 link = linkCount - 1; link > 0; --link)
{
PxU32& s = spatialInertias[link].padding;
const PxU32 parentLink = arti.parents[link];
PxU32& ps = spatialInertias[parentLink].padding;
if (s < ps) s = ps;
ps = s + 1;
if (maxStep < ps) maxStep = ps;
}
}
__syncwarp();
maxStep = __shfl_sync(0xffffffff, maxStep, 0);
for (PxU32 step = 0; step <= maxStep; ++step)
{
for (PxU32 link = threadIndex + 1; link < linkCount; link += WARP_SIZE)
{
const PxU32& s = spatialInertias[link].padding;
if (s == step)
{
const PxTransform& body2World = arti.linkBody2Worlds[link];
const PxU32 parentLink = arti.parents[link];
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink];
const PxVec3 rw = body2World.p - parentBody2World.p;
PxMat33 skewSymmetric(PxVec3(0, rw.z, -rw.y), PxVec3(-rw.z, 0, rw.x), PxVec3(rw.y, -rw.x, 0));
Dy::SpatialMatrix parentSpaceSpatialInertia = spatialInertias[link];
translateInertia(skewSymmetric, parentSpaceSpatialInertia);
spatialInertias[parentLink] += parentSpaceSpatialInertia;
}
}
__syncwarp();
}
#else
if (threadIndex == 0) // @@@ ??? Can I optimize it somehow?
{
for (PxU32 link = linkCount - 1; link > 0; --link)
{
const PxTransform& body2World = arti.linkBody2Worlds[link];
const PxU32 parentLink = arti.parents[link];
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink];
const PxVec3 rw = body2World.p - parentBody2World.p;
PxMat33 skewSymmetric(PxVec3(0, rw.z, -rw.y), PxVec3(-rw.z, 0, rw.x), PxVec3(rw.y, -rw.x, 0));
Dy::SpatialMatrix parentSpaceSpatialInertia = spatialInertias[link];
translateInertia(skewSymmetric, parentSpaceSpatialInertia);
spatialInertias[parentLink] += parentSpaceSpatialInertia;
}
}
__syncwarp();
#endif
// Loop is for the case where there are more than 32 links in one articulation
for (PxU32 link = threadIndex + 1; link < linkCount; link += WARP_SIZE)
{
const PxTransform& body2World = arti.linkBody2Worlds[link];
const ArticulationJointCoreData& jointData = arti.jointData[link];
const Dy::SpatialMatrix& spatialInertia = spatialInertias[link];
Cm::UnAlignedSpatialVector worldMotionMatrix[3];
Cm::UnAlignedSpatialVector* spatialInertia_worldMotionMatrix = &worldMotionMatrices[jointData.jointOffset];
for (PxU32 dof = 0; dof < jointData.nbDof; ++dof)
{
const Cm::UnAlignedSpatialVector& m = arti.motionMatrix[link][dof];
worldMotionMatrix[dof] = m.rotate(body2World);
spatialInertia_worldMotionMatrix[dof] = spatialInertia * worldMotionMatrix[dof];
}
// A unit joint acceleration is assumed for each dof of the considered link
// This calculates the diagonal terms using the spatial articulated inertia for obtaining
// the force applied to the considered dof due to the applied unit joint acceleration
for (PxU32 dof0 = 0; dof0 < jointData.nbDof; ++dof0)
{
const Cm::UnAlignedSpatialVector& Im0 = spatialInertia_worldMotionMatrix[dof0];
const PxU32 row = jointData.jointOffset + dof0 + rootDof;
for (PxU32 dof1 = 0; dof1 < jointData.nbDof; ++dof1)
{
const Cm::UnAlignedSpatialVector& m1 = worldMotionMatrix[dof1];
const PxU32 col = jointData.jointOffset + dof1 + rootDof;
massMatrix[row * matSize + col] = m1.innerProduct(Im0);
}
}
// The calculated force is then propagated inward to calculate the above diagonal terms of the mass matrix
// This is sufficient as the mass matrix is symmetric
PxTransform childBody2World = body2World;
PxU32 parentLink = arti.parents[link];
while (true)
{
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink];
// The zero acceleration force is propagated inward
// This is correct because all links inward the considered link have no joint acceleration
const PxVec3 rw = childBody2World.p - parentBody2World.p;
for (PxU32 dof = 0; dof < jointData.nbDof; ++dof)
{
spatialInertia_worldMotionMatrix[dof] = FeatherstoneArticulation::translateSpatialVector(rw, spatialInertia_worldMotionMatrix[dof]);
}
childBody2World = parentBody2World;
if (parentLink == 0)
{
// For floating base, calculate the resulting force on the root
if (!fixBase && rootMotion)
{
for (PxU32 dof0 = 0; dof0 < jointData.nbDof; ++dof0)
{
const Cm::UnAlignedSpatialVector& Im0 = spatialInertia_worldMotionMatrix[dof0];
const PxU32 col = jointData.jointOffset + dof0 + rootDof;
for (PxU32 row = 0; row < 6; ++row)
{
massMatrix[col * matSize + row] += Im0[row];
massMatrix[col + row * matSize] += Im0[row];
}
}
}
break;
}
const ArticulationJointCoreData& parentJointData = arti.jointData[parentLink];
Cm::UnAlignedSpatialVector parentWorldMotionMatrix[3];
for (PxU32 dof = 0; dof < parentJointData.nbDof; ++dof)
{
const Cm::UnAlignedSpatialVector& m = arti.motionMatrix[parentLink][dof];
parentWorldMotionMatrix[dof].top = parentBody2World.rotate(m.top);
parentWorldMotionMatrix[dof].bottom = parentBody2World.rotate(m.bottom);
}
// The joint force is calculated from the zero acceleration force
for (PxU32 dof0 = 0; dof0 < jointData.nbDof; ++dof0)
{
const Cm::UnAlignedSpatialVector& Im0 = spatialInertia_worldMotionMatrix[dof0];
const PxU32 row = jointData.jointOffset + dof0 + rootDof;
for (PxU32 dof1 = 0; dof1 < parentJointData.nbDof; ++dof1)
{
const Cm::UnAlignedSpatialVector& m1 = parentWorldMotionMatrix[dof1];
const PxU32 col = parentJointData.jointOffset + dof1 + rootDof;
float m = m1.innerProduct(Im0);
massMatrix[row * matSize + col] = m;
massMatrix[row + col * matSize] = m;
}
}
parentLink = arti.parents[parentLink];
}
}
// Adding the spatial articulated inertia of the root
if (threadIndex == 0 && !fixBase && rootMotion)
{
// Note that the spatial articulated inertia assumes that the root angular acceleration comes first,
// while the mass matrix assumes that the root linear acceleration comes first
// We have therefore to invert the angular and linear component of the spatial articulated inertia
// This also ensures that the mass matrix is symmetric
const PxReal* rootSpatialInertia = reinterpret_cast<PxReal*>(&spatialInertias[0]);
for (PxU32 row = 0; row < 6; ++row)
{
const PxU32 rowSpatialInertia = (row < 3) ? row : row - 3; // Convert to the index of a 3 x 3 matrix
// Only process elements above the diagonal as the matrix is symmetric
for (PxU32 col = row; col < 6; ++col)
{
// This offset is due to how the spatial matrix is indexed and how the linear amd angular components
// of the acceleration should be inverted, the index is as follows
// 0 3 6 9 12 15 9 12 15 0 3 6
// 1 4 7 10 13 16 10 23 16 1 4 7
// 2 5 8 11 14 17 inversion linear/angular 11 24 17 2 5 8
// 18 21 24 0 1 2 ------------------------> 0 1 2 18 21 24
// 19 22 25 3 4 5 3 4 5 19 22 25
// 20 23 26 6 7 8 6 7 8 20 23 26
const PxU32 offset = (row > 2) ? 18 : (col < 3) * 9;
const PxU32 colSpatialInertia = (col < 3) ? col : col - 3; // Convert to the index of a 3 x 3 matrix
const PxU32 index = offset + colSpatialInertia * 3 + rowSpatialInertia;
massMatrix[row * matSize + col] = rootSpatialInertia[index];
massMatrix[col * matSize + row] = rootSpatialInertia[index];
}
}
}
else if (!fixBase && !rootMotion)
{
__syncwarp();
Dy::SpatialMatrix baseInvI = spatialInertias[0].invertInertia();
for (PxU32 row = threadIndex; row < dofCount; row += WARP_SIZE)
{
const Cm::UnAlignedSpatialVector& m0 = worldMotionMatrices[row];
for (PxU32 col = 0; col < dofCount; ++col)
{
const Cm::UnAlignedSpatialVector& m1 = worldMotionMatrices[col];
massMatrix[row * dofCount + col] -= m0.innerProduct(baseInvI * m1);
}
}
}
__syncwarp();
}
}
__device__ static Cm::UnAlignedSpatialVector translateSpatialVector(const PxVec3& offset, const Cm::UnAlignedSpatialVector& vec)
{
return Cm::UnAlignedSpatialVector(vec.top, vec.bottom + offset.cross(vec.top));
}
// This is an optimized port of FeatherstoneArticulation::getGeneralizedGravityForce.
// We process one articulation per warp (32 threads).
extern "C" __global__ void computeArtiGravityForces(
const PxU32 nbIndices,
float* PX_RESTRICT data,
const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
const PxU32 maxDofs,
const bool rootMotion,
const PxgArticulation* articulations, const PxVec3 gravity)
{
const PxU32 jobIndex = threadIdx.y + blockIdx.x * blockDim.y;
const PxU32 threadIndex = threadIdx.x;
if (jobIndex < nbIndices)
{
const PxU32 artiIndex = gpuIndices[jobIndex];
const PxgArticulation& arti = articulations[artiIndex];
const PxU32 linkCount = arti.data.numLinks;
const PxU32 dofCount = arti.data.numJointDofs;
const bool fixBase = arti.data.flags & PxArticulationFlag::eFIX_BASE;
const PxU32 rootDof = (rootMotion && !fixBase) ? 6 : 0; // Add the DoF of the root in the floating base case
const PxU32 bufferDof = rootMotion ? 6 : 0;
const PxVec3 tGravity = -gravity;
float* PX_RESTRICT gravityCompensationForces = &data[jobIndex * (maxDofs + bufferDof)];
if (rootMotion || fixBase)
{
// I use this as a tmp buffer to store ZAForce vectors of all links
Cm::UnAlignedSpatialVector* PX_RESTRICT zAForces = reinterpret_cast<Cm::UnAlignedSpatialVector*>(arti.zAForces); // ???
for (PxU32 link = threadIndex; link < linkCount; link += WARP_SIZE)
{
const float4 invIM = arti.linkProps[link].invInertiaXYZ_invMass;
const PxReal mass = invIM.w == 0.f ? 0.f : (1.f / invIM.w);
zAForces[link].top = tGravity * mass;
zAForces[link].bottom = PxVec3(0);
}
__syncwarp();
for (PxU32 link = (linkCount - 1); link > 0; --link)
{
const PxTransform& body2World = arti.linkBody2Worlds[link];
const PxU32 parentLink = arti.parents[link];
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink];
if (threadIndex == 0)
zAForces[parentLink] += translateSpatialVector(body2World.p - parentBody2World.p, zAForces[link]);
const ArticulationJointCoreData& jointData = arti.jointData[link];
PxReal* force = &gravityCompensationForces[jointData.jointOffset];
if (threadIndex < jointData.nbDof)
force[threadIndex + rootDof] = link; // I'll just store dof's link index here and use it in the next loop
}
__syncwarp();
for (PxU32 dof = threadIndex; dof < dofCount; dof += WARP_SIZE)
{
PxU32 link = (PxU32)gravityCompensationForces[dof + rootDof];
const ArticulationJointCoreData& jointData = arti.jointData[link];
const PxTransform& body2World = arti.linkBody2Worlds[link];
Cm::UnAlignedSpatialVector dofMotion = arti.motionMatrix[link][dof - jointData.jointOffset].rotate(body2World);
gravityCompensationForces[dof + rootDof] = dofMotion.innerProduct(zAForces[link]);
}
// Add root DoFs contribution
if (threadIndex == 0 && rootDof == 6)
{
gravityCompensationForces[0] = zAForces[0].top.x;
gravityCompensationForces[1] = zAForces[0].top.y;
gravityCompensationForces[2] = zAForces[0].top.z;
gravityCompensationForces[3] = zAForces[0].bottom.x;
gravityCompensationForces[4] = zAForces[0].bottom.y;
gravityCompensationForces[5] = zAForces[0].bottom.z;
}
__syncwarp();
}
else
{
// I use this as a tmp buffer to store world space spatial inertias of all links
Dy::SpatialMatrix* PX_RESTRICT spatialInertias = reinterpret_cast<Dy::SpatialMatrix*>(arti.worldSpatialArticulatedInertia); // ???
// I use this as a tmp buffer to store motion velocities of all links ??? Or not. The velocities seem valid. Maybe I don't need to compute them ???
Cm::UnAlignedSpatialVector* PX_RESTRICT motionVelocities = reinterpret_cast<Cm::UnAlignedSpatialVector*>(arti.motionVelocities); // ???
// I use this as a tmp buffer to store motion accelerations of all links
Cm::UnAlignedSpatialVector* PX_RESTRICT motionAccelerations = reinterpret_cast<Cm::UnAlignedSpatialVector*>(arti.motionAccelerations); // ???
// I use this as a tmp buffer to store ZAForce vectors of all links
Cm::UnAlignedSpatialVector* PX_RESTRICT zAForces = reinterpret_cast<Cm::UnAlignedSpatialVector*>(arti.zAForces); // ???
// Compute links inertias
for (PxU32 link = threadIndex; link < linkCount; link += WARP_SIZE)
{
const float4 invIM = arti.linkProps[link].invInertiaXYZ_invMass;
const PxReal mass = invIM.w == 0.f ? 0.f : (1.f / invIM.w);
const PxVec3 localInertia = PxVec3(invIM.x == 0.f ? 0.f : (1.f / invIM.x), invIM.y == 0.f ? 0.f : (1.f / invIM.y), invIM.z == 0.f ? 0.f : (1.f / invIM.z));
Dy::SpatialMatrix spatialInertia;
spatialInertia.topLeft = PxMat33(PxZero);
spatialInertia.topRight = PxMat33::createDiagonal(PxVec3(mass));
Cm::transformInertiaTensor(localInertia, PxMat33(arti.linkBody2Worlds[link].q), spatialInertia.bottomLeft);
spatialInertias[link] = spatialInertia;
}
__syncwarp();
if (threadIndex == 0) // @@@ ??? Can I optimize it somehow?
{
for (PxU32 link = linkCount - 1; link > 0; --link)
{
const PxTransform& body2World = arti.linkBody2Worlds[link];
const PxU32 parentLink = arti.parents[link];
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink];
const PxVec3 rw = body2World.p - parentBody2World.p;
PxMat33 skewSymmetric(PxVec3(0, rw.z, -rw.y), PxVec3(-rw.z, 0, rw.x), PxVec3(rw.y, -rw.x, 0));
Dy::SpatialMatrix parentSpaceSpatialInertia = spatialInertias[link];
translateInertia(skewSymmetric, parentSpaceSpatialInertia);
spatialInertias[parentLink] += parentSpaceSpatialInertia;
}
}
__syncwarp();
// Compute motion velocities ??? Should I even do this?
if (threadIndex == 0)
{
for (PxU32 link = 1; link < linkCount; ++link)
{
const PxTransform& body2World = arti.linkBody2Worlds[link];
const PxU32 parentLink = arti.parents[link];
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink];
motionVelocities[link] = translateSpatialVector(parentBody2World.p - body2World.p, motionVelocities[parentLink]);
}
}
__syncwarp();
// Init za-forces
for (PxU32 link = threadIndex; link < linkCount; link += WARP_SIZE)
{
const float4 invIM = arti.linkProps[link].invInertiaXYZ_invMass;
const PxReal mass = invIM.w == 0.f ? 0.f : (1.f / invIM.w);
const PxMat33& I = spatialInertias[link].bottomLeft;
const PxVec3& vA = motionVelocities[link].top;
zAForces[link].top = -tGravity * mass; // ??? @@@ Yea, fix-base version has no minus here.
zAForces[link].bottom = vA.cross(I * vA);
}
__syncwarp();
for (PxU32 link = (linkCount - 1); link > 0; --link)
{
const PxTransform& body2World = arti.linkBody2Worlds[link];
const PxU32 parentLink = arti.parents[link];
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink];
if (threadIndex == 0)
zAForces[parentLink] += translateSpatialVector(body2World.p - parentBody2World.p, zAForces[link]);
}
__syncwarp();
// Compute motion acceleration
if (threadIndex == 0)
{
Dy::SpatialMatrix invInertia = spatialInertias[0].invertInertia();
motionAccelerations[0] = -(invInertia * zAForces[0]);
}
for (PxU32 link = 1; link < linkCount; ++link)
{
const PxTransform& body2World = arti.linkBody2Worlds[link];
const PxU32 parentLink = arti.parents[link];
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink];
if (threadIndex == 0)
{
motionAccelerations[link] = translateSpatialVector(parentBody2World.p - body2World.p, motionAccelerations[parentLink]);
zAForces[link] = spatialInertias[link] * motionAccelerations[link] + zAForces[link];
}
const ArticulationJointCoreData& jointData = arti.jointData[link];
PxReal* force = &gravityCompensationForces[jointData.jointOffset];
if (threadIndex < jointData.nbDof)
force[threadIndex] = link; // I'll just store dof's link index here and use it in the next loop
}
__syncwarp();
// Compute joint forces
for (PxU32 dof = threadIndex; dof < dofCount; dof += WARP_SIZE)
{
PxU32 link = (PxU32)gravityCompensationForces[dof];
const ArticulationJointCoreData& jointData = arti.jointData[link];
const PxTransform& body2World = arti.linkBody2Worlds[link];
Cm::UnAlignedSpatialVector dofMotion = arti.motionMatrix[link][dof - jointData.jointOffset].rotate(body2World);
gravityCompensationForces[dof] = dofMotion.innerProduct(zAForces[link]);
}
__syncwarp();
}
}
}
// This is an optimized port of FeatherstoneArticulation::getCoriolisAndCentrifugalForce.
// We process one articulation per warp (32 threads).
extern "C" __global__ void computeArtiCentrifugalForces(
const PxU32 nbIndices,
float* PX_RESTRICT data,
const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
const PxU32 maxDofs,
const bool rootMotion,
const PxgArticulation* articulations)
{
const PxU32 jobIndex = threadIdx.y + blockIdx.x * blockDim.y;
const PxU32 threadIndex = threadIdx.x;
if (jobIndex < nbIndices)
{
const PxU32 artiIndex = gpuIndices[jobIndex];
const PxgArticulation& arti = articulations[artiIndex];
Dy::SpatialMatrix* PX_RESTRICT spatialInertias = reinterpret_cast<Dy::SpatialMatrix*>(arti.worldSpatialArticulatedInertia);
PxReal* PX_RESTRICT jointVelocities = reinterpret_cast<PxReal*>(arti.jointVelocities);
Cm::UnAlignedSpatialVector* PX_RESTRICT motionVelocities = reinterpret_cast<Cm::UnAlignedSpatialVector*>(arti.motionVelocities);
Cm::UnAlignedSpatialVector* PX_RESTRICT corioliseVectors = reinterpret_cast<Cm::UnAlignedSpatialVector*>(arti.corioliseVectors);
// corioliseVectors is used as a tmp buffer to store motion accelerations of all links
// arti.motionAccelerations cannot be used because it stores important information that should not be erased
Cm::UnAlignedSpatialVector* PX_RESTRICT motionAccelerations = reinterpret_cast<Cm::UnAlignedSpatialVector*>(arti.corioliseVectors);
Cm::UnAlignedSpatialVector* PX_RESTRICT zAForces = reinterpret_cast<Cm::UnAlignedSpatialVector*>(arti.zAForces);
const PxU32 linkCount = arti.data.numLinks;
const PxU32 dofCount = arti.data.numJointDofs;
const bool fixBase = arti.data.flags & PxArticulationFlag::eFIX_BASE;
const PxU32 rootDof = (rootMotion && !fixBase) ? 6 : 0; // Add the DoF of the root in the floating base case
const PxU32 bufferDof = rootMotion ? 6 : 0;
float* PX_RESTRICT coriolisForces = &data[jobIndex * (maxDofs + bufferDof)];
// Velocities
// It seems to be unnecessary to recalculate the motion velocities as we always call
// the update kinematic function before calling this function
if (threadIndex == 0)
{
for (PxU32 link = 1; link < linkCount; ++link)
{
const PxTransform& body2World = arti.linkBody2Worlds[link];
const PxU32 parentLink = arti.parents[link];
const ArticulationJointCoreData& jointData = arti.jointData[link];
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink];
const PxVec3 rw = body2World.p - parentBody2World.p;
const Cm::UnAlignedSpatialVector pVel = motionVelocities[parentLink];
Cm::UnAlignedSpatialVector vel = translateSpatialVector(-rw, pVel);
Cm::UnAlignedSpatialVector deltaV = Cm::UnAlignedSpatialVector::Zero();
for (PxU32 dof = 0; dof < jointData.nbDof; ++dof)
{
const PxReal maxJointVelocity = arti.joints[link].maxJointVelocity[dof];
const PxReal jointVelocity = jointVelocities[jointData.jointOffset + dof];
PxReal jVel = PxMin(jointVelocity, maxJointVelocity);
Cm::UnAlignedSpatialVector dofMotion = arti.motionMatrix[link][dof].rotate(body2World);
deltaV += dofMotion * jVel;
}
vel.top += deltaV.top;
vel.bottom += deltaV.bottom;
const PxVec3 aVec = deltaV.top;
const PxVec3 force = pVel.top.cross(aVec);
const PxVec3 lVel = deltaV.bottom;
const PxVec3 torque = pVel.top.cross(pVel.top.cross(rw)) + 2.f * pVel.top.cross(lVel) + aVec.cross(lVel);
corioliseVectors[link] = Cm::SpatialVectorF(force, torque);
motionVelocities[link] = vel;
}
}
__syncwarp();
if (threadIndex == 0)
{
motionAccelerations[0] = Cm::SpatialVectorF::Zero();
for (PxU32 link = 1; link < linkCount; ++link)
{
const PxTransform& body2World = arti.linkBody2Worlds[link];
const PxU32 parentLink = arti.parents[link];
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink];
const PxVec3 rw = body2World.p - parentBody2World.p;
Cm::UnAlignedSpatialVector pMotionAcceleration = translateSpatialVector(-rw, motionAccelerations[parentLink]);
motionAccelerations[link] = pMotionAcceleration + corioliseVectors[link];
}
}
__syncwarp();
for (PxU32 link = threadIndex; link < linkCount; link += WARP_SIZE)
{
const PxTransform& body2World = arti.linkBody2Worlds[link];
const float4 invIM = arti.linkProps[link].invInertiaXYZ_invMass;
const PxReal mass = invIM.w == 0.f ? 0.f : (1.f / invIM.w);
const PxVec3 localInertia = PxVec3(invIM.x == 0.f ? 0.f : (1.f / invIM.x), invIM.y == 0.f ? 0.f : (1.f / invIM.y), invIM.z == 0.f ? 0.f : (1.f / invIM.z));
PxMat33 I; Cm::transformInertiaTensor(localInertia, PxMat33(body2World.q), I);
const PxVec3& vA = motionVelocities[link].top;
zAForces[link].top = motionAccelerations[link].bottom * mass;
zAForces[link].bottom = vA.cross(I * vA) + I * motionAccelerations[link].top;
}
__syncwarp();
if (threadIndex == 0)
{
for (PxU32 link = (linkCount - 1); link > 0; --link)
{
const PxTransform& body2World = arti.linkBody2Worlds[link];
const PxU32 parentLink = arti.parents[link];
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink];
zAForces[parentLink] += translateSpatialVector(body2World.p - parentBody2World.p, zAForces[link]);
const ArticulationJointCoreData& jointData = arti.jointData[link];
PxReal* force = &coriolisForces[jointData.jointOffset];
for (PxU32 dof = 0; dof < jointData.nbDof; ++dof)
force[dof + rootDof] = link; // I'll just store dof's link index here and use it in the last loop
}
}
__syncwarp();
if (!fixBase)
{
// Compute links inertias
for (PxU32 link = threadIndex; link < linkCount; link += WARP_SIZE)
{
const float4 invIM = arti.linkProps[link].invInertiaXYZ_invMass;
const PxReal mass = invIM.w == 0.f ? 0.f : (1.f / invIM.w);
const PxVec3 localInertia = PxVec3(invIM.x == 0.f ? 0.f : (1.f / invIM.x), invIM.y == 0.f ? 0.f : (1.f / invIM.y), invIM.z == 0.f ? 0.f : (1.f / invIM.z));
Dy::SpatialMatrix spatialInertia;
spatialInertia.topLeft = PxMat33(PxZero);
spatialInertia.topRight = PxMat33::createDiagonal(PxVec3(mass));
Cm::transformInertiaTensor(localInertia, PxMat33(arti.linkBody2Worlds[link].q), spatialInertia.bottomLeft);
spatialInertias[link] = spatialInertia;
}
__syncwarp();
if (threadIndex == 0)
{
for (PxU32 link = linkCount - 1; link > 0; --link)
{
const PxTransform& body2World = arti.linkBody2Worlds[link];
const PxU32 parentLink = arti.parents[link];
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink];
const PxVec3 rw = body2World.p - parentBody2World.p;
PxMat33 skewSymmetric(PxVec3(0, rw.z, -rw.y), PxVec3(-rw.z, 0, rw.x), PxVec3(rw.y, -rw.x, 0));
Dy::SpatialMatrix parentSpaceSpatialInertia = spatialInertias[link];
translateInertia(skewSymmetric, parentSpaceSpatialInertia);
spatialInertias[parentLink] += parentSpaceSpatialInertia;
}
}
__syncwarp();
// Compute motion acceleration
if (threadIndex == 0)
{
if (!rootMotion)
{
Dy::SpatialMatrix invInertia = spatialInertias[0].invertInertia();
motionAccelerations[0] = -(invInertia * zAForces[0]);
}
for (PxU32 link = 1; link < linkCount; ++link)
{
const PxTransform& body2World = arti.linkBody2Worlds[link];
const PxU32 parentLink = arti.parents[link];
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink];
motionAccelerations[link] = translateSpatialVector(parentBody2World.p - body2World.p, motionAccelerations[parentLink]);
zAForces[link] = spatialInertias[link] * motionAccelerations[link] + zAForces[link];
}
}
__syncwarp();
}
// Compute joint forces
for (PxU32 dof = threadIndex; dof < dofCount; dof += WARP_SIZE)
{
PxU32 link = (PxU32)coriolisForces[dof + rootDof];
const ArticulationJointCoreData& jointData = arti.jointData[link];
const PxTransform& body2World = arti.linkBody2Worlds[link];
Cm::UnAlignedSpatialVector dofMotion = arti.motionMatrix[link][dof - jointData.jointOffset].rotate(body2World);
coriolisForces[dof + rootDof] = dofMotion.innerProduct(zAForces[link]);
}
// Add root contribution
if (rootMotion && threadIndex == 0 && !fixBase)
{
coriolisForces[0] = zAForces[0].top.x;
coriolisForces[1] = zAForces[0].top.y;
coriolisForces[2] = zAForces[0].top.z;
coriolisForces[3] = zAForces[0].bottom.x;
coriolisForces[4] = zAForces[0].bottom.y;
coriolisForces[5] = zAForces[0].bottom.z;
}
__syncwarp();
}
}
// This is an optimized port of FeatherstoneArticulation::getArticulationCOM
// We process one articulation per warp (32 threads) and one link per thread.
// We have to separate the calculation of the total mass from the calculation
// of the sum of COM in case there are more than 32 links in one articulation.
extern "C" __global__ void computeArtiCOM(
PxVec3* PX_RESTRICT data,
const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
const bool rootFrame,
const PxgArticulation* PX_RESTRICT articulations)
{
const PxU32 jobIndex = threadIdx.y + blockIdx.x * blockDim.y;
const PxU32 threadIndex = threadIdx.x;
const PxU32 artiIndex = gpuIndices[jobIndex];
const PxgArticulation& arti = articulations[artiIndex];
PxVec3* articulationCOM = &data[jobIndex];
const PxU32 linkCount = arti.data.numLinks;
// calculate the total mass of the articulation and store it temporarily in articulationCOM->x
for(PxU32 link = threadIndex; link < linkCount; link += WARP_SIZE)
{
const PxReal mass = 1.0f / arti.linkProps[link].invInertiaXYZ_invMass.w;
// calculate the total mass and store it temporarily in articulationCOM->x
PxRedAddGlobal(&articulationCOM->x, mass);
}
__syncwarp();
PxReal totalMass = articulationCOM->x;
// calculate the COM as the sum of body COM
for(PxU32 link = threadIndex; link < linkCount; link += WARP_SIZE)
{
const PxVec3 childPose = arti.linkBody2Worlds[link].p;
const PxReal mass = 1.0f / arti.linkProps[link].invInertiaXYZ_invMass.w;
const PxVec3 COM = childPose * mass;
// calculate the COM as the sum of body COM
PxRedAddGlobal(&articulationCOM->x, COM[0]);
PxRedAddGlobal(&articulationCOM->y, COM[1]);
PxRedAddGlobal(&articulationCOM->z, COM[2]);
}
if(threadIndex == 0)
{
// remove the total mass from articulationCOM->x
articulationCOM->x -= totalMass;
*articulationCOM /= totalMass;
// move to the root frame
if(rootFrame)
{
*articulationCOM = arti.linkBody2Worlds[0].getInverse().transform(*articulationCOM);
}
}
__syncwarp();
}
// This is an optimized port of FeatherstoneArticulation::getCentroidalMomentumMatrix
// We process one articulation per warp (32 threads) and one link per thread.
extern "C" __global__ void computeArtiCentroidalMomentumMatrices(
float* PX_RESTRICT data,
const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
const PxU32 nbIndices,
const PxU32 maxDofs,
const PxgArticulation* PX_RESTRICT articulations)
{
const PxU32 jobIndex = threadIdx.y + blockIdx.x * blockDim.y;
const PxU32 threadIndex = threadIdx.x;
if(jobIndex < nbIndices)
{
const PxU32 artiIndex = gpuIndices[jobIndex];
const PxgArticulation& arti = articulations[artiIndex];
const PxU32 dofCount = arti.data.numJointDofs;
const bool fixBase = arti.data.flags & PxArticulationFlag::eFIX_BASE;
if(!fixBase)
{
const PxU32 startCoriolisForces = (maxDofs + 6) * (maxDofs + 6) * nbIndices;
const PxU32 startCentroidalMomentumMatrices = startCoriolisForces + (maxDofs + 6) * nbIndices;
const PxU32 startCentroidalMomentumBias = startCentroidalMomentumMatrices + 6 * (maxDofs + 6) * nbIndices;
float* PX_RESTRICT massMatrices = &data[jobIndex * (maxDofs + 6) * (maxDofs + 6)];
float* PX_RESTRICT coriolisForces = &data[startCoriolisForces + jobIndex * (maxDofs + 6)];
float* PX_RESTRICT centroidalMomentumMatrix = &data[startCentroidalMomentumMatrices + jobIndex * 6 * (maxDofs + 6)];
float* PX_RESTRICT centroidalMomentumBias = &data[startCentroidalMomentumBias + jobIndex * 6];
// adding mass matrix terms corresponding to the root DoFs
for(PxU32 j = threadIndex; j < 6 * (dofCount + 6); j += WARP_SIZE)
{
centroidalMomentumMatrix[j] = massMatrices[j];
}
// calculating the COM momentum from the spatial articulated inertia of the root in the mass matrix
const PxReal totalMass = massMatrices[2 * (dofCount + 6) + 2];
const PxVec3 pg = PxVec3(massMatrices[2 * (dofCount + 6) + 4], massMatrices[5], massMatrices[(dofCount + 6) + 3]) / totalMass;
const PxMat33 mat(PxVec3(0, pg.z, -pg.y), PxVec3(-pg.z, 0, pg.x), PxVec3(pg.y, -pg.x, 0));
// adding remaining contributions
for(PxU32 col = threadIndex; col < dofCount + 6; col += WARP_SIZE)
{
const PxVec3 linMassMatrices = PxVec3(massMatrices[0 * (dofCount + 6) + col], massMatrices[1 * (dofCount + 6) + col],
massMatrices[2 * (dofCount + 6) + col]);
const PxVec3 crossTermsCentroidalMomentumMatrix = mat * linMassMatrices;
for(PxU32 row = 3; row < 6; ++row)
{
centroidalMomentumMatrix[col + row * (dofCount + 6)] += crossTermsCentroidalMomentumMatrix[row - 3];
}
}
if(threadIndex == 0)
{
// adding coriolis force terms corresponding to the root DoFs
for(PxU32 row = 0; row < 6; ++row)
{
centroidalMomentumBias[row] = coriolisForces[row];
}
// adding remaining contributions
const PxVec3 linCoriolisForces = PxVec3(coriolisForces[0], coriolisForces[1], coriolisForces[2]);
const PxVec3 crossTermsCentroidalMomentumBias = mat * linCoriolisForces;
for(PxU32 row = 3; row < 6; ++row)
{
centroidalMomentumBias[row] += crossTermsCentroidalMomentumBias[row - 3];
}
}
}
}
}