1150 lines
44 KiB
Plaintext
1150 lines
44 KiB
Plaintext
// Redistribution and use in source and binary forms, with or without
|
|
// modification, are permitted provided that the following conditions
|
|
// are met:
|
|
// * Redistributions of source code must retain the above copyright
|
|
// notice, this list of conditions and the following disclaimer.
|
|
// * Redistributions in binary form must reproduce the above copyright
|
|
// notice, this list of conditions and the following disclaimer in the
|
|
// documentation and/or other materials provided with the distribution.
|
|
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
// contributors may be used to endorse or promote products derived
|
|
// from this software without specific prior written permission.
|
|
//
|
|
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
|
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
//
|
|
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
|
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
|
|
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
|
|
|
|
|
|
#include "PxgArticulationLink.h"
|
|
#include "DyArticulationJointCore.h"
|
|
#include "DyFeatherstoneArticulationUtils.h"
|
|
#include "DyFeatherstoneArticulationJointData.h"
|
|
#include "DyArticulationCore.h"
|
|
#include "PxgArticulationCoreKernelIndices.h"
|
|
#include "PxgBodySim.h"
|
|
#include "reduction.cuh"
|
|
#include "copy.cuh"
|
|
#include "articulationDynamic.cuh"
|
|
#include "articulationImpulseResponse.cuh"
|
|
#include "MemoryAllocator.cuh"
|
|
#include "PxSpatialMatrix.h"
|
|
#include "foundation/PxMath.h"
|
|
#include "DyFeatherstoneArticulation.h"
|
|
#include "PxConstraintDesc.h"
|
|
#include "DyConstraintPrep.h"
|
|
#include "PxsRigidBody.h"
|
|
#include <stdio.h>
|
|
|
|
using namespace physx;
|
|
using namespace Dy;
|
|
|
|
extern "C" __host__ void initArticulationKernels4() {}
|
|
|
|
// VR: Old version for reference
|
|
#if 0
|
|
// This is a simple port of FeatherstoneArticulation::getDenseJacobian.
|
|
// We process one articulation per thread, which is not very efficient.
|
|
extern "C" __global__ void computeArtiDenseJacobians(
|
|
const PxIndexDataPair* indexMap,
|
|
const PxU32 nbIndices,
|
|
const PxgArticulation* articulations)
|
|
{
|
|
const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
|
|
|
|
if (globalThreadIndex < nbIndices)
|
|
{
|
|
PxU32 artiIndex = indexMap[globalThreadIndex].index;
|
|
const PxgArticulation& arti = articulations[artiIndex];
|
|
|
|
PxU32 linkCount = arti.data.numLinks;
|
|
//PxU32 jointCount = linkCount - 1;
|
|
PxU32 dofCount = arti.data.numJointDofs;
|
|
bool fixBase = arti.data.flags & PxArticulationFlag::eFIX_BASE;
|
|
|
|
PxU32 nCols = (fixBase ? 0 : 6) + dofCount;
|
|
//PxU32 nRows = (fixBase ? 0 : 6) + jointCount * 6;
|
|
|
|
//printf("~!~!~! processing arti %u (%u): %u links, %u dofs, fixed? %u, %u rows, %u cols\n",
|
|
// globalThreadIndex, artiIndex, linkCount, dofCount, PxU32(fixBase), nRows, nCols);
|
|
|
|
float* dataPtr = static_cast<float*>(indexMap[globalThreadIndex].data);
|
|
|
|
#define jacobian(row, col) dataPtr[nCols * (row) + (col)]
|
|
|
|
PxU32 destRow = 0;
|
|
PxU32 destCol = 0;
|
|
|
|
if (!fixBase)
|
|
{
|
|
jacobian(0, 0) = 1.0f;
|
|
jacobian(0, 1) = 0.0f;
|
|
jacobian(0, 2) = 0.0f;
|
|
jacobian(0, 3) = 0.0f;
|
|
jacobian(0, 4) = 0.0f;
|
|
jacobian(0, 5) = 0.0f;
|
|
|
|
jacobian(1, 0) = 0.0f;
|
|
jacobian(1, 1) = 1.0f;
|
|
jacobian(1, 2) = 0.0f;
|
|
jacobian(1, 3) = 0.0f;
|
|
jacobian(1, 4) = 0.0f;
|
|
jacobian(1, 5) = 0.0f;
|
|
|
|
jacobian(2, 0) = 0.0f;
|
|
jacobian(2, 1) = 0.0f;
|
|
jacobian(2, 2) = 1.0f;
|
|
jacobian(2, 3) = 0.0f;
|
|
jacobian(2, 4) = 0.0f;
|
|
jacobian(2, 5) = 0.0f;
|
|
|
|
jacobian(3, 0) = 0.0f;
|
|
jacobian(3, 1) = 0.0f;
|
|
jacobian(3, 2) = 0.0f;
|
|
jacobian(3, 3) = 1.0f;
|
|
jacobian(3, 4) = 0.0f;
|
|
jacobian(3, 5) = 0.0f;
|
|
|
|
jacobian(4, 0) = 0.0f;
|
|
jacobian(4, 1) = 0.0f;
|
|
jacobian(4, 2) = 0.0f;
|
|
jacobian(4, 3) = 0.0f;
|
|
jacobian(4, 4) = 1.0f;
|
|
jacobian(4, 5) = 0.0f;
|
|
|
|
jacobian(5, 0) = 0.0f;
|
|
jacobian(5, 1) = 0.0f;
|
|
jacobian(5, 2) = 0.0f;
|
|
jacobian(5, 3) = 0.0f;
|
|
jacobian(5, 4) = 0.0f;
|
|
jacobian(5, 5) = 1.0f;
|
|
|
|
destRow += 6;
|
|
destCol += 6;
|
|
}
|
|
|
|
for (PxU32 linkID = 1; linkID < linkCount; ++linkID)//each iteration of this writes 6 rows in the matrix
|
|
{
|
|
//const PxgArticulationLink& link = arti.links[linkID];
|
|
const PxTransform& body2World = arti.linkBody2Worlds[linkID];
|
|
const ArticulationJointCoreData& jointDatum = arti.jointData[linkID];
|
|
const PxU32 parentLinkID = arti.parents[linkID];
|
|
|
|
if (parentLinkID || !fixBase)
|
|
{
|
|
// VR: It arti.jointData[0] isn't initialized
|
|
//const PxU32 parentsFirstDestCol = arti.jointData[parentLinkID].jointOffset + (fixBase ? 0 : 6);
|
|
const PxU32 parentsLastDestCol = parentLinkID ? arti.jointData[parentLinkID].jointOffset + (fixBase ? 0 : 6) + arti.jointData[parentLinkID].dof : 6;
|
|
|
|
// VR: For parentLinkID == 0, it's two unsigned it overflows, though the result is still correct.
|
|
const PxU32 parentsDestRow = (fixBase ? 0 : 6) + (parentLinkID - 1) * 6;
|
|
|
|
// no commonInit, so need to compute the rw vector here
|
|
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLinkID];
|
|
const PxVec3 rw = body2World.p - parentBody2World.p;
|
|
|
|
// VR: parentsLastDestCol is rather column count than the last column index
|
|
//for (PxU32 col = 0; col <= parentsLastDestCol; col++)
|
|
for (PxU32 col = 0; col < parentsLastDestCol; col++)
|
|
{
|
|
//copy downward the 6 cols from parent
|
|
const PxVec3 parentAng(jacobian(parentsDestRow + 3, col),
|
|
jacobian(parentsDestRow + 4, col),
|
|
jacobian(parentsDestRow + 5, col));
|
|
|
|
const PxVec3 parentAngxRw = parentAng.cross(rw);
|
|
|
|
jacobian(destRow + 0, col) = jacobian(parentsDestRow + 0, col) + parentAngxRw.x;
|
|
jacobian(destRow + 1, col) = jacobian(parentsDestRow + 1, col) + parentAngxRw.y;
|
|
jacobian(destRow + 2, col) = jacobian(parentsDestRow + 2, col) + parentAngxRw.z;
|
|
|
|
jacobian(destRow + 3, col) = parentAng.x;
|
|
jacobian(destRow + 4, col) = parentAng.y;
|
|
jacobian(destRow + 5, col) = parentAng.z;
|
|
}
|
|
|
|
// VR: parentsLastDestCol is rather column count than the last column index
|
|
//for (PxU32 col = parentsLastDestCol + 1; col < destCol; col++)
|
|
for (PxU32 col = parentsLastDestCol; col < destCol; col++)
|
|
{
|
|
//fill with zeros.
|
|
jacobian(destRow + 0, col) = 0.0f;
|
|
jacobian(destRow + 1, col) = 0.0f;
|
|
jacobian(destRow + 2, col) = 0.0f;
|
|
|
|
jacobian(destRow + 3, col) = 0.0f;
|
|
jacobian(destRow + 4, col) = 0.0f;
|
|
jacobian(destRow + 5, col) = 0.0f;
|
|
}
|
|
}
|
|
|
|
//diagonal block:
|
|
for (PxU32 ind = 0; ind < jointDatum.dof; ++ind)
|
|
{
|
|
const Cm::UnAlignedSpatialVector& v = arti.motionMatrix[linkID][ind];
|
|
|
|
const PxVec3 ang = body2World.rotate(v.top);
|
|
const PxVec3 lin = body2World.rotate(v.bottom);
|
|
|
|
jacobian(destRow + 0, destCol) = lin.x;
|
|
jacobian(destRow + 1, destCol) = lin.y;
|
|
jacobian(destRow + 2, destCol) = lin.z;
|
|
|
|
jacobian(destRow + 3, destCol) = ang.x;
|
|
jacobian(destRow + 4, destCol) = ang.y;
|
|
jacobian(destRow + 5, destCol) = ang.z;
|
|
|
|
destCol++;
|
|
}
|
|
|
|
//above diagonal block: always zero
|
|
for (PxU32 col = destCol; col < nCols; col++)
|
|
{
|
|
jacobian(destRow + 0, col) = 0.0f;
|
|
jacobian(destRow + 1, col) = 0.0f;
|
|
jacobian(destRow + 2, col) = 0.0f;
|
|
|
|
jacobian(destRow + 3, col) = 0.0f;
|
|
jacobian(destRow + 4, col) = 0.0f;
|
|
jacobian(destRow + 5, col) = 0.0f;
|
|
}
|
|
|
|
destRow += 6;
|
|
}
|
|
#undef jacobian
|
|
}
|
|
}
|
|
#endif
|
|
|
|
extern "C" __global__ void computeArtiDenseJacobians(
|
|
float* PX_RESTRICT data,
|
|
const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
|
|
const PxU32 nbIndices,
|
|
const PxU32 maxLinks,
|
|
const PxU32 maxDofs,
|
|
const PxgArticulation* PX_RESTRICT articulations)
|
|
{
|
|
const PxU32 jobIndex = threadIdx.y + blockIdx.x * blockDim.y;
|
|
const PxU32 threadIndex = threadIdx.x;
|
|
|
|
if (jobIndex < nbIndices)
|
|
{
|
|
const PxArticulationGPUIndex artiIndex = gpuIndices[jobIndex];
|
|
const PxgArticulation& arti = articulations[artiIndex];
|
|
|
|
const PxU32 linkCount = arti.data.numLinks;
|
|
const PxU32 dofCount = arti.data.numJointDofs;
|
|
const bool fixedBase = arti.data.flags & PxArticulationFlag::eFIX_BASE;
|
|
const PxU32 baseDofs = fixedBase ? 0 : 6;
|
|
const PxU32 colCount = baseDofs + dofCount;
|
|
|
|
const PxU32 maxCols = 6 + maxDofs;
|
|
const PxU32 maxRows = 6 + (maxLinks - 1) * 6;
|
|
|
|
float* PX_RESTRICT jacobian = &data[jobIndex * maxCols * maxRows];
|
|
|
|
if (!fixedBase)
|
|
{
|
|
for (PxU32 i = threadIndex; i < 6 * colCount; i += WARP_SIZE)
|
|
jacobian[i] = (i / colCount == i % colCount) ? 1.0f : 0.0f;
|
|
|
|
__syncwarp();
|
|
}
|
|
|
|
for (PxU32 link = 1; link < linkCount; ++link)
|
|
{
|
|
const PxU32 linkOffset = arti.jointData[link].jointOffset + baseDofs;
|
|
const PxU32 linkDofs = arti.jointData[link].nbDof;
|
|
const PxU32 linkRow0 = (link - 1) * 6 + baseDofs;
|
|
const PxTransform& body2World = arti.linkBody2Worlds[link];
|
|
const PxU32 parentLink = arti.parents[link];
|
|
const PxU32 parentDofs = parentLink ? arti.jointData[parentLink].jointOffset + arti.jointData[parentLink].nbDof + baseDofs : baseDofs;
|
|
|
|
for (PxU32 dof = threadIndex; dof < colCount; dof += WARP_SIZE)
|
|
{
|
|
PxVec3 linkLin(0), linkAng(0);
|
|
|
|
if (dof < parentDofs)
|
|
{
|
|
const PxU32 parentRow0 = parentLink ? (parentLink - 1) * 6 + baseDofs : 0;
|
|
|
|
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink];
|
|
const PxVec3 rw = body2World.p - parentBody2World.p;
|
|
|
|
const PxVec3 parentLin(jacobian[(parentRow0 + 0) * colCount + dof],
|
|
jacobian[(parentRow0 + 1) * colCount + dof],
|
|
jacobian[(parentRow0 + 2) * colCount + dof]);
|
|
|
|
const PxVec3 parentAng(jacobian[(parentRow0 + 3) * colCount + dof],
|
|
jacobian[(parentRow0 + 4) * colCount + dof],
|
|
jacobian[(parentRow0 + 5) * colCount + dof]);
|
|
|
|
const PxVec3 parentAngxRw = parentAng.cross(rw);
|
|
|
|
linkLin = parentLin + parentAngxRw;
|
|
linkAng = parentAng;
|
|
}
|
|
else if (dof >= linkOffset && dof < linkOffset + linkDofs)
|
|
{
|
|
const Cm::UnAlignedSpatialVector& v = arti.motionMatrix[link][dof - linkOffset];
|
|
|
|
linkLin = body2World.rotate(v.bottom);
|
|
linkAng = body2World.rotate(v.top);
|
|
}
|
|
|
|
__syncwarp();
|
|
|
|
jacobian[(linkRow0 + 0) * colCount + dof] = linkLin.x;
|
|
jacobian[(linkRow0 + 1) * colCount + dof] = linkLin.y;
|
|
jacobian[(linkRow0 + 2) * colCount + dof] = linkLin.z;
|
|
|
|
jacobian[(linkRow0 + 3) * colCount + dof] = linkAng.x;
|
|
jacobian[(linkRow0 + 4) * colCount + dof] = linkAng.y;
|
|
jacobian[(linkRow0 + 5) * colCount + dof] = linkAng.z;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
__device__ static void translateInertia(const PxMat33& sTod, Dy::SpatialMatrix& inertia)
|
|
{
|
|
const PxMat33 dTos = sTod.getTranspose();
|
|
|
|
PxMat33 bl = sTod * inertia.topLeft + inertia.bottomLeft;
|
|
PxMat33 br = sTod * inertia.topRight + inertia.getBottomRight();
|
|
|
|
inertia.topLeft = inertia.topLeft + inertia.topRight * dTos;
|
|
inertia.bottomLeft = bl + br * dTos;
|
|
|
|
//aligned inertia - make it symmetrical! OPTIONAL!!!!
|
|
inertia.bottomLeft = (inertia.bottomLeft + inertia.bottomLeft.getTranspose()) * 0.5f;
|
|
}
|
|
|
|
// This is a simple port of FeatherstoneArticulation::getGeneralizedMassMatrixCRB.
|
|
// I process one articulation per warp (32 threads).
|
|
extern "C" __global__ void computeArtiMassMatrices(
|
|
const PxU32 nbIndices,
|
|
float* PX_RESTRICT data,
|
|
const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
|
|
const PxU32 maxDofs,
|
|
const bool rootMotion,
|
|
const PxgArticulation* PX_RESTRICT articulations)
|
|
{
|
|
const PxU32 jobIndex = threadIdx.y + blockIdx.x * blockDim.y;
|
|
const PxU32 threadIndex = threadIdx.x;
|
|
|
|
if (jobIndex < nbIndices)
|
|
{
|
|
const PxU32 artiIndex = gpuIndices[jobIndex];
|
|
const PxgArticulation& arti = articulations[artiIndex];
|
|
|
|
// I use this as a tmp buffer to store world space spatial inertias of all links
|
|
Dy::SpatialMatrix* PX_RESTRICT spatialInertias = reinterpret_cast<Dy::SpatialMatrix*>(arti.worldSpatialArticulatedInertia); // ???
|
|
// I use this as a tmp buffer to store world space motiom vectors multiplied by their links inertias
|
|
Cm::UnAlignedSpatialVector* PX_RESTRICT worldMotionMatrices = reinterpret_cast<Cm::UnAlignedSpatialVector*>(arti.worldMotionMatrix); // ???
|
|
|
|
const PxU32 linkCount = arti.data.numLinks;
|
|
const PxU32 dofCount = arti.data.numJointDofs;
|
|
const bool fixBase = arti.data.flags & PxArticulationFlag::eFIX_BASE;
|
|
const PxU32 rootDof = (rootMotion && !fixBase) ? 6 : 0; // Add the DoF of the root in the floating base case
|
|
const PxU32 bufferDof = rootMotion ? 6 : 0;
|
|
const PxU32 matSize = dofCount + rootDof;
|
|
|
|
float* massMatrix = &data[jobIndex * (maxDofs + bufferDof) * (maxDofs + bufferDof)];
|
|
for (int i = threadIndex; i < matSize * matSize; i += WARP_SIZE)
|
|
massMatrix[i] = 0;
|
|
|
|
__syncwarp();
|
|
|
|
for (PxU32 link = threadIndex; link < linkCount; link += WARP_SIZE)
|
|
{
|
|
const float4 invIM = arti.linkProps[link].invInertiaXYZ_invMass;
|
|
const PxReal mass = invIM.w == 0.f ? 0.f : (1.f / invIM.w);
|
|
const PxVec3 localInertia = PxVec3(invIM.x == 0.f ? 0.f : (1.f / invIM.x),
|
|
invIM.y == 0.f ? 0.f : (1.f / invIM.y),
|
|
invIM.z == 0.f ? 0.f : (1.f / invIM.z));
|
|
Dy::SpatialMatrix spatialInertia;
|
|
spatialInertia.topLeft = PxMat33(PxZero);
|
|
spatialInertia.topRight = PxMat33::createDiagonal(PxVec3(mass));
|
|
spatialInertia.padding = 0; // VR: I use this to store the order to compute combined inertias
|
|
Cm::transformInertiaTensor(localInertia, PxMat33(arti.linkBody2Worlds[link].q), spatialInertia.bottomLeft);
|
|
spatialInertias[link] = spatialInertia;
|
|
}
|
|
|
|
__syncwarp();
|
|
|
|
#if 1 // VR: This optimization allows computing some of combined inertias in parallel. It's a bit faster for humanoids, but not much.
|
|
PxU32 maxStep = 0;
|
|
if (threadIndex == 0)
|
|
{
|
|
for (PxU32 link = linkCount - 1; link > 0; --link)
|
|
{
|
|
PxU32& s = spatialInertias[link].padding;
|
|
const PxU32 parentLink = arti.parents[link];
|
|
PxU32& ps = spatialInertias[parentLink].padding;
|
|
if (s < ps) s = ps;
|
|
ps = s + 1;
|
|
if (maxStep < ps) maxStep = ps;
|
|
}
|
|
}
|
|
__syncwarp();
|
|
maxStep = __shfl_sync(0xffffffff, maxStep, 0);
|
|
for (PxU32 step = 0; step <= maxStep; ++step)
|
|
{
|
|
for (PxU32 link = threadIndex + 1; link < linkCount; link += WARP_SIZE)
|
|
{
|
|
const PxU32& s = spatialInertias[link].padding;
|
|
if (s == step)
|
|
{
|
|
const PxTransform& body2World = arti.linkBody2Worlds[link];
|
|
const PxU32 parentLink = arti.parents[link];
|
|
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink];
|
|
const PxVec3 rw = body2World.p - parentBody2World.p;
|
|
PxMat33 skewSymmetric(PxVec3(0, rw.z, -rw.y), PxVec3(-rw.z, 0, rw.x), PxVec3(rw.y, -rw.x, 0));
|
|
Dy::SpatialMatrix parentSpaceSpatialInertia = spatialInertias[link];
|
|
translateInertia(skewSymmetric, parentSpaceSpatialInertia);
|
|
spatialInertias[parentLink] += parentSpaceSpatialInertia;
|
|
}
|
|
}
|
|
__syncwarp();
|
|
}
|
|
#else
|
|
if (threadIndex == 0) // @@@ ??? Can I optimize it somehow?
|
|
{
|
|
for (PxU32 link = linkCount - 1; link > 0; --link)
|
|
{
|
|
const PxTransform& body2World = arti.linkBody2Worlds[link];
|
|
const PxU32 parentLink = arti.parents[link];
|
|
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink];
|
|
const PxVec3 rw = body2World.p - parentBody2World.p;
|
|
PxMat33 skewSymmetric(PxVec3(0, rw.z, -rw.y), PxVec3(-rw.z, 0, rw.x), PxVec3(rw.y, -rw.x, 0));
|
|
Dy::SpatialMatrix parentSpaceSpatialInertia = spatialInertias[link];
|
|
translateInertia(skewSymmetric, parentSpaceSpatialInertia);
|
|
spatialInertias[parentLink] += parentSpaceSpatialInertia;
|
|
}
|
|
}
|
|
|
|
__syncwarp();
|
|
#endif
|
|
// Loop is for the case where there are more than 32 links in one articulation
|
|
for (PxU32 link = threadIndex + 1; link < linkCount; link += WARP_SIZE)
|
|
{
|
|
const PxTransform& body2World = arti.linkBody2Worlds[link];
|
|
const ArticulationJointCoreData& jointData = arti.jointData[link];
|
|
const Dy::SpatialMatrix& spatialInertia = spatialInertias[link];
|
|
|
|
Cm::UnAlignedSpatialVector worldMotionMatrix[3];
|
|
Cm::UnAlignedSpatialVector* spatialInertia_worldMotionMatrix = &worldMotionMatrices[jointData.jointOffset];
|
|
for (PxU32 dof = 0; dof < jointData.nbDof; ++dof)
|
|
{
|
|
const Cm::UnAlignedSpatialVector& m = arti.motionMatrix[link][dof];
|
|
worldMotionMatrix[dof] = m.rotate(body2World);
|
|
spatialInertia_worldMotionMatrix[dof] = spatialInertia * worldMotionMatrix[dof];
|
|
}
|
|
|
|
// A unit joint acceleration is assumed for each dof of the considered link
|
|
// This calculates the diagonal terms using the spatial articulated inertia for obtaining
|
|
// the force applied to the considered dof due to the applied unit joint acceleration
|
|
for (PxU32 dof0 = 0; dof0 < jointData.nbDof; ++dof0)
|
|
{
|
|
const Cm::UnAlignedSpatialVector& Im0 = spatialInertia_worldMotionMatrix[dof0];
|
|
const PxU32 row = jointData.jointOffset + dof0 + rootDof;
|
|
for (PxU32 dof1 = 0; dof1 < jointData.nbDof; ++dof1)
|
|
{
|
|
const Cm::UnAlignedSpatialVector& m1 = worldMotionMatrix[dof1];
|
|
const PxU32 col = jointData.jointOffset + dof1 + rootDof;
|
|
massMatrix[row * matSize + col] = m1.innerProduct(Im0);
|
|
}
|
|
}
|
|
|
|
// The calculated force is then propagated inward to calculate the above diagonal terms of the mass matrix
|
|
// This is sufficient as the mass matrix is symmetric
|
|
PxTransform childBody2World = body2World;
|
|
PxU32 parentLink = arti.parents[link];
|
|
while (true)
|
|
{
|
|
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink];
|
|
|
|
// The zero acceleration force is propagated inward
|
|
// This is correct because all links inward the considered link have no joint acceleration
|
|
const PxVec3 rw = childBody2World.p - parentBody2World.p;
|
|
for (PxU32 dof = 0; dof < jointData.nbDof; ++dof)
|
|
{
|
|
spatialInertia_worldMotionMatrix[dof] = FeatherstoneArticulation::translateSpatialVector(rw, spatialInertia_worldMotionMatrix[dof]);
|
|
}
|
|
childBody2World = parentBody2World;
|
|
|
|
if (parentLink == 0)
|
|
{
|
|
// For floating base, calculate the resulting force on the root
|
|
if (!fixBase && rootMotion)
|
|
{
|
|
for (PxU32 dof0 = 0; dof0 < jointData.nbDof; ++dof0)
|
|
{
|
|
const Cm::UnAlignedSpatialVector& Im0 = spatialInertia_worldMotionMatrix[dof0];
|
|
const PxU32 col = jointData.jointOffset + dof0 + rootDof;
|
|
for (PxU32 row = 0; row < 6; ++row)
|
|
{
|
|
massMatrix[col * matSize + row] += Im0[row];
|
|
massMatrix[col + row * matSize] += Im0[row];
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
|
|
const ArticulationJointCoreData& parentJointData = arti.jointData[parentLink];
|
|
|
|
Cm::UnAlignedSpatialVector parentWorldMotionMatrix[3];
|
|
for (PxU32 dof = 0; dof < parentJointData.nbDof; ++dof)
|
|
{
|
|
const Cm::UnAlignedSpatialVector& m = arti.motionMatrix[parentLink][dof];
|
|
parentWorldMotionMatrix[dof].top = parentBody2World.rotate(m.top);
|
|
parentWorldMotionMatrix[dof].bottom = parentBody2World.rotate(m.bottom);
|
|
}
|
|
|
|
// The joint force is calculated from the zero acceleration force
|
|
for (PxU32 dof0 = 0; dof0 < jointData.nbDof; ++dof0)
|
|
{
|
|
const Cm::UnAlignedSpatialVector& Im0 = spatialInertia_worldMotionMatrix[dof0];
|
|
const PxU32 row = jointData.jointOffset + dof0 + rootDof;
|
|
for (PxU32 dof1 = 0; dof1 < parentJointData.nbDof; ++dof1)
|
|
{
|
|
const Cm::UnAlignedSpatialVector& m1 = parentWorldMotionMatrix[dof1];
|
|
const PxU32 col = parentJointData.jointOffset + dof1 + rootDof;
|
|
float m = m1.innerProduct(Im0);
|
|
massMatrix[row * matSize + col] = m;
|
|
massMatrix[row + col * matSize] = m;
|
|
}
|
|
}
|
|
|
|
parentLink = arti.parents[parentLink];
|
|
}
|
|
}
|
|
|
|
// Adding the spatial articulated inertia of the root
|
|
if (threadIndex == 0 && !fixBase && rootMotion)
|
|
{
|
|
// Note that the spatial articulated inertia assumes that the root angular acceleration comes first,
|
|
// while the mass matrix assumes that the root linear acceleration comes first
|
|
// We have therefore to invert the angular and linear component of the spatial articulated inertia
|
|
// This also ensures that the mass matrix is symmetric
|
|
const PxReal* rootSpatialInertia = reinterpret_cast<PxReal*>(&spatialInertias[0]);
|
|
for (PxU32 row = 0; row < 6; ++row)
|
|
{
|
|
const PxU32 rowSpatialInertia = (row < 3) ? row : row - 3; // Convert to the index of a 3 x 3 matrix
|
|
// Only process elements above the diagonal as the matrix is symmetric
|
|
for (PxU32 col = row; col < 6; ++col)
|
|
{
|
|
// This offset is due to how the spatial matrix is indexed and how the linear amd angular components
|
|
// of the acceleration should be inverted, the index is as follows
|
|
// 0 3 6 9 12 15 9 12 15 0 3 6
|
|
// 1 4 7 10 13 16 10 23 16 1 4 7
|
|
// 2 5 8 11 14 17 inversion linear/angular 11 24 17 2 5 8
|
|
// 18 21 24 0 1 2 ------------------------> 0 1 2 18 21 24
|
|
// 19 22 25 3 4 5 3 4 5 19 22 25
|
|
// 20 23 26 6 7 8 6 7 8 20 23 26
|
|
const PxU32 offset = (row > 2) ? 18 : (col < 3) * 9;
|
|
const PxU32 colSpatialInertia = (col < 3) ? col : col - 3; // Convert to the index of a 3 x 3 matrix
|
|
const PxU32 index = offset + colSpatialInertia * 3 + rowSpatialInertia;
|
|
massMatrix[row * matSize + col] = rootSpatialInertia[index];
|
|
massMatrix[col * matSize + row] = rootSpatialInertia[index];
|
|
}
|
|
}
|
|
}
|
|
else if (!fixBase && !rootMotion)
|
|
{
|
|
__syncwarp();
|
|
|
|
Dy::SpatialMatrix baseInvI = spatialInertias[0].invertInertia();
|
|
|
|
for (PxU32 row = threadIndex; row < dofCount; row += WARP_SIZE)
|
|
{
|
|
const Cm::UnAlignedSpatialVector& m0 = worldMotionMatrices[row];
|
|
for (PxU32 col = 0; col < dofCount; ++col)
|
|
{
|
|
const Cm::UnAlignedSpatialVector& m1 = worldMotionMatrices[col];
|
|
massMatrix[row * dofCount + col] -= m0.innerProduct(baseInvI * m1);
|
|
}
|
|
}
|
|
}
|
|
__syncwarp();
|
|
}
|
|
}
|
|
|
|
__device__ static Cm::UnAlignedSpatialVector translateSpatialVector(const PxVec3& offset, const Cm::UnAlignedSpatialVector& vec)
|
|
{
|
|
return Cm::UnAlignedSpatialVector(vec.top, vec.bottom + offset.cross(vec.top));
|
|
}
|
|
|
|
// This is an optimized port of FeatherstoneArticulation::getGeneralizedGravityForce.
|
|
// We process one articulation per warp (32 threads).
|
|
extern "C" __global__ void computeArtiGravityForces(
|
|
const PxU32 nbIndices,
|
|
float* PX_RESTRICT data,
|
|
const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
|
|
const PxU32 maxDofs,
|
|
const bool rootMotion,
|
|
const PxgArticulation* articulations, const PxVec3 gravity)
|
|
{
|
|
const PxU32 jobIndex = threadIdx.y + blockIdx.x * blockDim.y;
|
|
const PxU32 threadIndex = threadIdx.x;
|
|
|
|
if (jobIndex < nbIndices)
|
|
{
|
|
const PxU32 artiIndex = gpuIndices[jobIndex];
|
|
const PxgArticulation& arti = articulations[artiIndex];
|
|
|
|
const PxU32 linkCount = arti.data.numLinks;
|
|
const PxU32 dofCount = arti.data.numJointDofs;
|
|
const bool fixBase = arti.data.flags & PxArticulationFlag::eFIX_BASE;
|
|
const PxU32 rootDof = (rootMotion && !fixBase) ? 6 : 0; // Add the DoF of the root in the floating base case
|
|
const PxU32 bufferDof = rootMotion ? 6 : 0;
|
|
const PxVec3 tGravity = -gravity;
|
|
|
|
float* PX_RESTRICT gravityCompensationForces = &data[jobIndex * (maxDofs + bufferDof)];
|
|
|
|
if (rootMotion || fixBase)
|
|
{
|
|
// I use this as a tmp buffer to store ZAForce vectors of all links
|
|
Cm::UnAlignedSpatialVector* PX_RESTRICT zAForces = reinterpret_cast<Cm::UnAlignedSpatialVector*>(arti.zAForces); // ???
|
|
|
|
for (PxU32 link = threadIndex; link < linkCount; link += WARP_SIZE)
|
|
{
|
|
const float4 invIM = arti.linkProps[link].invInertiaXYZ_invMass;
|
|
const PxReal mass = invIM.w == 0.f ? 0.f : (1.f / invIM.w);
|
|
zAForces[link].top = tGravity * mass;
|
|
zAForces[link].bottom = PxVec3(0);
|
|
}
|
|
|
|
__syncwarp();
|
|
|
|
for (PxU32 link = (linkCount - 1); link > 0; --link)
|
|
{
|
|
const PxTransform& body2World = arti.linkBody2Worlds[link];
|
|
const PxU32 parentLink = arti.parents[link];
|
|
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink];
|
|
|
|
if (threadIndex == 0)
|
|
zAForces[parentLink] += translateSpatialVector(body2World.p - parentBody2World.p, zAForces[link]);
|
|
|
|
const ArticulationJointCoreData& jointData = arti.jointData[link];
|
|
PxReal* force = &gravityCompensationForces[jointData.jointOffset];
|
|
|
|
if (threadIndex < jointData.nbDof)
|
|
force[threadIndex + rootDof] = link; // I'll just store dof's link index here and use it in the next loop
|
|
}
|
|
|
|
__syncwarp();
|
|
|
|
for (PxU32 dof = threadIndex; dof < dofCount; dof += WARP_SIZE)
|
|
{
|
|
PxU32 link = (PxU32)gravityCompensationForces[dof + rootDof];
|
|
const ArticulationJointCoreData& jointData = arti.jointData[link];
|
|
const PxTransform& body2World = arti.linkBody2Worlds[link];
|
|
Cm::UnAlignedSpatialVector dofMotion = arti.motionMatrix[link][dof - jointData.jointOffset].rotate(body2World);
|
|
gravityCompensationForces[dof + rootDof] = dofMotion.innerProduct(zAForces[link]);
|
|
}
|
|
|
|
// Add root DoFs contribution
|
|
if (threadIndex == 0 && rootDof == 6)
|
|
{
|
|
gravityCompensationForces[0] = zAForces[0].top.x;
|
|
gravityCompensationForces[1] = zAForces[0].top.y;
|
|
gravityCompensationForces[2] = zAForces[0].top.z;
|
|
gravityCompensationForces[3] = zAForces[0].bottom.x;
|
|
gravityCompensationForces[4] = zAForces[0].bottom.y;
|
|
gravityCompensationForces[5] = zAForces[0].bottom.z;
|
|
}
|
|
|
|
__syncwarp();
|
|
}
|
|
else
|
|
{
|
|
// I use this as a tmp buffer to store world space spatial inertias of all links
|
|
Dy::SpatialMatrix* PX_RESTRICT spatialInertias = reinterpret_cast<Dy::SpatialMatrix*>(arti.worldSpatialArticulatedInertia); // ???
|
|
// I use this as a tmp buffer to store motion velocities of all links ??? Or not. The velocities seem valid. Maybe I don't need to compute them ???
|
|
Cm::UnAlignedSpatialVector* PX_RESTRICT motionVelocities = reinterpret_cast<Cm::UnAlignedSpatialVector*>(arti.motionVelocities); // ???
|
|
// I use this as a tmp buffer to store motion accelerations of all links
|
|
Cm::UnAlignedSpatialVector* PX_RESTRICT motionAccelerations = reinterpret_cast<Cm::UnAlignedSpatialVector*>(arti.motionAccelerations); // ???
|
|
// I use this as a tmp buffer to store ZAForce vectors of all links
|
|
Cm::UnAlignedSpatialVector* PX_RESTRICT zAForces = reinterpret_cast<Cm::UnAlignedSpatialVector*>(arti.zAForces); // ???
|
|
|
|
// Compute links inertias
|
|
|
|
for (PxU32 link = threadIndex; link < linkCount; link += WARP_SIZE)
|
|
{
|
|
const float4 invIM = arti.linkProps[link].invInertiaXYZ_invMass;
|
|
const PxReal mass = invIM.w == 0.f ? 0.f : (1.f / invIM.w);
|
|
const PxVec3 localInertia = PxVec3(invIM.x == 0.f ? 0.f : (1.f / invIM.x), invIM.y == 0.f ? 0.f : (1.f / invIM.y), invIM.z == 0.f ? 0.f : (1.f / invIM.z));
|
|
Dy::SpatialMatrix spatialInertia;
|
|
spatialInertia.topLeft = PxMat33(PxZero);
|
|
spatialInertia.topRight = PxMat33::createDiagonal(PxVec3(mass));
|
|
Cm::transformInertiaTensor(localInertia, PxMat33(arti.linkBody2Worlds[link].q), spatialInertia.bottomLeft);
|
|
spatialInertias[link] = spatialInertia;
|
|
}
|
|
|
|
__syncwarp();
|
|
|
|
if (threadIndex == 0) // @@@ ??? Can I optimize it somehow?
|
|
{
|
|
for (PxU32 link = linkCount - 1; link > 0; --link)
|
|
{
|
|
const PxTransform& body2World = arti.linkBody2Worlds[link];
|
|
const PxU32 parentLink = arti.parents[link];
|
|
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink];
|
|
const PxVec3 rw = body2World.p - parentBody2World.p;
|
|
PxMat33 skewSymmetric(PxVec3(0, rw.z, -rw.y), PxVec3(-rw.z, 0, rw.x), PxVec3(rw.y, -rw.x, 0));
|
|
Dy::SpatialMatrix parentSpaceSpatialInertia = spatialInertias[link];
|
|
translateInertia(skewSymmetric, parentSpaceSpatialInertia);
|
|
spatialInertias[parentLink] += parentSpaceSpatialInertia;
|
|
}
|
|
}
|
|
|
|
__syncwarp();
|
|
|
|
// Compute motion velocities ??? Should I even do this?
|
|
|
|
if (threadIndex == 0)
|
|
{
|
|
for (PxU32 link = 1; link < linkCount; ++link)
|
|
{
|
|
const PxTransform& body2World = arti.linkBody2Worlds[link];
|
|
const PxU32 parentLink = arti.parents[link];
|
|
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink];
|
|
motionVelocities[link] = translateSpatialVector(parentBody2World.p - body2World.p, motionVelocities[parentLink]);
|
|
}
|
|
}
|
|
|
|
__syncwarp();
|
|
|
|
// Init za-forces
|
|
|
|
for (PxU32 link = threadIndex; link < linkCount; link += WARP_SIZE)
|
|
{
|
|
const float4 invIM = arti.linkProps[link].invInertiaXYZ_invMass;
|
|
const PxReal mass = invIM.w == 0.f ? 0.f : (1.f / invIM.w);
|
|
const PxMat33& I = spatialInertias[link].bottomLeft;
|
|
const PxVec3& vA = motionVelocities[link].top;
|
|
zAForces[link].top = -tGravity * mass; // ??? @@@ Yea, fix-base version has no minus here.
|
|
zAForces[link].bottom = vA.cross(I * vA);
|
|
}
|
|
|
|
__syncwarp();
|
|
|
|
for (PxU32 link = (linkCount - 1); link > 0; --link)
|
|
{
|
|
const PxTransform& body2World = arti.linkBody2Worlds[link];
|
|
const PxU32 parentLink = arti.parents[link];
|
|
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink];
|
|
|
|
if (threadIndex == 0)
|
|
zAForces[parentLink] += translateSpatialVector(body2World.p - parentBody2World.p, zAForces[link]);
|
|
}
|
|
|
|
__syncwarp();
|
|
|
|
// Compute motion acceleration
|
|
|
|
if (threadIndex == 0)
|
|
{
|
|
Dy::SpatialMatrix invInertia = spatialInertias[0].invertInertia();
|
|
motionAccelerations[0] = -(invInertia * zAForces[0]);
|
|
}
|
|
for (PxU32 link = 1; link < linkCount; ++link)
|
|
{
|
|
const PxTransform& body2World = arti.linkBody2Worlds[link];
|
|
const PxU32 parentLink = arti.parents[link];
|
|
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink];
|
|
if (threadIndex == 0)
|
|
{
|
|
motionAccelerations[link] = translateSpatialVector(parentBody2World.p - body2World.p, motionAccelerations[parentLink]);
|
|
zAForces[link] = spatialInertias[link] * motionAccelerations[link] + zAForces[link];
|
|
}
|
|
|
|
const ArticulationJointCoreData& jointData = arti.jointData[link];
|
|
PxReal* force = &gravityCompensationForces[jointData.jointOffset];
|
|
|
|
if (threadIndex < jointData.nbDof)
|
|
force[threadIndex] = link; // I'll just store dof's link index here and use it in the next loop
|
|
}
|
|
|
|
__syncwarp();
|
|
|
|
// Compute joint forces
|
|
|
|
for (PxU32 dof = threadIndex; dof < dofCount; dof += WARP_SIZE)
|
|
{
|
|
PxU32 link = (PxU32)gravityCompensationForces[dof];
|
|
const ArticulationJointCoreData& jointData = arti.jointData[link];
|
|
const PxTransform& body2World = arti.linkBody2Worlds[link];
|
|
Cm::UnAlignedSpatialVector dofMotion = arti.motionMatrix[link][dof - jointData.jointOffset].rotate(body2World);
|
|
gravityCompensationForces[dof] = dofMotion.innerProduct(zAForces[link]);
|
|
}
|
|
|
|
__syncwarp();
|
|
}
|
|
}
|
|
}
|
|
|
|
// This is an optimized port of FeatherstoneArticulation::getCoriolisAndCentrifugalForce.
|
|
// We process one articulation per warp (32 threads).
|
|
extern "C" __global__ void computeArtiCentrifugalForces(
|
|
const PxU32 nbIndices,
|
|
float* PX_RESTRICT data,
|
|
const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
|
|
const PxU32 maxDofs,
|
|
const bool rootMotion,
|
|
const PxgArticulation* articulations)
|
|
{
|
|
const PxU32 jobIndex = threadIdx.y + blockIdx.x * blockDim.y;
|
|
const PxU32 threadIndex = threadIdx.x;
|
|
|
|
if (jobIndex < nbIndices)
|
|
{
|
|
const PxU32 artiIndex = gpuIndices[jobIndex];
|
|
const PxgArticulation& arti = articulations[artiIndex];
|
|
|
|
Dy::SpatialMatrix* PX_RESTRICT spatialInertias = reinterpret_cast<Dy::SpatialMatrix*>(arti.worldSpatialArticulatedInertia);
|
|
PxReal* PX_RESTRICT jointVelocities = reinterpret_cast<PxReal*>(arti.jointVelocities);
|
|
Cm::UnAlignedSpatialVector* PX_RESTRICT motionVelocities = reinterpret_cast<Cm::UnAlignedSpatialVector*>(arti.motionVelocities);
|
|
Cm::UnAlignedSpatialVector* PX_RESTRICT corioliseVectors = reinterpret_cast<Cm::UnAlignedSpatialVector*>(arti.corioliseVectors);
|
|
// corioliseVectors is used as a tmp buffer to store motion accelerations of all links
|
|
// arti.motionAccelerations cannot be used because it stores important information that should not be erased
|
|
Cm::UnAlignedSpatialVector* PX_RESTRICT motionAccelerations = reinterpret_cast<Cm::UnAlignedSpatialVector*>(arti.corioliseVectors);
|
|
Cm::UnAlignedSpatialVector* PX_RESTRICT zAForces = reinterpret_cast<Cm::UnAlignedSpatialVector*>(arti.zAForces);
|
|
|
|
const PxU32 linkCount = arti.data.numLinks;
|
|
const PxU32 dofCount = arti.data.numJointDofs;
|
|
const bool fixBase = arti.data.flags & PxArticulationFlag::eFIX_BASE;
|
|
const PxU32 rootDof = (rootMotion && !fixBase) ? 6 : 0; // Add the DoF of the root in the floating base case
|
|
const PxU32 bufferDof = rootMotion ? 6 : 0;
|
|
|
|
float* PX_RESTRICT coriolisForces = &data[jobIndex * (maxDofs + bufferDof)];
|
|
|
|
// Velocities
|
|
// It seems to be unnecessary to recalculate the motion velocities as we always call
|
|
// the update kinematic function before calling this function
|
|
if (threadIndex == 0)
|
|
{
|
|
for (PxU32 link = 1; link < linkCount; ++link)
|
|
{
|
|
const PxTransform& body2World = arti.linkBody2Worlds[link];
|
|
const PxU32 parentLink = arti.parents[link];
|
|
const ArticulationJointCoreData& jointData = arti.jointData[link];
|
|
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink];
|
|
const PxVec3 rw = body2World.p - parentBody2World.p;
|
|
const Cm::UnAlignedSpatialVector pVel = motionVelocities[parentLink];
|
|
|
|
Cm::UnAlignedSpatialVector vel = translateSpatialVector(-rw, pVel);
|
|
|
|
Cm::UnAlignedSpatialVector deltaV = Cm::UnAlignedSpatialVector::Zero();
|
|
for (PxU32 dof = 0; dof < jointData.nbDof; ++dof)
|
|
{
|
|
const PxReal maxJointVelocity = arti.joints[link].maxJointVelocity[dof];
|
|
const PxReal jointVelocity = jointVelocities[jointData.jointOffset + dof];
|
|
PxReal jVel = PxMin(jointVelocity, maxJointVelocity);
|
|
Cm::UnAlignedSpatialVector dofMotion = arti.motionMatrix[link][dof].rotate(body2World);
|
|
deltaV += dofMotion * jVel;
|
|
}
|
|
|
|
vel.top += deltaV.top;
|
|
vel.bottom += deltaV.bottom;
|
|
|
|
const PxVec3 aVec = deltaV.top;
|
|
const PxVec3 force = pVel.top.cross(aVec);
|
|
const PxVec3 lVel = deltaV.bottom;
|
|
const PxVec3 torque = pVel.top.cross(pVel.top.cross(rw)) + 2.f * pVel.top.cross(lVel) + aVec.cross(lVel);
|
|
|
|
corioliseVectors[link] = Cm::SpatialVectorF(force, torque);
|
|
motionVelocities[link] = vel;
|
|
}
|
|
}
|
|
|
|
__syncwarp();
|
|
|
|
if (threadIndex == 0)
|
|
{
|
|
motionAccelerations[0] = Cm::SpatialVectorF::Zero();
|
|
|
|
for (PxU32 link = 1; link < linkCount; ++link)
|
|
{
|
|
const PxTransform& body2World = arti.linkBody2Worlds[link];
|
|
const PxU32 parentLink = arti.parents[link];
|
|
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink];
|
|
const PxVec3 rw = body2World.p - parentBody2World.p;
|
|
|
|
Cm::UnAlignedSpatialVector pMotionAcceleration = translateSpatialVector(-rw, motionAccelerations[parentLink]);
|
|
|
|
motionAccelerations[link] = pMotionAcceleration + corioliseVectors[link];
|
|
}
|
|
}
|
|
|
|
__syncwarp();
|
|
|
|
for (PxU32 link = threadIndex; link < linkCount; link += WARP_SIZE)
|
|
{
|
|
const PxTransform& body2World = arti.linkBody2Worlds[link];
|
|
const float4 invIM = arti.linkProps[link].invInertiaXYZ_invMass;
|
|
const PxReal mass = invIM.w == 0.f ? 0.f : (1.f / invIM.w);
|
|
const PxVec3 localInertia = PxVec3(invIM.x == 0.f ? 0.f : (1.f / invIM.x), invIM.y == 0.f ? 0.f : (1.f / invIM.y), invIM.z == 0.f ? 0.f : (1.f / invIM.z));
|
|
PxMat33 I; Cm::transformInertiaTensor(localInertia, PxMat33(body2World.q), I);
|
|
const PxVec3& vA = motionVelocities[link].top;
|
|
zAForces[link].top = motionAccelerations[link].bottom * mass;
|
|
zAForces[link].bottom = vA.cross(I * vA) + I * motionAccelerations[link].top;
|
|
}
|
|
|
|
__syncwarp();
|
|
|
|
if (threadIndex == 0)
|
|
{
|
|
for (PxU32 link = (linkCount - 1); link > 0; --link)
|
|
{
|
|
const PxTransform& body2World = arti.linkBody2Worlds[link];
|
|
const PxU32 parentLink = arti.parents[link];
|
|
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink];
|
|
|
|
zAForces[parentLink] += translateSpatialVector(body2World.p - parentBody2World.p, zAForces[link]);
|
|
|
|
const ArticulationJointCoreData& jointData = arti.jointData[link];
|
|
PxReal* force = &coriolisForces[jointData.jointOffset];
|
|
|
|
for (PxU32 dof = 0; dof < jointData.nbDof; ++dof)
|
|
force[dof + rootDof] = link; // I'll just store dof's link index here and use it in the last loop
|
|
}
|
|
}
|
|
|
|
__syncwarp();
|
|
|
|
if (!fixBase)
|
|
{
|
|
// Compute links inertias
|
|
|
|
for (PxU32 link = threadIndex; link < linkCount; link += WARP_SIZE)
|
|
{
|
|
const float4 invIM = arti.linkProps[link].invInertiaXYZ_invMass;
|
|
const PxReal mass = invIM.w == 0.f ? 0.f : (1.f / invIM.w);
|
|
const PxVec3 localInertia = PxVec3(invIM.x == 0.f ? 0.f : (1.f / invIM.x), invIM.y == 0.f ? 0.f : (1.f / invIM.y), invIM.z == 0.f ? 0.f : (1.f / invIM.z));
|
|
Dy::SpatialMatrix spatialInertia;
|
|
spatialInertia.topLeft = PxMat33(PxZero);
|
|
spatialInertia.topRight = PxMat33::createDiagonal(PxVec3(mass));
|
|
Cm::transformInertiaTensor(localInertia, PxMat33(arti.linkBody2Worlds[link].q), spatialInertia.bottomLeft);
|
|
spatialInertias[link] = spatialInertia;
|
|
}
|
|
|
|
__syncwarp();
|
|
|
|
if (threadIndex == 0)
|
|
{
|
|
for (PxU32 link = linkCount - 1; link > 0; --link)
|
|
{
|
|
const PxTransform& body2World = arti.linkBody2Worlds[link];
|
|
const PxU32 parentLink = arti.parents[link];
|
|
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink];
|
|
const PxVec3 rw = body2World.p - parentBody2World.p;
|
|
PxMat33 skewSymmetric(PxVec3(0, rw.z, -rw.y), PxVec3(-rw.z, 0, rw.x), PxVec3(rw.y, -rw.x, 0));
|
|
Dy::SpatialMatrix parentSpaceSpatialInertia = spatialInertias[link];
|
|
translateInertia(skewSymmetric, parentSpaceSpatialInertia);
|
|
spatialInertias[parentLink] += parentSpaceSpatialInertia;
|
|
}
|
|
}
|
|
|
|
__syncwarp();
|
|
|
|
// Compute motion acceleration
|
|
|
|
if (threadIndex == 0)
|
|
{
|
|
if (!rootMotion)
|
|
{
|
|
Dy::SpatialMatrix invInertia = spatialInertias[0].invertInertia();
|
|
motionAccelerations[0] = -(invInertia * zAForces[0]);
|
|
}
|
|
|
|
for (PxU32 link = 1; link < linkCount; ++link)
|
|
{
|
|
const PxTransform& body2World = arti.linkBody2Worlds[link];
|
|
const PxU32 parentLink = arti.parents[link];
|
|
const PxTransform& parentBody2World = arti.linkBody2Worlds[parentLink];
|
|
motionAccelerations[link] = translateSpatialVector(parentBody2World.p - body2World.p, motionAccelerations[parentLink]);
|
|
zAForces[link] = spatialInertias[link] * motionAccelerations[link] + zAForces[link];
|
|
}
|
|
}
|
|
|
|
__syncwarp();
|
|
}
|
|
|
|
// Compute joint forces
|
|
|
|
for (PxU32 dof = threadIndex; dof < dofCount; dof += WARP_SIZE)
|
|
{
|
|
PxU32 link = (PxU32)coriolisForces[dof + rootDof];
|
|
const ArticulationJointCoreData& jointData = arti.jointData[link];
|
|
const PxTransform& body2World = arti.linkBody2Worlds[link];
|
|
Cm::UnAlignedSpatialVector dofMotion = arti.motionMatrix[link][dof - jointData.jointOffset].rotate(body2World);
|
|
coriolisForces[dof + rootDof] = dofMotion.innerProduct(zAForces[link]);
|
|
}
|
|
|
|
// Add root contribution
|
|
|
|
if (rootMotion && threadIndex == 0 && !fixBase)
|
|
{
|
|
coriolisForces[0] = zAForces[0].top.x;
|
|
coriolisForces[1] = zAForces[0].top.y;
|
|
coriolisForces[2] = zAForces[0].top.z;
|
|
coriolisForces[3] = zAForces[0].bottom.x;
|
|
coriolisForces[4] = zAForces[0].bottom.y;
|
|
coriolisForces[5] = zAForces[0].bottom.z;
|
|
}
|
|
|
|
__syncwarp();
|
|
}
|
|
}
|
|
|
|
// This is an optimized port of FeatherstoneArticulation::getArticulationCOM
|
|
// We process one articulation per warp (32 threads) and one link per thread.
|
|
// We have to separate the calculation of the total mass from the calculation
|
|
// of the sum of COM in case there are more than 32 links in one articulation.
|
|
extern "C" __global__ void computeArtiCOM(
|
|
PxVec3* PX_RESTRICT data,
|
|
const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
|
|
const bool rootFrame,
|
|
const PxgArticulation* PX_RESTRICT articulations)
|
|
{
|
|
const PxU32 jobIndex = threadIdx.y + blockIdx.x * blockDim.y;
|
|
const PxU32 threadIndex = threadIdx.x;
|
|
|
|
const PxU32 artiIndex = gpuIndices[jobIndex];
|
|
const PxgArticulation& arti = articulations[artiIndex];
|
|
|
|
PxVec3* articulationCOM = &data[jobIndex];
|
|
const PxU32 linkCount = arti.data.numLinks;
|
|
|
|
// calculate the total mass of the articulation and store it temporarily in articulationCOM->x
|
|
for(PxU32 link = threadIndex; link < linkCount; link += WARP_SIZE)
|
|
{
|
|
const PxReal mass = 1.0f / arti.linkProps[link].invInertiaXYZ_invMass.w;
|
|
|
|
// calculate the total mass and store it temporarily in articulationCOM->x
|
|
PxRedAddGlobal(&articulationCOM->x, mass);
|
|
}
|
|
|
|
__syncwarp();
|
|
|
|
PxReal totalMass = articulationCOM->x;
|
|
|
|
// calculate the COM as the sum of body COM
|
|
for(PxU32 link = threadIndex; link < linkCount; link += WARP_SIZE)
|
|
{
|
|
const PxVec3 childPose = arti.linkBody2Worlds[link].p;
|
|
const PxReal mass = 1.0f / arti.linkProps[link].invInertiaXYZ_invMass.w;
|
|
const PxVec3 COM = childPose * mass;
|
|
|
|
// calculate the COM as the sum of body COM
|
|
PxRedAddGlobal(&articulationCOM->x, COM[0]);
|
|
PxRedAddGlobal(&articulationCOM->y, COM[1]);
|
|
PxRedAddGlobal(&articulationCOM->z, COM[2]);
|
|
}
|
|
|
|
if(threadIndex == 0)
|
|
{
|
|
// remove the total mass from articulationCOM->x
|
|
articulationCOM->x -= totalMass;
|
|
|
|
*articulationCOM /= totalMass;
|
|
|
|
// move to the root frame
|
|
if(rootFrame)
|
|
{
|
|
*articulationCOM = arti.linkBody2Worlds[0].getInverse().transform(*articulationCOM);
|
|
}
|
|
}
|
|
|
|
__syncwarp();
|
|
}
|
|
|
|
// This is an optimized port of FeatherstoneArticulation::getCentroidalMomentumMatrix
|
|
// We process one articulation per warp (32 threads) and one link per thread.
|
|
extern "C" __global__ void computeArtiCentroidalMomentumMatrices(
|
|
float* PX_RESTRICT data,
|
|
const PxArticulationGPUIndex* PX_RESTRICT gpuIndices,
|
|
const PxU32 nbIndices,
|
|
const PxU32 maxDofs,
|
|
const PxgArticulation* PX_RESTRICT articulations)
|
|
{
|
|
const PxU32 jobIndex = threadIdx.y + blockIdx.x * blockDim.y;
|
|
const PxU32 threadIndex = threadIdx.x;
|
|
|
|
if(jobIndex < nbIndices)
|
|
{
|
|
const PxU32 artiIndex = gpuIndices[jobIndex];
|
|
const PxgArticulation& arti = articulations[artiIndex];
|
|
|
|
const PxU32 dofCount = arti.data.numJointDofs;
|
|
const bool fixBase = arti.data.flags & PxArticulationFlag::eFIX_BASE;
|
|
|
|
if(!fixBase)
|
|
{
|
|
const PxU32 startCoriolisForces = (maxDofs + 6) * (maxDofs + 6) * nbIndices;
|
|
const PxU32 startCentroidalMomentumMatrices = startCoriolisForces + (maxDofs + 6) * nbIndices;
|
|
const PxU32 startCentroidalMomentumBias = startCentroidalMomentumMatrices + 6 * (maxDofs + 6) * nbIndices;
|
|
|
|
float* PX_RESTRICT massMatrices = &data[jobIndex * (maxDofs + 6) * (maxDofs + 6)];
|
|
float* PX_RESTRICT coriolisForces = &data[startCoriolisForces + jobIndex * (maxDofs + 6)];
|
|
float* PX_RESTRICT centroidalMomentumMatrix = &data[startCentroidalMomentumMatrices + jobIndex * 6 * (maxDofs + 6)];
|
|
float* PX_RESTRICT centroidalMomentumBias = &data[startCentroidalMomentumBias + jobIndex * 6];
|
|
|
|
// adding mass matrix terms corresponding to the root DoFs
|
|
for(PxU32 j = threadIndex; j < 6 * (dofCount + 6); j += WARP_SIZE)
|
|
{
|
|
centroidalMomentumMatrix[j] = massMatrices[j];
|
|
}
|
|
|
|
// calculating the COM momentum from the spatial articulated inertia of the root in the mass matrix
|
|
const PxReal totalMass = massMatrices[2 * (dofCount + 6) + 2];
|
|
const PxVec3 pg = PxVec3(massMatrices[2 * (dofCount + 6) + 4], massMatrices[5], massMatrices[(dofCount + 6) + 3]) / totalMass;
|
|
const PxMat33 mat(PxVec3(0, pg.z, -pg.y), PxVec3(-pg.z, 0, pg.x), PxVec3(pg.y, -pg.x, 0));
|
|
|
|
// adding remaining contributions
|
|
for(PxU32 col = threadIndex; col < dofCount + 6; col += WARP_SIZE)
|
|
{
|
|
const PxVec3 linMassMatrices = PxVec3(massMatrices[0 * (dofCount + 6) + col], massMatrices[1 * (dofCount + 6) + col],
|
|
massMatrices[2 * (dofCount + 6) + col]);
|
|
const PxVec3 crossTermsCentroidalMomentumMatrix = mat * linMassMatrices;
|
|
for(PxU32 row = 3; row < 6; ++row)
|
|
{
|
|
centroidalMomentumMatrix[col + row * (dofCount + 6)] += crossTermsCentroidalMomentumMatrix[row - 3];
|
|
}
|
|
}
|
|
|
|
if(threadIndex == 0)
|
|
{
|
|
// adding coriolis force terms corresponding to the root DoFs
|
|
for(PxU32 row = 0; row < 6; ++row)
|
|
{
|
|
centroidalMomentumBias[row] = coriolisForces[row];
|
|
}
|
|
|
|
// adding remaining contributions
|
|
const PxVec3 linCoriolisForces = PxVec3(coriolisForces[0], coriolisForces[1], coriolisForces[2]);
|
|
const PxVec3 crossTermsCentroidalMomentumBias = mat * linCoriolisForces;
|
|
for(PxU32 row = 3; row < 6; ++row)
|
|
{
|
|
centroidalMomentumBias[row] += crossTermsCentroidalMomentumBias[row - 3];
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|