feat(physics): wire physx sdk into build

This commit is contained in:
2026-04-15 12:22:15 +08:00
parent 5bf258df6d
commit 31f40e2cbb
2044 changed files with 752,623 additions and 1 deletion

View File

@@ -0,0 +1,203 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_ARTICULATION_CORE_H
#define PXG_ARTICULATION_CORE_H
#include "PxDirectGPUAPI.h"
#include "foundation/PxPreprocessor.h"
#include "foundation/PxSimpleTypes.h"
#include "foundation/PxPinnedArray.h"
#include "foundation/PxUserAllocated.h"
#include "PxgCudaBuffer.h"
#if !PX_CUDA_COMPILER
#include <vector_types.h>
#endif
#include "DyFeatherstoneArticulation.h"
#include "PxgArticulation.h"
namespace physx
{
//this is needed to force PhysXArticulationGpu linkage as Static Library!
void createPxgArticulation();
class PxgCudaKernelWranglerManager;
class PxCudaContextManager;
class PxCudaContext;
class PxgHeapMemoryAllocatorManager;
class PxgGpuContext;
struct PxgSolverReferences;
struct PxgSolverBodySleepData;
struct PxgArticulationCoreDesc;
struct PxgArticulationOutputDesc;
struct PxIndexDataPair;
class PxSceneDesc;
// GPU articulation core: owns the CUDA stream/events and the device-side
// buffers consumed by the articulation solver kernels, uploads/downloads the
// articulation descriptors, and implements the Direct-GPU API read/write/
// compute entry points for articulation state.
class PxgArticulationCore : public PxUserAllocated
{
public:
	// gpuKernelWrangler launches the articulation kernels; cudaContextManager/
	// heapMemoryManager provide the CUDA context and device heap the buffers
	// below are allocated from.
PxgArticulationCore(PxgCudaKernelWranglerManager* gpuKernelWrangler, PxCudaContextManager* cudaContextManager, PxgHeapMemoryAllocatorManager* heapMemoryManager);
~PxgArticulationCore();
	// DMA the articulation core descriptor (dt, gravity, inverse length scale,
	// TGS external-force flag) up to the device for [offset, offset+nbArticulations).
void gpuMemDmaUpArticulationDesc(const PxU32 offset, const PxU32 nbArticulations, PxReal dt, const PxVec3& gravity, const PxReal invLengthScale, const bool isExternalForcesEveryTgsIterationEnabled);
void createStaticContactAndConstraintsBatch(const PxU32 nbArticulations);
	// Runs the unconstrained-velocity kernels; returns a PxU32 whose meaning is
	// defined at the call site (presumably a dof/articulation count — confirm in the .cpp).
PxU32 computeUnconstrainedVelocities(const PxU32 offset, const PxU32 nbArticulations, PxReal dt, const PxVec3& gravity, const PxReal invLengthScale, const bool isExternalForcesEveryTgsIterationEnabled, bool recomputeBlockFormat);
PxU32 setupInternalConstraints(const PxU32 nbArticulations, const PxReal stepDt, const PxReal dt, const PxReal invDt, const bool isTGSSolver);
	// Blocks until the work queued on mStream has completed.
void syncStream();
	// Builds the per-partition impulse-propagation dependency data (see
	// mPathToRootPerPartition / mDirtyLinksPerPartition / mImpulseScalePerPartition).
void precomputeDependencies(const PxU32 nbPartitions);
void syncUnconstrainedVelocities();
void propagateRigidBodyImpulsesAndSolveInternalConstraints(const PxReal dt, const PxReal invDt, const bool velocityIteration, const PxReal elapsedTime,
const PxReal biasCoefficient, PxU32* staticContactUniqueIndices, PxU32* staticJointUniqueIndices,
CUdeviceptr sharedDesc, bool doFriction, bool isTGS, bool residualReportingEnabled, bool isExternalForcesEveryTgsIterationEnabled = false);
//These two methods are for articulation vs soft body interaction
void outputVelocity(CUdeviceptr sharedDesc, CUstream solverStream, bool isTGS);
void pushImpulse(CUstream solverStream);
void stepArticulation(const PxReal stepDt);
	// Averages accumulated delta velocities across solver slabs for one partition.
void averageDeltaV(const PxU32 nbSlabs, CUstream stream, float4* velocities, const PxU32 partitionId,
bool isTGS, CUdeviceptr sharedDescd);
void applyTgsSubstepForces(PxReal stepDt, CUstream stream);
void saveVelocities();
void updateBodies(PxReal dt, bool integrate, bool enableDirectGPUAPI);
	// DMA link/joint/root state, sleep data and solver residuals back to the
	// pinned host arrays supplied by the caller.
void gpuMemDMAbackArticulation(PxInt8ArrayPinned& linkAndJointAndRootStateData,
PxPinnedArray<PxgSolverBodySleepData>& wakeCounterPool, PxPinnedArray<Dy::ErrorAccumulator>& internalResidualPerArticulation, PxPinnedArray<Dy::ErrorAccumulator>& contactResidual);
	// Stores a pointer to the caller-owned solver stream; the stream must outlive this object.
void setSolverStream(CUstream& solverStream) { mSolverStream = &solverStream; }
void setGpuContext(PxgGpuContext* context) { mGpuContext = context; }
	// Host-side descriptor and its device mirror.
PxgArticulationCoreDesc* getArticulationCoreDesc() { return mArticulationCoreDesc; }
CUdeviceptr getArticulationCoreDescd() { return mArticulationCoreDescd.getDevicePtr(); }
CUdeviceptr getDeferredZ() { return mDeltaVs.getDevicePtr(); }
CUdeviceptr getArticulationDirty() { return mSlabHasChanges.getDevicePtr(); }
CUdeviceptr getArticulationSlabMask() { return mSlabDirtyMasks.getDevicePtr(); }
PxU32 getNbActiveArticulations() const { return mNbActiveArticulation; }
CUstream getStream() { return mStream; }
CUevent getFlushArticulationDataEvent() { return mFlushArticulationDataEvent; }
	// Inserts cross-stream synchronization with the broad-phase and narrow-phase streams.
void synchronizedStreams(CUstream bpStream, CUstream npStream);
	// Direct-GPU API: batched read of articulation data selected by dataType.
	// The max* arguments give the per-articulation capacity used to stride the output.
bool getArticulationData(void* data, const PxArticulationGPUIndex* gpuIndices, PxArticulationGPUAPIReadType::Enum dataType, PxU32 nbElements, CUevent startEvent, CUevent finishEvent, PxU32 maxLinks, PxU32 maxDofs, PxU32 maxFixedTendons, PxU32 maxTendonJoints, PxU32 maxSpatialTendons, PxU32 maxSpatialAttachments) const;
	// Direct-GPU API: batched write counterpart of getArticulationData.
bool setArticulationData(const void* data, const PxArticulationGPUIndex* gpuIndices, PxArticulationGPUAPIWriteType::Enum dataType, PxU32 nbElements, CUevent startEvent, CUevent finishEvent, PxU32 maxLinks, PxU32 maxDofs, PxU32 maxFixedTendons, PxU32 maxTendonJoints, PxU32 maxSpatialTendons, PxU32 maxSpatialAttachments);
bool computeArticulationData(void* data, const PxArticulationGPUIndex* gpuIndices, PxArticulationGPUAPIComputeType::Enum operation, PxU32 nbElements,
PxU32 maxLinks, PxU32 maxDofs, CUevent startEvent, CUevent finishEvent);
// needed if root transforms or joint positions are updated using direct-GPU API. needs to be called before computeUnconstrainedVelocities.
void updateArticulationsKinematic(bool zeroSimOutput, const PxArticulationGPUIndex* PX_RESTRICT gpuIndices=NULL, PxU32 nbElements=0);
void allocDeltaVBuffer(PxU32 nbSlabs, PxU32 nbPartitions, CUstream stream);
void layoutDeltaVBuffer(const PxU32 nbSlabs, const PxU32 nbPartitions, CUstream stream);
private:
// new Direct-GPU API methods
	// Per-category getters/setters backing get/set/computeArticulationData;
	// each handles one PxArticulationGPUAPI read/write type family.
bool getDofStates(void* data, const PxArticulationGPUIndex* gpuIndices, PxU32 nbElements, PxU32 maxDofs, PxArticulationGPUAPIReadType::Enum dataType) const;
bool getTransformStates(void* data, const PxArticulationGPUIndex* gpuIndices, PxU32 nbElements, PxU32 maxLinks, PxArticulationGPUAPIReadType::Enum dataType) const;
bool getLinkVelocityStates(void* data, const PxArticulationGPUIndex* gpuIndices, PxU32 nbElements, PxU32 maxLinks, PxArticulationGPUAPIReadType::Enum dataType) const;
bool getLinkSpatialForceStates(void* data, const PxArticulationGPUIndex* gpuIndices, PxU32 nbElements, PxU32 maxLinks, PxArticulationGPUAPIReadType::Enum dataType) const;
bool setDofStates(const void* data, const PxArticulationGPUIndex* gpuIndices, PxU32 nbElements, PxU32 maxDofs, PxArticulationGPUAPIWriteType::Enum dataType);
bool setRootGlobalPoseStates(const void* data, const PxArticulationGPUIndex* gpuIndices, PxU32 nbElements);
bool setRootVelocityStates(const void* data, const PxArticulationGPUIndex* gpuIndices, PxU32 nbElements, PxArticulationGPUAPIWriteType::Enum dataType);
bool setLinkForceStates(const void* data, const PxArticulationGPUIndex* gpuIndices, PxU32 nbElements, PxU32 maxLinks);
bool setLinkTorqueStates(const void* data, const PxArticulationGPUIndex* gpuIndices, PxU32 nbElements, PxU32 maxLinks);
bool setTendonStates(const void* data, const PxArticulationGPUIndex* gpuIndices, PxU32 nbElements, PxU32 maxTendons, PxArticulationGPUAPIWriteType::Enum dataType);
bool setSpatialTendonAttachmentStates(const void* data, const PxArticulationGPUIndex* gpuIndices, PxU32 nbElements, PxU32 maxTendonsXmaxAttachments);
bool setFixedTendonJointStates(const void* data, const PxArticulationGPUIndex* gpuIndices, PxU32 nbElements, PxU32 maxFixedTendonsXmaxTendonJoints);
bool getTendonStates(void* data, const PxArticulationGPUIndex* gpuIndices, PxU32 nbElements, PxU32 maxTendons, PxArticulationGPUAPIReadType::Enum dataType) const;
bool getSpatialTendonAttachmentStates(void* data, const PxArticulationGPUIndex* gpuIndices, PxU32 nbElements, PxU32 maxTendonsXmaxAttachments) const;
bool getFixedTendonJointStates(void* data, const PxArticulationGPUIndex* gpuIndices, PxU32 nbElements, PxU32 maxFixedTendonsXmaxTendonJoints) const;
	// Host-side descriptors (device mirrors are the *Descd buffers below).
PxgArticulationCoreDesc* mArticulationCoreDesc;
PxgArticulationOutputDesc* mArticulationOutputDesc;
PxgCudaKernelWranglerManager* mGpuKernelWranglerManager;
PxCudaContextManager* mCudaContextManager;
PxCudaContext* mCudaContext;
	// Stream owned by this core; mSolverStream points at the solver's stream (set via setSolverStream).
CUstream mStream;
CUstream* mSolverStream;
CUevent mFinishEvent;
CUevent mFlushArticulationDataEvent;
PxgGpuContext* mGpuContext;
PxgTypedCudaBuffer<PxgArticulationCoreDesc> mArticulationCoreDescd;
PxgTypedCudaBuffer<PxgArticulationOutputDesc> mArticulationOutputDescd;
PxU32 mNbActiveArticulation;
	// Per-slab/partition solver scratch (see allocDeltaVBuffer/layoutDeltaVBuffer and the getters above).
PxgTypedCudaBuffer<Cm::UnAlignedSpatialVector> mDeltaVs;
PxgTypedCudaBuffer<uint2> mSlabHasChanges;
PxgTypedCudaBuffer<uint4> mSlabDirtyMasks;
PxgTypedCudaBuffer<PxgArticulationBitFieldStackData> mPathToRootPerPartition;
PxgTypedCudaBuffer<PxU32> mDirtyLinksPerPartition;
PxgTypedCudaBuffer<PxReal> mImpulseScalePerPartition;
	// Temporary block buffers for static/self contact and constraint batching
	// (see createStaticContactAndConstraintsBatch).
PxgTypedCudaBuffer<PxU32> mTempContactUniqueIndicesBlockBuffer;
PxgTypedCudaBuffer<PxU32> mTempConstraintUniqueIndicesBlockBuffer;
PxgTypedCudaBuffer<PxU32> mTempContactHeaderBlockBuffer;
PxgTypedCudaBuffer<PxU32> mTempConstraintHeaderBlockBuffer;
PxgTypedCudaBuffer<PxU32> mTempSelfContactUniqueIndicesBlockBuffer;
PxgTypedCudaBuffer<PxU32> mTempSelfConstraintUniqueIndicesBlockBuffer;
PxgTypedCudaBuffer<PxU32> mTempSelfContactHeaderBlockBuffer;
PxgTypedCudaBuffer<PxU32> mTempSelfConstraintHeaderBlockBuffer;
CUevent mComputeUnconstrainedEvent;
	// Set when direct-GPU writes require updateArticulationsKinematic before the next sim step.
bool mNeedsKinematicUpdate;
#if PX_SUPPORT_OMNI_PVD
	// Omni PVD instrumentation: mirrors direct-GPU writes into pinned staging
	// buffers for the debugger.
void ovdArticulationCallback(const void* PX_RESTRICT data, const PxRigidDynamicGPUIndex* PX_RESTRICT gpuIndices,
PxArticulationGPUAPIWriteType::Enum dataType, PxU32 nbElements,
PxU32 maxLinks, PxU32 maxDofs, PxU32 maxFixedTendons, PxU32 maxTendonJoints, PxU32 maxSpatialTendons, PxU32 maxSpatialTendonAttachments);
PxPinnedArray<PxU8> mOvdDataBuffer;
PxPinnedArray<PxU8> mOvdIndexBuffer;
#endif
};
}
#endif

View File

@@ -0,0 +1,145 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_ARTICULATION_CORE_DESC_H
#define PXG_ARTICULATION_CORE_DESC_H
#include "foundation/PxSimpleTypes.h"
#include "foundation/PxVec3.h"
#include "PxgArticulation.h"
namespace physx
{
struct PxgBodySim;
struct PxgSolverBodySleepData;
class PxGpuTendonAttachmentData;
class PxGpuTendonJointCoefficientData;
namespace IG
{
class NodeIndex;
};
namespace Cm
{
struct UnAlignedSpatialVector;
}
// Device-visible descriptor holding all pointers and parameters the
// articulation solver kernels need. Filled on the host and DMA'd to the GPU
// (see PxgArticulationCore::gpuMemDmaUpArticulationDesc).
struct PxgArticulationCoreDesc
{
	// Device buffer of per-body simulation data.
PxgBodySim* mBodySimBufferDeviceData;
PxgSolverBodySleepData* articulationSleepData; //sleep data for the articulation
	// Island node indices; articulation entries start at articulationOffset.
PxNodeIndex* islandNodeIndices;
PxU32* solverBodyIndices;
	// Per-articulation GPU data, one entry per active articulation.
PxgArticulation* articulations;
PxU32 articulationOffset;//offset to the islandNodeIndices
PxU32 nbArticulations;
	// Simulation step parameters uploaded each step.
PxReal dt;
PxVec3 gravity;
PxReal invLengthScale;
bool isExternalForcesEveryTgsIterationEnabled;
	// Accumulated spatial impulses (layout defined by the kernels that use it).
Cm::UnAlignedSpatialVector* impulses;
PxU32 nbSlabs;
PxU32 nbPartitions;
uint2* slabHasChanges; // one uint2 per articulation per slab. Stores up to two link indices to communicate which links holds an impulse from the solver.
uint4* slabDirtyMasks; // one uint4 [linkIndexA, writeIndexA, linkIndexB, writeIndexB] per articulation per partition per slab, index = articulationId + slab * nbArticulations + partitionId * nbArticulations * nbSlabs. Used for internal velocity propagation (artiPropagateVelocityInternal).
//The dirty paths we need to process for each partition. This defines
//how we propagate impulses in the articulations. There is one of these
//per partition, per articulation
PxgArticulationBitFieldStackData* mPathToRootsPerPartition;
//This defines which link is holding the current accumulated impulse. There is
//one entry per articulation per partition
PxU32* mImpulseHoldingLink;
PxReal* mPartitionAverageScale; // One float per partition per articulation. Stores 1/numSlabsInThisPartitionInvolvingThisArticulation, see artiComputeDependencies
	// Block-format (SoA, warp-per-articulation) articulation data used by the
	// solver kernels: per-articulation, per-link, traversal stacks and bitfield scratch.
PxgArticulationBlockData* mArticulationBlocks;
PxgArticulationBlockLinkData* mArticulationLinkBlocks;
PxgArticulationTraversalStackData* mArticulationTraversalStackBlocks;
PxgArticulationBitFieldStackData* mTempPathToRootBitFieldBlocks;
PxgArticulationBitFieldStackData* mTempSharedBitFieldBlocks;
PxgArticulationBitFieldStackData* mTempRootBitFieldBlocks;
//A quick reminder of the indexing of mPathToRootBitFieldBlocks.
//A bitfield is just a PxU64. It can describe the path to root for a single link in an articulation with 64 links.
//We support more than 64 links so we need a bitfield array to describe the path to root for a single link.
//For each link we need bitfield[maxWordCount] with maxWordCount = (maxLinks+63)/64 to describe the path to the root.
//For each articulation we need bitField[maxLinks*maxWordCount] to describe the path to root for all links.
//The array of bitfields for an articulation with globalWarpIndex will begin at mPathToRootBitFieldBlocks[globalWarpIndex*(maxLinks*maxWordCount))
PxgArticulationBitFieldData* mPathToRootBitFieldBlocks;
	// Block-format dof, tendon, attachment and mimic-joint data.
PxgArticulationBlockDofData* mArticulationDofBlocks;
PxgArticulationBlockSpatialTendonData* mArticulationSpatialTendonBlocks;
PxgArticulationInternalTendonConstraintData* mArticulationSpatialTendonConstraintBlocks;
PxgArticulationBlockAttachmentData* mArticulationAttachmentBlocks;
PxgArticulationBlockFixedTendonData* mArticulationFixedTendonBlocks;
PxgArticulationInternalTendonConstraintData* mArticulationFixedTendonConstraintBlocks;
PxgArticulationBlockTendonJointData* mArticulationTendonJointBlocks;
PxgArticulationBlockMimicJointData* mArticulationMimicJointBlocks;
	// Per-articulation capacity maxima used to stride the block buffers above.
PxU32 mMaxLinksPerArticulation;
PxU32 mMaxDofsPerArticulation;
PxU32 mMaxMimicJointsPerArticulation;
PxU32 mMaxSpatialTendonsPerArticulation;
PxU32 mMaxAttachmentPerArticulation;
PxU32 mMaxFixedTendonsPerArticulation;
PxU32 mMaxTendonJointPerArticulation;
	// Scratch blocks for static/self contact and constraint batching (device
	// mirrors of PxgArticulationCore's mTemp* buffers).
PxU32* mTempContactUniqueIndicesBlock;
PxU32* mTempConstraintUniqueIndicesBlock;
PxU32* mTempContactHeaderBlock;
PxU32* mTempConstraintHeaderBlock;
PxU32* mTempSelfContactUniqueIndicesBlock;
PxU32* mTempSelfConstraintUniqueIndicesBlock;
PxU32* mTempSelfContactHeaderBlock;
PxU32* mTempSelfConstraintHeaderBlock;
	// Accumulates contact residual error across all articulations.
Dy::ErrorAccumulator mContactErrorAccumulator;
};
// Descriptor for the per-articulation output written back to the host after
// the solve (see PxgArticulationCore::gpuMemDMAbackArticulation).
struct PxgArticulationOutputDesc
{
public:
//see PxgArticulationLinkJointRootStateData
PxU8* linkAndJointAndRootStateData;
//PxReal* jointPosition_Vel_Accel;
	// Per-articulation sleep/wake-counter data.
PxgSolverBodySleepData* sleepData;
Dy::ErrorAccumulator* errorAccumulator; //Per articulation, collects internal residuals (no contacts or external PxJoints connected to the articulation)
Dy::ErrorAccumulator* contactResidualAccumulator; //Only one value accumulating contact residuals over all articulations
};
}
#endif

View File

@@ -0,0 +1,76 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_ARTICULATION_CORE_KERNEL_INDICES_H
#define PXG_ARTICULATION_CORE_KERNEL_INDICES_H
namespace physx
{
// Thread-block dimensions (threads per block) used when launching the
// articulation CUDA kernels. Values are tuned per kernel; the GET/SET entries
// back the Direct-GPU API state transfer kernels.
struct PxgArticulationCoreKernelBlockDim
{
enum
{
		// NOTE(review): "VELOCITES" is misspelled, but renaming would break
		// every existing launch site; kept as-is.
COMPUTE_UNCONSTRAINED_VELOCITES = 64,
UPDATE_BODIES = 128,
SOLVE_INTERNAL_CONSTRAINTS = 32,
COMPUTE_UNCONSTRAINED_SPATIAL_INERTIA = 32,
COMPUTE_UNCONSTRAINED_SPATIAL_INERTIA_PARTIAL = 64,
COMPUTE_STATIC_CONTACT_CONSTRAINT_COUNT = 512,
APPLY_ARTI_STATE = 512,
ARTI_GET_DOF_STATES = 512,
ARTI_GET_TRANSFORM_STATES = 512,
ARTI_GET_VELOCITY_STATES = 480,
ARTI_GET_SPATIAL_FORCE_STATES = 512,
ARTI_GET_TENDON_STATE = 512,
ARTI_GET_SPATIAL_TENDON_ATTACHMENT_STATE = 512,
ARTI_GET_FIXED_TENDON_JOINT_STATE = 512,
ARTI_SET_DOF_STATES = 512,
ARTI_SET_ROOT_GLOBAL_POSE_STATE = 512,
ARTI_SET_ROOT_VELOCITY_STATE = 480,
ARTI_SET_LINK_FORCE_STATE = 480,
ARTI_SET_LINK_TORQUE_STATE = 512,
ARTI_SET_TENDON_STATE = 512,
ARTI_SET_SPATIAL_TENDON_ATTACHMENT_STATE = 512,
ARTI_SET_FIXED_TENDON_JOINT_STATE = 512
};
};
// Grid dimensions (number of blocks) for kernels launched with a fixed grid
// size rather than one sized from the problem.
struct PxgArticulationCoreKernelGridDim
{
enum
{
COMPUTE_UNCONSTRAINED_VELOCITES = 64,
UPDATE_BODIES = 64,
UPDATE_KINEMATIC = 1024 // AD: see inline comment in PxgArticulationCore
};
};
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,449 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __ARTI_DYNAMIC_CUH__
#define __ARTI_DYNAMIC_CUH__
#include "PxgArticulation.h"
#include "PxgArticulationLink.h"
#include "DyFeatherstoneArticulationUtils.h"
#include "DyFeatherstoneArticulation.h"
//This function stores Q-stZ to mDeferredQstZ
// Propagates a spatial impulse Z from a child link across its inbound joint
// (up to 3 dofs) to the parent. For each dof:
//   stZ = Q - s^T * Z   (Q comes from the optional jointForce, scaled by jointForceMultiplier)
// stZ is accumulated into dofData[ind].mDeferredQstZ and the joint response
// isInvD * stZ is added to the impulse before it is translated into the parent
// frame via childToParent.
// threadIndexInWarp selects this articulation's lane in the block-format (SoA) data.
static __device__ Cm::UnAlignedSpatialVector propagateImpulseW_0(const PxVec3& childToParent,
PxgArticulationBlockDofData* PX_RESTRICT dofData, const Cm::UnAlignedSpatialVector& Z,
const PxU32 dofCount, const PxU32 threadIndexInWarp,
const PxReal* PX_RESTRICT jointForce = NULL, const PxReal jointForceMultiplier = 1.0f)
{
Cm::UnAlignedSpatialVector temp = Z;
Cm::UnAlignedSpatialVector sas[3];
Cm::UnAlignedSpatialVector isInvD[3];
PxReal jf[3];
// the split into two separate loops is an optimization that allows dispatching the loads as early as possible.
#pragma unroll 3
for (PxU32 ind = 0; ind < 3; ++ind)
{
if (ind < dofCount)
{
sas[ind] = loadSpatialVector(dofData[ind].mWorldMotionMatrix, threadIndexInWarp);
isInvD[ind] = loadSpatialVector(dofData[ind].mIsInvDW, threadIndexInWarp);
jf[ind] = (jointForce ? jointForce[ind] * jointForceMultiplier : 0.0f);
}
}
#pragma unroll 3
for (PxU32 ind = 0; ind < 3; ++ind)
{
if (ind < dofCount)
{
const PxReal stZ = jf[ind] - sas[ind].innerProduct(Z);
dofData[ind].mDeferredQstZ[threadIndexInWarp] += stZ;
temp += isInvD[ind] * stZ;
}
}
//parent space's spatial zero acceleration impulse
return Dy::FeatherstoneArticulation::translateSpatialVector(childToParent, temp);
}
// Temp-buffer variant of impulse propagation: accumulates -s^T * Z into
// mTmpQstZ (instead of mDeferredQstZ) for each dof of the joint, adds the
// joint response isInvD * (-s^T * Z) to the impulse, and returns the result
// translated into the parent frame via childToParent.
static __device__ Cm::UnAlignedSpatialVector propagateImpulseWTemp(const PxVec3& childToParent,
PxgArticulationBlockDofData* PX_RESTRICT dofData, const Cm::UnAlignedSpatialVector& Z,
const PxU32 dofCount, const PxU32 threadIndexInWarp)
{
assert(dofCount<=3);
Cm::UnAlignedSpatialVector accumulated = Z;
for (PxU32 dof = 0; dof < 3; ++dof)
{
if (dof >= dofCount)
continue;
PxgArticulationBlockDofData& dofBlock = dofData[dof];
const Cm::UnAlignedSpatialVector motionAxis = loadSpatialVector(dofBlock.mWorldMotionMatrix, threadIndexInWarp);
const Cm::UnAlignedSpatialVector response = loadSpatialVector(dofBlock.mIsInvDW, threadIndexInWarp);
// joint-space impulse for this dof (no external joint force in this variant)
const PxReal minusStZ = -motionAxis.innerProduct(Z);
dofBlock.mTmpQstZ[threadIndexInWarp] += minusStZ;
accumulated += response * minusStZ;
}
// translate the accumulated spatial zero-acceleration impulse into parent space
return Dy::FeatherstoneArticulation::translateSpatialVector(childToParent, accumulated);
}
// Impulse propagation writing into a caller-supplied qstZ accumulator instead
// of the block-format buffers. For each dof:
//   QMinusSTZ = jointDofImpulse - s^T * Z
// is added to qstZ[dof], and the joint response isInvD * QMinusSTZ is folded
// into the impulse before translating it into the parent frame.
// jointDofImpulses may be NULL, in which case the per-dof impulse is zero.
static __device__ Cm::UnAlignedSpatialVector propagateImpulseW_1(
const PxVec3& childToParent,
const PxgArticulationBlockDofData* PX_RESTRICT dofData,
const Cm::UnAlignedSpatialVector& Z,
const PxReal* jointDofImpulses, const PxU32 dofCount,
const PxU32 threadIndexInWarp,
PxReal* qstZ)
{
assert(dofCount<=3);
Cm::UnAlignedSpatialVector accumulated = Z;
for (PxU32 dof = 0; dof < 3; ++dof)
{
if (dof >= dofCount)
continue;
const Cm::UnAlignedSpatialVector motionAxis = loadSpatialVector(dofData[dof].mWorldMotionMatrix, threadIndexInWarp);
const Cm::UnAlignedSpatialVector response = loadSpatialVector(dofData[dof].mIsInvDW, threadIndexInWarp);
const PxReal dofImpulse = jointDofImpulses ? jointDofImpulses[dof] : 0.0f;
const PxReal qMinusStZ = dofImpulse - motionAxis.innerProduct(Z);
qstZ[dof] += qMinusStZ;
accumulated += response * qMinusStZ;
}
// translate the accumulated spatial zero-acceleration impulse into parent space
return Dy::FeatherstoneArticulation::translateSpatialVector(childToParent, accumulated);
}
// Propagates a velocity change hDeltaV from a parent link down across a joint
// (up to 3 dofs) given raw per-dof arrays: invStIsT = rows of (s^T I s)^-1,
// motionMatrix = world-space motion axes s, IsW = I*s, qstZ = accumulated
// joint-space impulses. Computes the per-dof joint delta
//   jDelta = (s^T I s)^-1 * (qstZ - (I s)^T * translated(hDeltaV))
// and returns the child's velocity change: translated parent delta plus the
// joint contribution sum(s * jDelta).
static __device__ Cm::UnAlignedSpatialVector propagateAccelerationW(const PxVec3& c2p, const float3* invStIsT,
const Cm::UnAlignedSpatialVector* motionMatrix, const Cm::UnAlignedSpatialVector& hDeltaV, const PxU32 dofCount,
const Cm::UnAlignedSpatialVector* IsW, const PxReal* qstZ)
{
Cm::UnAlignedSpatialVector pDeltaV = Dy::FeatherstoneArticulation::translateSpatialVector(-c2p, hDeltaV); //parent velocity change
//Convert parent velocity change into an impulse
PxReal tJointDelta[3] = { 0.f, 0.f, 0.f };
#pragma unroll 3
for (PxU32 ind = 0; ind < 3; ++ind)
{
if (ind < dofCount)
{
//stI * pAcceleration
const PxReal temp = IsW[ind].innerProduct(pDeltaV);
tJointDelta[ind] = (qstZ[ind] - temp);
}
}
assert(dofCount<=3);
for (PxU32 ind = 0; ind < 3; ++ind)
{
if(ind<dofCount)
{
// apply the ind-th row of (s^T I s)^-1 to the joint-space RHS
const float3 iStIsTi = invStIsT[ind];
const PxReal jDelta = iStIsTi.x * tJointDelta[0]
+ iStIsTi.y * tJointDelta[1]
+ iStIsTi.z * tJointDelta[2];
pDeltaV += motionMatrix[ind] * jDelta;
}
}
return pDeltaV;
}
// Computes the spatial velocity change contributed by a joint's dofs given the
// joint-space right-hand side QSTZMinusISDotTranslatedParentDeltaV
// (= Q - s^T Z - (I s)^T * translated parent deltaV, one entry per dof).
// Applies the per-dof rows of (s^T I s)^-1 (mInvStIsT_x/y/z) to obtain each
// dof's delta speed jDelta, optionally writes it to jointDeltaDofSpeeds (may
// be NULL), and returns sum(s * jDelta).
static __device__ Cm::UnAlignedSpatialVector computeSpatialJointDelta(
const PxgArticulationBlockDofData* PX_RESTRICT dofData,
const PxReal* PX_RESTRICT QSTZMinusISDotTranslatedParentDeltaV, PxReal* PX_RESTRICT jointDeltaDofSpeeds, const PxU32 dofCount,
const PxU32 threadIndexInWarp)
{
Cm::UnAlignedSpatialVector sas[3];
// the split into two separate loops is an optimization that allows dispatching the loads as early as possible.
#pragma unroll 3
for (PxU32 ind = 0; ind < 3; ++ind)
{
if (ind < dofCount)
{
sas[ind] = loadSpatialVector(dofData[ind].mWorldMotionMatrix, threadIndexInWarp);
}
}
Cm::UnAlignedSpatialVector jointSpatialDeltaV(PxVec3(0.f), PxVec3(0.f));
#pragma unroll 3
for (PxU32 ind = 0; ind < 3; ++ind)
{
if (ind < dofCount)
{
// row ind of (s^T I s)^-1, stored SoA per warp lane
const float iStIsTi_x = dofData[ind].mInvStIsT_x[threadIndexInWarp];
const float iStIsTi_y = dofData[ind].mInvStIsT_y[threadIndexInWarp];
const float iStIsTi_z = dofData[ind].mInvStIsT_z[threadIndexInWarp];
const PxReal jDelta = iStIsTi_x * QSTZMinusISDotTranslatedParentDeltaV[0]
+ iStIsTi_y * QSTZMinusISDotTranslatedParentDeltaV[1]
+ iStIsTi_z * QSTZMinusISDotTranslatedParentDeltaV[2];
if(jointDeltaDofSpeeds)
jointDeltaDofSpeeds[ind] = jDelta;
jointSpatialDeltaV += sas[ind] * jDelta;
}
}
return jointSpatialDeltaV;
}
//This function use mDeferredQstZ
// Overload reading the accumulated joint-space impulses from the block-format
// mDeferredQstZ buffer. Translates the parent velocity change across -c2p,
// forms the per-dof RHS (mDeferredQstZ - (I s)^T * pDeltaV), and adds the
// joint's spatial delta (optionally reporting per-dof speeds via
// jointDeltaDofSpeeds, which may be NULL).
static __device__ Cm::UnAlignedSpatialVector propagateAccelerationW(const PxVec3& c2p,
const PxgArticulationBlockDofData* PX_RESTRICT dofData,
const Cm::UnAlignedSpatialVector& hDeltaV,
const PxU32 dofCount, PxReal* jointDeltaDofSpeeds, const PxU32 threadIndexInWarp)
{
const Cm::UnAlignedSpatialVector pDeltaV = Dy::FeatherstoneArticulation::translateSpatialVector(-c2p, hDeltaV); //parent velocity change
//[(Q - S^T *Z)] - [(I*S).innerProduct(translated(parentDeltaV))]
PxReal QSTZMinusISDotTranslatedParentDeltaV[3] = { 0.f, 0.f, 0.f };
Cm::UnAlignedSpatialVector IsW;
#pragma unroll(3)
for (PxU32 ind = 0; ind < 3; ++ind)
{
if (ind < dofCount)
{
IsW = loadSpatialVector(dofData[ind].mIsW, threadIndexInWarp);
//stI * pAcceleration
const PxReal temp = IsW.innerProduct(pDeltaV);
QSTZMinusISDotTranslatedParentDeltaV[ind] = (dofData[ind].mDeferredQstZ[threadIndexInWarp] - temp);
}
}
const Cm::UnAlignedSpatialVector jointSpatialDeltaV = computeSpatialJointDelta(dofData, QSTZMinusISDotTranslatedParentDeltaV, jointDeltaDofSpeeds, dofCount, threadIndexInWarp);
return pDeltaV + jointSpatialDeltaV;
}
// Temp-buffer overload of acceleration propagation: identical to the
// mDeferredQstZ variant above but reads the accumulated joint-space impulses
// from mTmpQstZ and does not report per-dof delta speeds.
static __device__ Cm::UnAlignedSpatialVector propagateAccelerationWTemp(const PxVec3& c2p,
const PxgArticulationBlockDofData* PX_RESTRICT dofData,
const Cm::UnAlignedSpatialVector& hDeltaV,
const PxU32 dofCount, const PxU32 threadIndexInWarp)
{
// translate the incoming velocity change into this link's frame
const Cm::UnAlignedSpatialVector parentDeltaV = Dy::FeatherstoneArticulation::translateSpatialVector(-c2p, hDeltaV);
// per-dof RHS: (Q - s^T Z) - (I s)^T * translated(parentDeltaV)
PxReal jointRhs[3] = { 0.f, 0.f, 0.f };
#pragma unroll(3)
for (PxU32 dof = 0; dof < 3; ++dof)
{
if (dof >= dofCount)
continue;
const Cm::UnAlignedSpatialVector inertiaTimesAxis = loadSpatialVector(dofData[dof].mIsW, threadIndexInWarp);
jointRhs[dof] = dofData[dof].mTmpQstZ[threadIndexInWarp] - inertiaTimesAxis.innerProduct(parentDeltaV);
}
// child deltaV = translated parent deltaV + joint contribution
return parentDeltaV + computeSpatialJointDelta(dofData, jointRhs, NULL, dofCount, threadIndexInWarp);
}
//This function use qstZ as input
// Overload taking the accumulated joint-space impulses as a caller-supplied
// qstZ array (one PxReal per dof) instead of reading the block-format
// buffers. Otherwise identical to the mDeferredQstZ overload above.
static __device__ Cm::UnAlignedSpatialVector propagateAccelerationW(const PxVec3& c2p,
const PxgArticulationBlockDofData* PX_RESTRICT dofData,
const Cm::UnAlignedSpatialVector& hDeltaV,
const PxU32 dofCount, const PxU32 threadIndexInWarp,
const PxReal* qstZ)
{
const Cm::UnAlignedSpatialVector pDeltaV = Dy::FeatherstoneArticulation::translateSpatialVector(-c2p, hDeltaV); //parent velocity change
//[(Q - S^T *Z)] - [(I*S).innerProduct(translated(parentDeltaV))]
PxReal QSTZMinusISDotTransaltedParentDeltaV[3] = { 0.f, 0.f, 0.f };
Cm::UnAlignedSpatialVector IsW;
#pragma unroll(3)
for (PxU32 ind = 0; ind < 3; ++ind)
{
if (ind < dofCount)
{
IsW = loadSpatialVector(dofData[ind].mIsW, threadIndexInWarp);
//stI * pAcceleration
const PxReal temp = IsW.innerProduct(pDeltaV);
QSTZMinusISDotTransaltedParentDeltaV[ind] = (qstZ[ind] - temp);
}
}
const Cm::UnAlignedSpatialVector jointSpatialDeltaV = computeSpatialJointDelta(dofData, QSTZMinusISDotTransaltedParentDeltaV, NULL, dofCount, threadIndexInWarp);
return pDeltaV + jointSpatialDeltaV;
}
// Overload that additionally integrates the joint velocities: reads the
// child-to-parent offset from linkData (mRw_x/y/z), consumes the deferred
// joint-space impulses (mDeferredQstZ), updates
// dofData[ind].mJointVelocities in place (jVel += jDelta), and returns the
// link's spatial velocity change. Unlike the overloads above this one does
// NOT clear mDeferredQstZ — presumably the caller resets it; confirm at the
// call sites.
static __device__ Cm::UnAlignedSpatialVector propagateAccelerationW(
PxgArticulationBlockLinkData& linkData,
PxgArticulationBlockDofData* dofData,
const PxU32 dofCount,
const Cm::UnAlignedSpatialVector& hDeltaV,
const PxU32 threadIndexInWarp)
{
const float c2px = linkData.mRw_x[threadIndexInWarp];
const float c2py = linkData.mRw_y[threadIndexInWarp];
const float c2pz = linkData.mRw_z[threadIndexInWarp];
float3 invStIsT[3];
PxReal tJointDelta[3] = { 0.f, 0.f, 0.f };
Cm::UnAlignedSpatialVector isW[3];
Cm::UnAlignedSpatialVector mMotionMatrix[3];
PxReal jVel[3];
// the split into three separate loops is an optimization that allows dispatching the loads as early as possible.
#pragma unroll 3
for (PxU32 ind = 0; ind < 3; ++ind)
{
if (ind < dofCount)
{
isW[ind] = loadSpatialVector(dofData[ind].mIsW, threadIndexInWarp);
tJointDelta[ind] = (dofData[ind].mDeferredQstZ[threadIndexInWarp]);
jVel[ind] = dofData[ind].mJointVelocities[threadIndexInWarp];
invStIsT[ind] = make_float3(dofData[ind].mInvStIsT_x[threadIndexInWarp], dofData[ind].mInvStIsT_y[threadIndexInWarp], dofData[ind].mInvStIsT_z[threadIndexInWarp]);
mMotionMatrix[ind] = loadSpatialVector(dofData[ind].mWorldMotionMatrix, threadIndexInWarp);
}
}
Cm::UnAlignedSpatialVector pDeltaV = Dy::FeatherstoneArticulation::translateSpatialVector(PxVec3(-c2px, -c2py, -c2pz), hDeltaV); //parent velocity change
// form the joint-space RHS: deferred QstZ minus (I s)^T * translated parent deltaV
#pragma unroll 3
for (PxU32 ind = 0; ind < 3; ++ind)
{
if (ind < dofCount)
{
tJointDelta[ind] -= isW[ind].innerProduct(pDeltaV);
}
}
// solve with (s^T I s)^-1, integrate the joint velocities, and accumulate
// the joint's spatial contribution into the link deltaV
#pragma unroll 3
for (PxU32 ind = 0; ind < 3; ++ind)
{
if (ind < dofCount)
{
const PxReal jDelta = invStIsT[ind].x * tJointDelta[0] + invStIsT[ind].y * tJointDelta[1]
+ invStIsT[ind].z * tJointDelta[2];
dofData[ind].mJointVelocities[threadIndexInWarp] = jVel[ind] + jDelta;
pDeltaV += mMotionMatrix[ind] * jDelta;
}
}
return pDeltaV;
}
// Flushes the deferred root impulse (mRootDeferredZ) and the per-dof deferred terms
// (mDeferredQstZ) into the per-link motion velocities, propagating the resulting delta
// velocities from the root towards the leaves.
// There is another version of this function in forwardDynamic2.cu which additionally writes
// the link velocity to a global buffer.
static void __device__ PxcFsFlushVelocity(PxgArticulationBlockData& articulation,
	PxgArticulationBlockLinkData* PX_RESTRICT artiLinks,
	PxgArticulationBlockDofData* PX_RESTRICT artiDofs,
	PxU32 linkCount, bool fixBase, const PxU32 threadIndexInWarp)
{
	Cm::UnAlignedSpatialVector deltaV = Cm::UnAlignedSpatialVector::Zero();
	// Note the negation: the stored deferred Z is consumed with flipped sign below.
	Cm::UnAlignedSpatialVector deferredZ = -loadSpatialVector(articulation.mRootDeferredZ, threadIndexInWarp);

	if (!fixBase)
	{
		// Free-floating base: push the root deferred impulse through the articulated
		// inverse inertia to obtain the root's delta velocity.
		//ArticulationLink& link = links[0];

		// PT: preload data
		const Cm::UnAlignedSpatialVector motionVelocity0 = loadSpatialVector(artiLinks[0].mMotionVelocity, threadIndexInWarp);
		const Cm::UnAlignedSpatialVector solverSpatialDeltaVel0 = loadSpatialVector(artiLinks[0].mSolverSpatialDeltaVel, threadIndexInWarp);

		Dy::SpatialMatrix invInertia;
		loadSpatialMatrix(articulation.mInvSpatialArticulatedInertia, threadIndexInWarp, invInertia);

		//deltaV = invInertia * (-loadSpatialVector(artiLinks[0].mDeferredZ, threadIndexInWarp));
		deltaV = invInertia * deferredZ;

		//motionVelocities[0] += deltaV[0];
		storeSpatialVector(artiLinks[0].mMotionVelocity, motionVelocity0 + deltaV, threadIndexInWarp);

		// The root deferred impulse has been consumed - reset it.
		//storeSpatialVector(artiLinks[0].mDeferredZ, Cm::UnAlignedSpatialVector::Zero(), threadIndexInWarp);
		storeSpatialVector(articulation.mRootDeferredZ, Cm::UnAlignedSpatialVector::Zero(), threadIndexInWarp);

		storeSpatialVector(artiLinks[0].mSolverSpatialDeltaVel, solverSpatialDeltaVel0 + deltaV, threadIndexInWarp);
	}

	storeSpatialVector(artiLinks[0].mScratchDeltaV, deltaV, threadIndexInWarp);
	// Accumulated even when the base is fixed (deltaV stays zero then, but deferredZ is recorded).
	addSpatialVector(artiLinks[0].mConstraintForces, deferredZ, threadIndexInWarp);
	storeSpatialVector(articulation.mCommonLinkDeltaVelocity, Cm::UnAlignedSpatialVector::Zero(), threadIndexInWarp);

	PxgArticulationBlockDofData* dofs = artiDofs;

	if (linkCount > 1)
	{
		// Software pipelining: preload link 1 so the loads for iteration i+1 overlap
		// the arithmetic of iteration i.
		Cm::UnAlignedSpatialVector nextMotionV = loadSpatialVector(artiLinks[1].mMotionVelocity, threadIndexInWarp);
		PxU32 nextNbDofs = artiLinks[1].mDofs[threadIndexInWarp];
		PxU32 nextParent = artiLinks[1].mParents[threadIndexInWarp];
		//Cm::UnAlignedSpatialVector nextDeferredZ = loadSpatialVector(artiLinks[1].mDeferredZ, threadIndexInWarp);

		for (PxU32 i = 1; i < linkCount; i++)
		{
			PxgArticulationBlockLinkData& tLink = artiLinks[i];
			const PxU32 nbDofs = nextNbDofs;
			const PxU32 parent = nextParent;
			const Cm::UnAlignedSpatialVector preloadedConstraintForces = loadSpatialVector(tLink.mConstraintForces, threadIndexInWarp);
			const Cm::UnAlignedSpatialVector preloadedSolverSpatialDeltaVel = loadSpatialVector(tLink.mSolverSpatialDeltaVel, threadIndexInWarp);
			Cm::UnAlignedSpatialVector motionV = nextMotionV;
			//const Cm::UnAlignedSpatialVector deferredZ = nextDeferredZ;

			//storeSpatialVector(tLink.mDeferredZ, Cm::UnAlignedSpatialVector::Zero(), threadIndexInWarp);

			// Prefetch the next link's data (if any).
			if ((i + 1) < linkCount)
			{
				nextMotionV = loadSpatialVector(artiLinks[i + 1].mMotionVelocity, threadIndexInWarp);
				nextNbDofs = artiLinks[i + 1].mDofs[threadIndexInWarp];
				nextParent = artiLinks[i + 1].mParents[threadIndexInWarp];
				//nextDeferredZ = loadSpatialVector(artiLinks[i + 1].mDeferredZ, threadIndexInWarp);
			}

			// When the parent is not the immediately preceding link, 'deltaV' does not
			// already hold the parent's propagated delta velocity - reload it.
			if (parent != (i - 1))
				deltaV = loadSpatialVector(artiLinks[parent].mScratchDeltaV, threadIndexInWarp);

			deltaV = propagateAccelerationW(tLink, dofs, nbDofs, deltaV, threadIndexInWarp);

			//Accumulate the DeltaVel arising from solver impulses applied to this link.
			storeSpatialVector(tLink.mSolverSpatialDeltaVel, preloadedSolverSpatialDeltaVel + deltaV, threadIndexInWarp);

			//zeroing mDeferredQstZ
			for (PxU32 ind = 0; ind < nbDofs; ++ind)
			{
				dofs[ind].mDeferredQstZ[threadIndexInWarp] = 0.f;
			}

			motionV += deltaV;
			storeSpatialVector(tLink.mScratchDeltaV, deltaV, threadIndexInWarp);

			//const PxTransform& tBody2World = poses[i];
			storeSpatialVector(tLink.mMotionVelocity, motionV, threadIndexInWarp);

			storeSpatialVector(tLink.mConstraintForces, preloadedConstraintForces + deltaV, threadIndexInWarp);

			dofs += nbDofs;
		}
	}
}
#endif

View File

@@ -0,0 +1,387 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __ARTI_IMPULSE_RESPONSE_CUH__
#define __ARTI_IMPULSE_RESPONSE_CUH__
#include "PxgSolverBody.h"
#include "PxSpatialMatrix.h"
#include "PxgArticulation.h"
#include "PxgArticulationLink.h"
#include "DyFeatherstoneArticulationJointData.h"
#include "foundation/PxPreprocessor.h"
#include "solver/PxSolverDefs.h"
#include "MemoryAllocator.cuh"
#include "articulationDynamic.cuh"
#include "DyFeatherstoneArticulation.h"
#include "solverBlock.cuh"
using namespace physx;
// Returns the zero-based index of the lowest set bit in 'val'.
// A zero input also yields 0, which is indistinguishable from "bit 0 set" -
// callers are expected to pass a non-zero bitfield when the result matters.
static __device__ PX_FORCE_INLINE PxU32 articulationLowestSetBit(ArticulationBitField val)
{
	if (val == 0)
		return 0;
	// __ffsll reports the 1-based position of the least significant set bit.
	return PxU32(__ffsll(val)) - 1;
}
// Returns the zero-based index of the highest set bit in 'val'.
// NOTE(review): __clzll(0) == 64, so a zero input underflows to 0xFFFFFFFF; the
// bitfield-traversal callers appear to only invoke this with non-zero words - confirm.
static __device__ PX_FORCE_INLINE PxU32 articulationHighestSetBit(ArticulationBitField val)
{
	// Highest set bit = 63 minus the number of leading zeros of the 64-bit value.
	return 63u - PxU32(__clzll(val));
}
// Propagates an incoming spatial delta velocity (hDeltaV, expressed at the parent link)
// across the joint of 'linkData' and returns the resulting spatial delta velocity of the
// child link. Unlike the joint-velocity-updating variant, this one leaves all articulation
// state untouched (link and dof data are read-only here).
// The unrolled loops assume dofCount <= 3 (guarded by the 'ind < dofCount' checks).
static __device__ Cm::UnAlignedSpatialVector propagateAccelerationWNoJVelUpdate(
	const PxgArticulationBlockLinkData& linkData,
	const PxgArticulationBlockDofData* const dofData,
	const PxU32 dofCount,
	const Cm::UnAlignedSpatialVector& hDeltaV,
	const PxU32 threadIndexInWarp)
{
	// Rw: per-link offset vector (SoA layout, one warp lane per articulation).
	const float c2px = linkData.mRw_x[threadIndexInWarp];
	const float c2py = linkData.mRw_y[threadIndexInWarp];
	const float c2pz = linkData.mRw_z[threadIndexInWarp];

	// Shift the parent's delta velocity to this link's frame.
	Cm::UnAlignedSpatialVector pDeltaV = Dy::FeatherstoneArticulation::translateSpatialVector(PxVec3(-c2px, -c2py, -c2pz), hDeltaV); //parent velocity change

	float3 invStIsT[3];
	PxReal tJointDelta[3] = { 0.f, 0.f, 0.f };
	Cm::UnAlignedSpatialVector isW[3];
	Cm::UnAlignedSpatialVector motionMatrix[3];

	// the split into three separate loops is an optimization that allows dispatching the loads as early as possible.
#pragma unroll 3
	for (PxU32 ind = 0; ind < 3; ++ind)
	{
		if (ind < dofCount)
		{
			isW[ind] = loadSpatialVector(dofData[ind].mIsW, threadIndexInWarp);
			invStIsT[ind] = make_float3(dofData[ind].mInvStIsT_x[threadIndexInWarp], dofData[ind].mInvStIsT_y[threadIndexInWarp], dofData[ind].mInvStIsT_z[threadIndexInWarp]);
			// Deferred (Q - S^T*Z) term accumulated by the solver for this dof.
			tJointDelta[ind] = dofData[ind].mDeferredQstZ[threadIndexInWarp];
			motionMatrix[ind] = loadSpatialVector(dofData[ind].mWorldMotionMatrix, threadIndexInWarp);
		}
	}

	// Subtract the component of the translated parent delta velocity picked up by IsW.
#pragma unroll 3
	for (PxU32 ind = 0; ind < 3; ++ind)
	{
		if (ind < dofCount)
		{
			tJointDelta[ind] -= isW[ind].innerProduct(pDeltaV);
		}
	}

#pragma unroll 3
	for (PxU32 ind = 0; ind < 3; ++ind)
	{
		if (ind < dofCount)
		{
			// jDelta = row 'ind' of invStIsT times tJointDelta: the joint-space
			// velocity change for this dof (not persisted in this variant).
			const PxReal jDelta = invStIsT[ind].x * tJointDelta[0] + invStIsT[ind].y * tJointDelta[1]
				+ invStIsT[ind].z * tJointDelta[2];

			pDeltaV += motionMatrix[ind] * jDelta;
		}
	}

	return pDeltaV;
}
// Gathers the per-slab impulses accumulated for this articulation, scales them (either by
// the caller-provided 'scale' or by 1/numDirtySlabs computed here), merges them into the
// per-link scratch impulses, and propagates them from the deepest dirty link all the way
// up to the root, accumulating the result into articulationBlock.mRootDeferredZ.
// isSlabDirty holds, per (slab, articulation), up to two dirty link indices
// (0xFFFFFFFF meaning "none"); entries are reset to 0xFFFFFFFF once consumed.
// NOTE(review): 'impulses' is declared const but its entries are zeroed below through
// pointers laundered via size_t - the parameter is effectively mutable. Also, 'nbLinks'
// is not referenced in this function.
static __device__ PX_FORCE_INLINE void averageLinkImpulsesAndPropagate(uint2* PX_RESTRICT isSlabDirty, const Cm::UnAlignedSpatialVector* PX_RESTRICT impulses,
	PxgArticulationBlockData& PX_RESTRICT articulationBlock, PxgArticulationBlockLinkData* PX_RESTRICT artiLinks,
	PxgArticulationBlockDofData* const PX_RESTRICT artiDofs,
	const PxU32 articulationId, const PxU32 maxLinks, const PxU32 nbArticulations, const PxU32 nbSlabs,
	const PxU32 nbLinks, const PxU32 threadIndexInWarp, PxReal scale = 0.0f)
{
	const PxU32 slabStepSize = maxLinks * nbArticulations;

	if (scale == 0.0f) // When scale is not input, compute the scale here. This method is not momentum-conserving.
	{
		scale = 1.0f;
		if (nbSlabs > 1)
		{
			// Count the slabs that actually flagged an impulse for this articulation.
			PxReal count = 0.0f;
			for (PxU32 s = 0, slabDirtyIndex = articulationId; s < nbSlabs; ++s, slabDirtyIndex += nbArticulations)
			{
				const uint2 dirty = isSlabDirty[slabDirtyIndex];
				if (dirty.x != 0xFFFFFFFF || dirty.y != 0xFFFFFFFF)
				{
					count += 1.0f;
				}
			}
			scale = count > 1.0f ? 1.0f / count : 1.0f;
		}
	}

	// Highest (deepest) link index that has a pending impulse; seeded with the link
	// cached by the partial propagation pass (mLinkWithDeferredImpulse).
	PxU32 maxIndex = articulationBlock.mLinkWithDeferredImpulse[threadIndexInWarp];

	for (PxU32 s = 0, slabOffset = articulationId, slabDirtyIndex = articulationId; s < nbSlabs; ++s, slabOffset += slabStepSize, slabDirtyIndex += nbArticulations)
	{
		const uint2 dirtyIndex2 = isSlabDirty[slabDirtyIndex];
		// Process both potential dirty link indices of this slab (x, then y).
		for(PxU32 i = 0, dirtyIndex = dirtyIndex2.x; i < 2; ++i, dirtyIndex = dirtyIndex2.y)
		{
			//PxU32 dirtyIndex = i == 0 ? dirtyIndex2.x : dirtyIndex2.y;
			if (dirtyIndex != 0xFFFFFFFF)
			{
				const Cm::UnAlignedSpatialVector preloadedScratchImpulse = loadSpatialVector(artiLinks[dirtyIndex].mScratchImpulse, threadIndexInWarp);

				//Get the index of the dirty slab...
				const PxU32 deltaIdx = slabOffset + dirtyIndex * nbArticulations;

				//Cm::UnAlignedSpatialVector impulse = impulses[deltaIdx] * scale;

				// The spatial vector (6 floats) is not guaranteed 16-byte aligned, so load
				// it as a float2+float4 or float4+float2 pair depending on the address,
				// then zero the storage to mark the impulse as consumed.
				Cm::UnAlignedSpatialVector impulse;
				float4* f4;
				float2* f2;
				size_t ptr = reinterpret_cast<size_t>(&impulses[deltaIdx]);
				if (ptr & 0xf)
				{
					f2 = reinterpret_cast<float2*>(ptr);
					f4 = reinterpret_cast<float4*>(f2 + 1);
					const float2 v0 = *f2;
					const float4 v1 = *f4;
					impulse.top.x = v0.x; impulse.top.y = v0.y; impulse.top.z = v1.x;
					impulse.bottom.x = v1.y; impulse.bottom.y = v1.z; impulse.bottom.z = v1.w;
				}
				else
				{
					f4 = reinterpret_cast<float4*>(ptr);
					f2 = reinterpret_cast<float2*>(f4 + 1);
					const float4 v0 = *f4;
					const float2 v1 = *f2;
					impulse.top.x = v0.x; impulse.top.y = v0.y; impulse.top.z = v0.z;
					impulse.bottom.x = v0.w; impulse.bottom.y = v1.x; impulse.bottom.z = v1.y;
				}
				*f4 = make_float4(0.f);
				*f2 = make_float2(0.f);
				//impulses[deltaIdx] = Cm::UnAlignedSpatialVector::Zero();

				storeSpatialVector(artiLinks[dirtyIndex].mScratchImpulse, preloadedScratchImpulse + impulse*scale, threadIndexInWarp);
				maxIndex = PxMax(dirtyIndex, maxIndex);
			}
		}
		// This slab's dirty entries have been consumed.
		isSlabDirty[slabDirtyIndex] = make_uint2(0xFFFFFFFF, 0xFFFFFFFF);
	}

	{
		const Cm::UnAlignedSpatialVector preloadedRootDeferredZ = loadSpatialVector(articulationBlock.mRootDeferredZ, threadIndexInWarp);

		// Propagate scratch impulses from the deepest dirty link up to (but excluding)
		// the root; link 0 needs no joint propagation.
		if (maxIndex)
		{
			// Walk the dof array backwards alongside the links, starting at maxIndex's joint offset.
			PxgArticulationBlockDofData* PX_RESTRICT dofData = &artiDofs[artiLinks[maxIndex].mJointOffset[threadIndexInWarp]];

			// Software pipelining: preload the current link's topology data.
			PxU32 nextParent = artiLinks[maxIndex].mParents[threadIndexInWarp];
			PxU32 nextDofs = artiLinks[maxIndex].mDofs[threadIndexInWarp];
			float nextChild2Parentx = artiLinks[maxIndex].mRw_x[threadIndexInWarp];
			float nextChild2Parenty = artiLinks[maxIndex].mRw_y[threadIndexInWarp];
			float nextChild2Parentz = artiLinks[maxIndex].mRw_z[threadIndexInWarp];
			//Cm::UnAlignedSpatialVector nextDeferredZ = loadSpatialVector(artiLinks[maxIndex].mDeferredZ, threadIndexInWarp);

			for (PxU32 linkID = maxIndex; linkID > 0; linkID--)
			{
				PxgArticulationBlockLinkData& linkData = artiLinks[linkID];
				//Can't preload because this could have been written to
				const Cm::UnAlignedSpatialVector Z = loadSpatialVector(linkData.mScratchImpulse, threadIndexInWarp);
				const Cm::UnAlignedSpatialVector solverSpatialImpulse = loadSpatialVector(linkData.mSolverSpatialImpulse, threadIndexInWarp);
				const PxU32 parent = nextParent;
				const Cm::UnAlignedSpatialVector parentScratchImpulse = loadSpatialVector(artiLinks[parent].mScratchImpulse, threadIndexInWarp);
				const float child2Parentx = nextChild2Parentx;
				const float child2Parenty = nextChild2Parenty;
				const float child2Parentz = nextChild2Parentz;
				const PxU32 dofCount = nextDofs;
				//Cm::UnAlignedSpatialVector deferredZ = nextDeferredZ;

				// Prefetch the next link's topology data (if any).
				if (linkID > 1)
				{
					PxU32 nextIndex = linkID - 1;
					nextParent = artiLinks[nextIndex].mParents[threadIndexInWarp];
					nextDofs = artiLinks[nextIndex].mDofs[threadIndexInWarp];
					nextChild2Parentx = artiLinks[nextIndex].mRw_x[threadIndexInWarp];
					nextChild2Parenty = artiLinks[nextIndex].mRw_y[threadIndexInWarp];
					nextChild2Parentz = artiLinks[nextIndex].mRw_z[threadIndexInWarp];
					//nextDeferredZ = loadSpatialVector(artiLinks[nextIndex].mDeferredZ, threadIndexInWarp);
				}

				const Cm::UnAlignedSpatialVector propagatedZ = propagateImpulseW_0(PxVec3(child2Parentx, child2Parenty, child2Parentz),
					dofData, Z,
					dofCount, threadIndexInWarp);

				//Accumulate the solver impulses applied to this link.
				storeSpatialVector(linkData.mSolverSpatialImpulse, solverSpatialImpulse + Z, threadIndexInWarp);

				//KS - we should be able to remove mImpulses once we are 100% certain that we will not have any deferredZ residuals
				storeSpatialVector(artiLinks[parent].mScratchImpulse, parentScratchImpulse + propagatedZ, threadIndexInWarp);
				storeSpatialVector(linkData.mScratchImpulse, Cm::UnAlignedSpatialVector::Zero(), threadIndexInWarp);

				// Step the dof pointer back by the next (preceding) link's dof count.
				dofData -= nextDofs;
			}
		}

		// PT: we can't preload artiLinks[0].mScratchImpulse, as it's modified by the above loop
		const Cm::UnAlignedSpatialVector preloadedRootScratchImpulse = loadSpatialVector(artiLinks[0].mScratchImpulse, threadIndexInWarp);
		storeSpatialVector(artiLinks[0].mScratchImpulse, Cm::UnAlignedSpatialVector::Zero(), threadIndexInWarp);
		// Everything propagated to the root becomes part of the root deferred impulse.
		storeSpatialVector(articulationBlock.mRootDeferredZ, preloadedRootScratchImpulse + preloadedRootDeferredZ, threadIndexInWarp);
	}
}
//This method works out the largest index (furthest down the tree) link that we can propagate to such that we can accumulate
//all impulses acting on the articulation into a single impulse value. We mark this link as dirty and terminate.
//The subsequent kernel requesting velocities uses this cached information to compute the velocity for the desired link.
//It is hoped that spatial locality of contacts in many cases will result in this avoiding doing a brute force propagate
//to all links in the articulation
static __device__ void averageLinkImpulsesAndPropagate2(uint2* PX_RESTRICT isSlabDirty, Cm::UnAlignedSpatialVector* PX_RESTRICT impulses,
	PxgArticulationBlockData& PX_RESTRICT articulationBlock, PxgArticulationBlockLinkData* PX_RESTRICT artiLinks,
	PxgArticulationBlockDofData* const PX_RESTRICT artiDofs,
	const PxU32 articulationId, const PxU32 maxLinks, const PxU32 nbArticulations, const PxU32 nbSlabs,
	const PxU32 nbLinks, const PxU32 threadIndexInWarp, const PxReal scale,
	PxgArticulationBitFieldStackData* PX_RESTRICT pathToRootPerPartition, const PxU32 wordSize,
	const PxU32 commonNode, const PxU32 dirtyFlags)
{
	// Step 1 (only if impulses are pending): gather and scale the per-slab impulses
	// into the per-link scratch impulses, consuming and resetting the dirty flags.
	if (dirtyFlags & PxgArtiStateDirtyFlag::eHAS_IMPULSES)
	{
		const PxU32 slabStepSize = maxLinks * nbArticulations;

		for (PxU32 s = 0, slabOffset = articulationId, slabDirtyIndex = articulationId; s < nbSlabs; ++s, slabOffset += slabStepSize, slabDirtyIndex += nbArticulations)
		{
			const uint2 dirtyIndex2 = isSlabDirty[slabDirtyIndex];
			// Process both potential dirty link indices of this slab (x, then y).
			for (PxU32 i = 0, dirtyIndex = dirtyIndex2.x; i < 2; ++i, dirtyIndex = dirtyIndex2.y)
			{
				//PxU32 dirtyIndex = i == 0 ? dirtyIndex2.x : dirtyIndex2.y;
				if (dirtyIndex != 0xFFFFFFFF)
				{
					const Cm::UnAlignedSpatialVector preloadedScratchImpulse = loadSpatialVector(artiLinks[dirtyIndex].mScratchImpulse, threadIndexInWarp);

					//Get the index of the dirty slab...
					const PxU32 deltaIdx = slabOffset + dirtyIndex * nbArticulations;

					// The spatial vector (6 floats) is not guaranteed 16-byte aligned, so load
					// it as a float2+float4 or float4+float2 pair depending on the address,
					// then zero the storage to mark the impulse as consumed.
					Cm::UnAlignedSpatialVector impulse;
					float4* f4;
					float2* f2;
					size_t ptr = reinterpret_cast<size_t>(&impulses[deltaIdx]);
					if (ptr & 0xf)
					{
						f2 = reinterpret_cast<float2*>(ptr);
						f4 = reinterpret_cast<float4*>(f2 + 1);
						const float2 v0 = *f2;
						const float4 v1 = *f4;
						impulse.top.x = v0.x; impulse.top.y = v0.y; impulse.top.z = v1.x;
						impulse.bottom.x = v1.y; impulse.bottom.y = v1.z; impulse.bottom.z = v1.w;
					}
					else
					{
						f4 = reinterpret_cast<float4*>(ptr);
						f2 = reinterpret_cast<float2*>(f4 + 1);
						const float4 v0 = *f4;
						const float2 v1 = *f2;
						impulse.top.x = v0.x; impulse.top.y = v0.y; impulse.top.z = v0.z;
						impulse.bottom.x = v0.w; impulse.bottom.y = v1.x; impulse.bottom.z = v1.y;
					}
					*f4 = make_float4(0.f);
					*f2 = make_float2(0.f);

					impulse = impulse*scale;

					storeSpatialVector(artiLinks[dirtyIndex].mScratchImpulse, preloadedScratchImpulse + impulse, threadIndexInWarp);
				}
			}
			// This slab's dirty entries have been consumed.
			isSlabDirty[slabDirtyIndex] = make_uint2(0xFFFFFFFF, 0xFFFFFFFF);
		}
	}

	// Step 2: propagate scratch impulses along the cached path-to-root bitfield,
	// from the deepest links upwards, stopping at commonNode.
	// Traverse up from last to front...
	for (PxI32 j = wordSize-1, bitOffset = (wordSize-1)*64; j >= 0; j--, bitOffset -= 64)
	{
		PxU64 word = pathToRootPerPartition[j].bitField[threadIndexInWarp];

		if (word != 0)
		{
			while (word)
			{
				// Visit links in decreasing index order within this 64-bit word.
				PxU32 bitIndex = articulationHighestSetBit(word);

				const PxU32 index = bitIndex + bitOffset;

				if (index == commonNode)
					break; //We reached the common node so terminate the traversal

				word &= (~(1ull << bitIndex)); //Clear this bit

				PxgArticulationBlockLinkData& linkData = artiLinks[index];
				PxgArticulationBlockDofData* PX_RESTRICT dofData = &artiDofs[linkData.mJointOffset[threadIndexInWarp]];

				const PxU32 parent = linkData.mParents[threadIndexInWarp];
				const float child2Parentx = linkData.mRw_x[threadIndexInWarp];
				const float child2Parenty = linkData.mRw_y[threadIndexInWarp];
				const float child2Parentz = linkData.mRw_z[threadIndexInWarp];

				const PxU32 dofCount = linkData.mDofs[threadIndexInWarp];

				const Cm::UnAlignedSpatialVector Z = loadSpatialVector(linkData.mScratchImpulse, threadIndexInWarp);
				const Cm::UnAlignedSpatialVector parentScratchImpulse = loadSpatialVector(artiLinks[parent].mScratchImpulse, threadIndexInWarp);
				const Cm::UnAlignedSpatialVector solverSpatialImpulse = loadSpatialVector(linkData.mSolverSpatialImpulse, threadIndexInWarp);

				const Cm::UnAlignedSpatialVector propagatedZ = propagateImpulseW_0(PxVec3(child2Parentx, child2Parenty, child2Parentz),
					dofData, Z,
					dofCount, threadIndexInWarp);

				//Accumulate the solver impulses applied to this link.
				storeSpatialVector(linkData.mSolverSpatialImpulse, solverSpatialImpulse + Z, threadIndexInWarp);

				//KS - we should be able to remove mImpulses once we are 100% certain that we will not have any deferredZ residuals
				storeSpatialVector(artiLinks[parent].mScratchImpulse, parentScratchImpulse + propagatedZ, threadIndexInWarp);
				storeSpatialVector(linkData.mScratchImpulse, Cm::UnAlignedSpatialVector::Zero(), threadIndexInWarp);
			}
		}
	}

	//(1) Compute updated link velocity...
	// Apply the accumulated (negated) impulse at the common node through its spatial
	// response matrix to get the delta velocity at that link.
	PxSpatialMatrix mat;
	loadSpatialMatrix(artiLinks[commonNode].mSpatialResponseMatrix, threadIndexInWarp, mat);

	const Cm::UnAlignedSpatialVector deltaV = mat * (-loadSpatialVector(artiLinks[commonNode].mScratchImpulse, threadIndexInWarp));

	// It is important that we DO NOT reset the artiLinks[commonNode].mScratchImpulse to zero here because it has not been propagated
	// to the root (contrary to the subtree of the commonNode, whose impulses we just propagated).
	// The resetting of the artiLinks[commonNode].mScratchImpulse will be done eventually by the averageLinkImpulsesAndPropagate function
	// that goes all the way to the root.
	storeSpatialVector(articulationBlock.mCommonLinkDeltaVelocity, deltaV, threadIndexInWarp);

	// Record where the unpropagated impulse now lives so the full propagation pass
	// can pick it up later.
	articulationBlock.mLinkWithDeferredImpulse[threadIndexInWarp] = commonNode;
}
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff