feat(physics): wire physx sdk into build

This commit is contained in:
2026-04-15 12:22:15 +08:00
parent 5bf258df6d
commit 31f40e2cbb
2044 changed files with 752623 additions and 1 deletions

View File

@@ -0,0 +1,875 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "vector_types.h"
#include "foundation/PxVec3.h"
#include "cutil_math.h"
#include "PxgParticleSystemCoreKernelIndices.h"
#include "reduction.cuh"
#include "shuffle.cuh"
#include "PxgSolverCoreDesc.h"
#include "PxNodeIndex.h"
#include "assert.h"
#include "PxgSimulationCoreDesc.h"
#include "PxgArticulationCoreDesc.h"
using namespace physx;
extern "C" __host__ void initSimulationControllerKernels2() {}
/*
* This kernel takes a *sorted* list of PxNodeIndex (of size *numContacts),
* and a corresponding deltaV list.
*
* deltaV is updated with the cumulativ delta values, such that for each last entry of a rigid body deltaV is the total sum
* for that rigid body - *however* only for the rigid body entries which are processed within one block.
* blockRigidId, blockDeltaV are updated, such that entry represents the deltaV sum and rigid body ID, of the last
* occuring rigid body in that corresponding block. This can then be used in the subsequent kernel to complete the sum
* for rigid bodies, which entries are overlapping a block.
*/
extern "C" __global__ void accumulateDeltaVRigidFirstLaunch(
const PxU64* sortedRigidIds, //input
const PxU32* numContacts, //input
float4* deltaV, //input/output
float4* blockDeltaV, //output
PxU64* blockRigidId //output
)
{
__shared__ PxU64 sRigidId[PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + 1];
//numWarpsPerBlock can't be larger than 32
const PxU32 numWarpsPerBlock = PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA / WARP_SIZE;
__shared__ float4 sLinWarpAccumulator[WARP_SIZE];
__shared__ float4 sAngWarpAccumulator[WARP_SIZE];
__shared__ PxU64 sWarpRigidId[WARP_SIZE];
__shared__ float4 sLinBlockAccumulator;
__shared__ float4 sAngBlockAccumulator;
__shared__ PxU64 sBlockRigidId;
const PxU32 tNumContacts = *numContacts;
const PxU32 nbBlocksRequired = (tNumContacts + blockDim.x - 1) / blockDim.x;
const PxU32 nbIterationsPerBlock = (nbBlocksRequired + gridDim.x - 1) / gridDim.x;
const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE - 1);
const PxU32 warpIndex = threadIdx.x / WARP_SIZE;
if (threadIdx.x < 4)
{
float* tLinBlockAccumulator = reinterpret_cast<float*>(&sLinBlockAccumulator.x);
tLinBlockAccumulator[threadIdx.x] = 0.f;
float* tAngBlockAccumulator = reinterpret_cast<float*>(&sAngBlockAccumulator.x);
tAngBlockAccumulator[threadIdx.x] = 0.f;
if (threadIdx.x == 0)
sBlockRigidId = 0x8fffffffffffffff;
}
__syncthreads();
for (PxU32 i = 0; i < nbIterationsPerBlock; ++i)
{
const PxU32 workIndex = blockDim.x*(blockIdx.x*nbIterationsPerBlock + i) + threadIdx.x;
PxU64 rigidId = 0x8fffffffffffffff;
float4 linDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
float4 angDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
if (workIndex < tNumContacts)
{
rigidId = sortedRigidIds[workIndex];
sRigidId[threadIdx.x] = rigidId;
linDeltaV = deltaV[workIndex];
angDeltaV = deltaV[workIndex + tNumContacts];
}
__syncthreads();
for (PxU32 reductionRadius = 1; reductionRadius < WARP_SIZE; reductionRadius <<= 1)
{
const PxU32 lane = threadIndexInWarp - reductionRadius;
float4 linVal = shuffle(FULL_MASK, linDeltaV, lane);
float4 angVal = shuffle(FULL_MASK, angDeltaV, lane);
//workIndex < tNumContacts guarantees that sRigidId[WARP_SIZE * warpIndex + lane]
//always points to initialized memory, since lane is always smaller than threadIndexInWarp.
if (threadIndexInWarp >= reductionRadius && workIndex < tNumContacts &&
rigidId == sRigidId[WARP_SIZE * warpIndex + lane])
{
linDeltaV += linVal;
angDeltaV += angVal;
}
}
if (threadIndexInWarp == (WARP_SIZE - 1))
{
sLinWarpAccumulator[warpIndex] = linDeltaV;
sAngWarpAccumulator[warpIndex] = angDeltaV;
sWarpRigidId[warpIndex] = rigidId;
}
const float4 prevLinBlockAccumulator = sLinBlockAccumulator;
const float4 prevAngBlockAccumulator = sAngBlockAccumulator;
const PxU64 prevBlockRigidId = sBlockRigidId;
//Don't allow write until we've finished all reading...
__syncthreads();
if (warpIndex == 0)
{
float4 linDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
float4 angDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
PxU64 warpRigidId = 0x8fffffffffffffff;
if (threadIndexInWarp < numWarpsPerBlock)
{
linDeltaV = sLinWarpAccumulator[threadIndexInWarp];
angDeltaV = sAngWarpAccumulator[threadIndexInWarp];
warpRigidId = sWarpRigidId[threadIndexInWarp];
}
float4 tLinDeltaV = linDeltaV;
float4 tAngDeltaV = angDeltaV;
for (PxU32 reductionRadius = 1; reductionRadius < numWarpsPerBlock; reductionRadius <<= 1)
{
const PxU32 lane = threadIndexInWarp - reductionRadius;
float4 linVal = shuffle(FULL_MASK, tLinDeltaV, lane);
float4 angVal = shuffle(FULL_MASK, tAngDeltaV, lane);
if (threadIndexInWarp >= reductionRadius && warpRigidId == sWarpRigidId[lane])
{
tLinDeltaV += linVal;
tAngDeltaV += angVal;
}
}
if (threadIndexInWarp == (numWarpsPerBlock - 1))
{
if (sBlockRigidId != warpRigidId)
{
//need to clear block accumulators in case previous iteration
//stored other sBlockRigidId
sLinBlockAccumulator = make_float4(0.f, 0.f, 0.f, 0.f);
sAngBlockAccumulator = make_float4(0.f, 0.f, 0.f, 0.f);
}
sLinBlockAccumulator += tLinDeltaV;
sAngBlockAccumulator += tAngDeltaV;
sBlockRigidId = warpRigidId;
}
sLinWarpAccumulator[threadIndexInWarp] = tLinDeltaV;
sAngWarpAccumulator[threadIndexInWarp] = tAngDeltaV;
}
__syncthreads();
if (workIndex < tNumContacts)
{
float4 accumLin = make_float4(0.f, 0.f, 0.f, 0.f);
float4 accumAng = make_float4(0.f, 0.f, 0.f, 0.f);
//if rigidId and the previous element rigid Id is the same, we need to add the previous warp accumulate velocity to
//the current rigid body
if (warpIndex > 0 && rigidId == sWarpRigidId[warpIndex - 1])
{
accumLin = sLinWarpAccumulator[warpIndex - 1];
accumAng = sAngWarpAccumulator[warpIndex - 1];
}
if (i != 0 && rigidId == prevBlockRigidId)
{
accumLin += prevLinBlockAccumulator;
accumAng += prevAngBlockAccumulator;
}
//Now output both offsets...
deltaV[workIndex] = linDeltaV + accumLin;
deltaV[workIndex + tNumContacts] = angDeltaV + accumAng;
}
}
if (threadIdx.x == 0)
{
blockDeltaV[blockIdx.x] = sLinBlockAccumulator;
blockDeltaV[blockIdx.x + PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA] = sAngBlockAccumulator;
blockRigidId[blockIdx.x] = sBlockRigidId;
}
}
//32 blocks. Each block compute the exclusive ransum for the blockOffset
extern "C" __global__ void accumulateDeltaVRigidSecondLaunch(
const PxU64* sortedRigidIds, //input
const PxU32* numContacts, //input
const float4* deltaV, //input
const float4* blockDeltaV, //input
const PxU64* blockRigidId, //input
PxgPrePrepDesc* prePrepDesc,
PxgSolverCoreDesc* solverCoreDesc,
PxgArticulationCoreDesc* artiCoreDesc,
PxgSolverSharedDesc<IterativeSolveData>* sharedDesc,
const bool isTGS
)
{
__shared__ float4 sBlockLinDeltaV[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
__shared__ float4 sBlockAngDeltaV[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
__shared__ PxU64 sBlockRigidId[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
__shared__ PxU64 sRigidId[PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + 1];
const PxU32 tNumContacts = *numContacts;
const PxU32 nbBlocksRequired = (tNumContacts + blockDim.x - 1) / blockDim.x;
const PxU32 nbIterationsPerBlock = (nbBlocksRequired + gridDim.x - 1) / gridDim.x;
const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE - 1);
float4 linBlockDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
float4 angBlockDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
PxU64 tBlockRigidId = 0x8fffffffffffffff;
if (threadIdx.x < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA)
{
linBlockDeltaV = blockDeltaV[threadIdx.x];
angBlockDeltaV = blockDeltaV[threadIdx.x + PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
tBlockRigidId = blockRigidId[threadIdx.x];
sBlockLinDeltaV[threadIdx.x] = linBlockDeltaV;
sBlockAngDeltaV[threadIdx.x] = angBlockDeltaV;
sBlockRigidId[threadIdx.x] = tBlockRigidId;
}
__syncthreads(); //sBlockRigidId is written above and read below
float4 tLinDeltaV = linBlockDeltaV;
float4 tAngDeltaV = angBlockDeltaV;
//add on block deltaV if blockRigid id match
for (PxU32 reductionRadius = 1; reductionRadius < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA; reductionRadius <<= 1)
{
const PxU32 lane = threadIndexInWarp - reductionRadius;
float4 linVal = shuffle(FULL_MASK, tLinDeltaV, lane);
float4 angVal = shuffle(FULL_MASK, tAngDeltaV, lane);
if (threadIndexInWarp >= reductionRadius && tBlockRigidId == sBlockRigidId[lane])
{
tLinDeltaV += linVal;
tAngDeltaV += angVal;
}
}
__syncthreads(); //sBlockRigidId is read above and written below
if (threadIdx.x < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA)
{
sBlockLinDeltaV[threadIdx.x] = tLinDeltaV;
sBlockAngDeltaV[threadIdx.x] = tAngDeltaV;
sBlockRigidId[threadIdx.x] = blockRigidId[threadIdx.x];
}
__syncthreads();
float4* solverBodyDeltaVel = sharedDesc->iterativeData.solverBodyVelPool + solverCoreDesc->accumulatedBodyDeltaVOffset;
//float4* initialVel = solverCoreDesc->outSolverVelocity;
const PxU32 numSolverBodies = solverCoreDesc->numSolverBodies;
PxgArticulationBlockData* artiData = artiCoreDesc->mArticulationBlocks;
PxgArticulationBlockLinkData* artiLinkData = artiCoreDesc->mArticulationLinkBlocks;
const PxU32 maxLinks = artiCoreDesc->mMaxLinksPerArticulation;
for (PxU32 i = 0; i < nbIterationsPerBlock; ++i)
{
__syncthreads(); //sRigidId is read and written in the same loop - read and write must be separated by syncs
const PxU32 workIndex = blockDim.x * (blockIdx.x * nbIterationsPerBlock + i) + threadIdx.x;
PxU64 rigidId = 0x8fffffffffffffff;
if (workIndex < tNumContacts)
{
rigidId = sortedRigidIds[workIndex];
if (threadIdx.x > 0)
sRigidId[threadIdx.x - 1] = rigidId;
if (workIndex == tNumContacts - 1)
{
sRigidId[threadIdx.x] = 0x8fffffffffffffff;
}
else if (threadIdx.x == PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA - 1)
{
// first thread in block must load neighbor particle
sRigidId[threadIdx.x] = sortedRigidIds[workIndex + 1];
}
}
__syncthreads();
if (workIndex < tNumContacts)
{
float4 accumLin = make_float4(0.f, 0.f, 0.f, 0.f);
float4 accumAng = make_float4(0.f, 0.f, 0.f, 0.f);
if (rigidId != sRigidId[threadIdx.x])
{
float4 linVel = deltaV[workIndex];
float4 angVel = deltaV[workIndex + tNumContacts];
PxU64 preBlockRigidId = blockIdx.x > 0 ? sBlockRigidId[blockIdx.x - 1] : 0x8fffffffffffffff;
if (rigidId == preBlockRigidId)
{
linVel += sBlockLinDeltaV[blockIdx.x - 1];
angVel += sBlockAngDeltaV[blockIdx.x - 1];
}
//nodeIndex
const PxNodeIndex nodeId = reinterpret_cast<PxNodeIndex&>(rigidId);
PxU32 solverBodyIndex = 0;
if (!nodeId.isStaticBody())
{
PxU32 nodeIndex = nodeId.index();
solverBodyIndex = prePrepDesc->solverBodyIndices[nodeIndex];
if (nodeId.isArticulation())
{
//solverBodyIndex is the globalThreadIndex for the active articulation in the block format
const PxU32 blockIndex = solverBodyIndex / WARP_SIZE;
PxgArticulationBlockData& articulation = artiData[blockIndex];
PxgArticulationBlockLinkData* artiLinks = &artiLinkData[blockIndex * maxLinks];
const PxU32 artiIndexInBlock = solverBodyIndex % WARP_SIZE;
articulation.mStateDirty[artiIndexInBlock] = PxgArtiStateDirtyFlag::eHAS_IMPULSES;
const PxU32 linkID = nodeId.articulationLinkId();
const PxReal denom = PxMax(1.0f, linVel.w);
PxReal ratio = 1.f / denom;
linVel.w = 0.f;
//for articulation, linVel and angVel accumulate impulse
Cm::UnAlignedSpatialVector impulse;
impulse.top = PxVec3(linVel.x, linVel.y, linVel.z );
impulse.bottom = PxVec3(angVel.x, angVel.y, angVel.z);
impulse.top *= ratio;
impulse.bottom *= ratio;
/*printf("blockIndex %i artiIndexInBlock %i linkID %i ratio %f impulse linear(%f, %f, %f) angular(%f, %f, %f)\n", blockIndex, artiIndexInBlock, linkID, ratio,
impulse.top.x, impulse.top.y, impulse.top.z, impulse.bottom.x, impulse.bottom.y, impulse.bottom.z);*/
storeSpatialVector(artiLinks[linkID].mScratchImpulse, -impulse, artiIndexInBlock);
}
else
{
float4 linearVelocity = solverBodyDeltaVel[solverBodyIndex];
float4 angularVelocity = solverBodyDeltaVel[solverBodyIndex + numSolverBodies];
const PxReal denom = PxMax(1.0f, linVel.w);
PxReal ratio = 1.f / denom;
linVel.w = 0.f;
if (isTGS)
{
linearVelocity.x += linVel.x * ratio;
linearVelocity.y += linVel.y * ratio;
linearVelocity.z += linVel.z * ratio;
linearVelocity.w += angVel.x * ratio;
angularVelocity.x += angVel.y * ratio;
angularVelocity.y += angVel.z * ratio;
//The rest is the delta position buffer
}
else
{
/*assert(PxIsFinite(linVel.x)); assert(PxIsFinite(linVel.y)); assert(PxIsFinite(linVel.z));
assert(PxIsFinite(angVel.x)); assert(PxIsFinite(angVel.y)); assert(PxIsFinite(angVel.z));
assert(PxIsFinite(ratio));*/
/*printf("Accum linVelDelta = (%f, %f, %f, %f), angVelDelta = (%f, %f, %f, %f), ratio = %f, denom = %f, globalRelax = %f\n",
linVel.x, linVel.y, linVel.z, linVel.w, angVel.x, angVel.y, angVel.z, angVel.w, ratio, denom, globalRelaxationCoefficient);*/
linearVelocity += linVel * ratio;
angularVelocity += angVel * ratio;
}
solverBodyDeltaVel[solverBodyIndex] = linearVelocity;
solverBodyDeltaVel[solverBodyIndex + numSolverBodies] = angularVelocity;
}
//printf("solverBodyIndex %i\n", solverBodyIndex);
//printf("linearVelocity(%f, %f, %f, %f)\n", linearVelocity.x, linearVelocity.y, linearVelocity.z, linearVelocity.w);
//printf("angularVelocity(%f, %f, %f, %f)\n", angularVelocity.x, angularVelocity.y, angularVelocity.z, angularVelocity.w);
}
}
}
}
}
//32 blocks. Each block compute the exclusive ransum for the blockOffset
extern "C" __global__ void clearDeltaVRigidSecondLaunchMulti(
PxU64* sortedRigidIds, //input
PxU32* numContacts, //input
PxgPrePrepDesc* prePrepDesc,
PxgSolverCoreDesc* solverCoreDesc,
PxgArticulationCoreDesc* artiCoreDesc,
PxReal* tempDenom
)
{
__shared__ PxU64 sRigidId[PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + 1];
const PxU32 tNumContacts = *numContacts;
const PxU32 idx = threadIdx.x;
const PxU32 totalBlockRequired = (tNumContacts + (PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA - 1)) / PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA;
const PxU32 numIterationPerBlock = (totalBlockRequired + (PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA - 1)) / PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA;
PxgArticulationBlockLinkData* artiLinkData = artiCoreDesc->mArticulationLinkBlocks;
const PxU32 maxLinks = artiCoreDesc->mMaxLinksPerArticulation;
for (PxU32 i = 0; i < numIterationPerBlock; ++i)
{
const PxU32 workIndex = i * PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + idx + numIterationPerBlock * blockIdx.x * blockDim.x;
PxU64 rigidId = 0x8fffffffffffffff;
if (workIndex < tNumContacts)
{
rigidId = sortedRigidIds[workIndex];
if (idx > 0)
sRigidId[idx - 1] = rigidId;
if (workIndex == tNumContacts - 1)
{
sRigidId[idx] = 0x8fffffffffffffff;
}
else if (threadIdx.x == PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA - 1)
{
// first thread in block must load neighbor particle
sRigidId[idx] = sortedRigidIds[workIndex + 1];
}
}
__syncthreads();
if (workIndex < tNumContacts)
{
if (rigidId != sRigidId[idx])
{
//nodeIndex
const PxNodeIndex nodeId = reinterpret_cast<PxNodeIndex&>(rigidId);
PxU32 solverBodyIndex = 0;
if (!nodeId.isStaticBody())
{
PxU32 nodeIndex = nodeId.index();
solverBodyIndex = prePrepDesc->solverBodyIndices[nodeIndex];
if (nodeId.isArticulation())
{
//solverBodyIndex is the globalThreadIndex for the active articulation in the block format
const PxU32 blockIndex = solverBodyIndex / WARP_SIZE;
PxgArticulationBlockLinkData* artiLinks = &artiLinkData[blockIndex * maxLinks];
const PxU32 artiIndexInBlock = solverBodyIndex % WARP_SIZE;
const PxU32 linkID = nodeId.articulationLinkId();
artiLinks[linkID].mDeltaScale[artiIndexInBlock] = 0.f;
}
else
{
tempDenom[solverBodyIndex] = 0.f;
}
}
}
}
}
}
//32 blocks. Each block compute the exclusive ransum for the blockOffset
extern "C" __global__ void accumulateDeltaVRigidSecondLaunchMultiStage1(
PxU64* sortedRigidIds, //input
PxU32* numContacts, //input
float4* deltaV, //input
float4* blockDeltaV, //input
PxU64* blockRigidId, //input
PxgPrePrepDesc* prePrepDesc,
PxgSolverCoreDesc* solverCoreDesc,
PxgArticulationCoreDesc* artiCoreDesc,
PxgSolverSharedDesc<IterativeSolveData>* sharedDesc,
PxReal* tempDenom,
const bool useLocalRelax,
const float globalRelaxationCoefficient,
bool isTGS
)
{
__shared__ PxReal sBlockLinDeltaVW[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
__shared__ PxU64 sBlockRigidId[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
__shared__ PxU64 sRigidId[PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + 1];
const PxU32 tNumContacts = *numContacts;
const PxU32 idx = threadIdx.x;
const PxU32 threadIndexInWarp = idx & (WARP_SIZE - 1);
const PxU32 totalBlockRequired = (tNumContacts + (PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA - 1)) / PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA;
float4 linBlockDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
float4 angBlockDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
PxU64 tBlockRigidId = 0x8fffffffffffffff;
if (idx < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA)
{
linBlockDeltaV = blockDeltaV[idx];
tBlockRigidId = blockRigidId[idx];
sBlockLinDeltaVW[idx] = linBlockDeltaV.w;
sBlockRigidId[idx] = tBlockRigidId;
}
__syncthreads(); //sBlockRigidId is written above and read below
PxReal tLinDeltaVW = linBlockDeltaV.w;
//add on block deltaV if blockRigid id match
for (PxU32 reductionRadius = 1; reductionRadius < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA; reductionRadius <<= 1)
{
const PxU32 lane = threadIndexInWarp - reductionRadius;
PxReal w = __shfl_sync(FULL_MASK, tLinDeltaVW, lane);
if (threadIndexInWarp >= reductionRadius && tBlockRigidId == sBlockRigidId[lane])
{
tLinDeltaVW += w;
}
}
__syncthreads(); //sBlockRigidId is read above and written below
if (idx < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA)
{
sBlockLinDeltaVW[idx] = tLinDeltaVW;
sBlockRigidId[idx] = blockRigidId[idx];
}
const PxU32 numIterationPerBlock = (totalBlockRequired + (PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA - 1)) / PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA;
__syncthreads();
PxgArticulationBlockLinkData* artiLinkData = artiCoreDesc->mArticulationLinkBlocks;
const PxU32 maxLinks = artiCoreDesc->mMaxLinksPerArticulation;
for (PxU32 i = 0; i < numIterationPerBlock; ++i)
{
const PxU32 workIndex = i * PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + idx + numIterationPerBlock * blockIdx.x * blockDim.x;
PxU64 rigidId = 0x8fffffffffffffff;
if (workIndex < tNumContacts)
{
rigidId = sortedRigidIds[workIndex];
if (idx > 0)
sRigidId[idx - 1] = rigidId;
if (workIndex == tNumContacts - 1)
{
sRigidId[idx] = 0x8fffffffffffffff;
}
else if (threadIdx.x == PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA - 1)
{
// first thread in block must load neighbor particle
sRigidId[idx] = sortedRigidIds[workIndex + 1];
}
}
__syncthreads();
if (workIndex < tNumContacts)
{
if (rigidId != sRigidId[idx])
{
PxReal linVelW = deltaV[workIndex].w;
PxU64 preBlockRigidId = blockIdx.x > 0 ? sBlockRigidId[blockIdx.x - 1] : 0x8fffffffffffffff;
if (rigidId == preBlockRigidId)
{
linVelW += sBlockLinDeltaVW[blockIdx.x - 1];
}
//nodeIndex
const PxNodeIndex nodeId = reinterpret_cast<PxNodeIndex&>(rigidId);
PxU32 solverBodyIndex = 0;
if (!nodeId.isStaticBody())
{
PxU32 nodeIndex = nodeId.index();
solverBodyIndex = prePrepDesc->solverBodyIndices[nodeIndex];
PxReal denom = globalRelaxationCoefficient;
if (useLocalRelax)
denom = PxMax(denom, linVelW);
if (nodeId.isArticulation())
{
//solverBodyIndex is the globalThreadIndex for the active articulation in the block format
const PxU32 blockIndex = solverBodyIndex / WARP_SIZE;
PxgArticulationBlockLinkData* artiLinks = &artiLinkData[blockIndex * maxLinks];
const PxU32 artiIndexInBlock = solverBodyIndex % WARP_SIZE;
const PxU32 linkID = nodeId.articulationLinkId();
atomicAdd(&artiLinks[linkID].mDeltaScale[artiIndexInBlock], denom);
}
else
{
atomicAdd(&tempDenom[solverBodyIndex], denom);
}
}
}
}
}
}
//32 blocks. Each block compute the exclusive ransum for the blockOffset
extern "C" __global__ void accumulateDeltaVRigidSecondLaunchMultiStage2(
PxU64* sortedRigidIds, //input
PxU32* numContacts, //input
float4* deltaV, //input
float4* blockDeltaV, //input
PxU64* blockRigidId, //input
PxgPrePrepDesc* prePrepDesc,
PxgSolverCoreDesc* solverCoreDesc,
PxgArticulationCoreDesc* artiCoreDesc,
PxgSolverSharedDesc<IterativeSolveData>* sharedDesc,
PxReal* tempDenom,
const bool useLocalRelax,
const float globalRelaxationCoefficient,
bool isTGS
)
{
__shared__ float4 sBlockLinDeltaV[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
__shared__ float4 sBlockAngDeltaV[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
__shared__ PxU64 sBlockRigidId[PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
__shared__ PxU64 sRigidId[PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + 1];
const PxU32 tNumContacts = *numContacts;
const PxU32 idx = threadIdx.x;
const PxU32 threadIndexInWarp = idx & (WARP_SIZE - 1);
const PxU32 totalBlockRequired = (tNumContacts + (PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA - 1)) / PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA;
float4 linBlockDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
float4 angBlockDeltaV = make_float4(0.f, 0.f, 0.f, 0.f);
PxU64 tBlockRigidId = 0x8fffffffffffffff;
if (idx < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA)
{
linBlockDeltaV = blockDeltaV[idx];
angBlockDeltaV = blockDeltaV[idx + PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA];
tBlockRigidId = blockRigidId[idx];
sBlockLinDeltaV[idx] = linBlockDeltaV;
sBlockAngDeltaV[idx] = angBlockDeltaV;
sBlockRigidId[idx] = tBlockRigidId;
}
__syncthreads(); //sBlockRigidId is written above and read below
float4 tLinDeltaV = linBlockDeltaV;
float4 tAngDeltaV = angBlockDeltaV;
//add on block deltaV if blockRigid id match
for (PxU32 reductionRadius = 1; reductionRadius < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA; reductionRadius <<= 1)
{
const PxU32 lane = threadIndexInWarp - reductionRadius;
float4 linVal = shuffle(FULL_MASK, tLinDeltaV, lane);
float4 angVal = shuffle(FULL_MASK, tAngDeltaV, lane);
if (threadIndexInWarp >= reductionRadius && tBlockRigidId == sBlockRigidId[lane])
{
tLinDeltaV += linVal;
tAngDeltaV += angVal;
}
}
__syncthreads(); //sBlockRigidId is read above and written below
if (idx < PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA)
{
sBlockLinDeltaV[idx] = tLinDeltaV;
sBlockAngDeltaV[idx] = tAngDeltaV;
sBlockRigidId[idx] = blockRigidId[idx];
}
const PxU32 numIterationPerBlock = (totalBlockRequired + (PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA - 1)) / PxgParticleSystemKernelGridDim::ACCUMULATE_DELTA;
__syncthreads();
float4* solverBodyDeltaVel = sharedDesc->iterativeData.solverBodyVelPool + solverCoreDesc->accumulatedBodyDeltaVOffset;
//float4* initialVel = solverCoreDesc->outSolverVelocity;
const PxU32 numSolverBodies = solverCoreDesc->numSolverBodies;
PxgArticulationBlockData* artiData = artiCoreDesc->mArticulationBlocks;
PxgArticulationBlockLinkData* artiLinkData = artiCoreDesc->mArticulationLinkBlocks;
const PxU32 maxLinks = artiCoreDesc->mMaxLinksPerArticulation;
for (PxU32 i = 0; i < numIterationPerBlock; ++i)
{
const PxU32 workIndex = i * PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA + idx + numIterationPerBlock * blockIdx.x * blockDim.x;
PxU64 rigidId = 0x8fffffffffffffff;
if (workIndex < tNumContacts)
{
rigidId = sortedRigidIds[workIndex];
if (idx > 0)
sRigidId[idx - 1] = rigidId;
if (workIndex == tNumContacts - 1)
{
sRigidId[idx] = 0x8fffffffffffffff;
}
else if (threadIdx.x == PxgParticleSystemKernelBlockDim::ACCUMULATE_DELTA - 1)
{
// first thread in block must load neighbor particle
sRigidId[idx] = sortedRigidIds[workIndex + 1];
}
}
__syncthreads();
if (workIndex < tNumContacts)
{
float4 accumLin = make_float4(0.f, 0.f, 0.f, 0.f);
float4 accumAng = make_float4(0.f, 0.f, 0.f, 0.f);
if (rigidId != sRigidId[idx])
{
float4 linVel = deltaV[workIndex];
float4 angVel = deltaV[workIndex + tNumContacts];
PxU64 preBlockRigidId = blockIdx.x > 0 ? sBlockRigidId[blockIdx.x - 1] : 0x8fffffffffffffff;
if (rigidId == preBlockRigidId)
{
linVel += sBlockLinDeltaV[blockIdx.x - 1];
angVel += sBlockAngDeltaV[blockIdx.x - 1];
}
//nodeIndex
const PxNodeIndex nodeId = reinterpret_cast<PxNodeIndex&>(rigidId);
PxU32 solverBodyIndex = 0;
if (!nodeId.isStaticBody())
{
PxU32 nodeIndex = nodeId.index();
solverBodyIndex = prePrepDesc->solverBodyIndices[nodeIndex];
if (nodeId.isArticulation())
{
//solverBodyIndex is the globalThreadIndex for the active articulation in the block format
const PxU32 blockIndex = solverBodyIndex / WARP_SIZE;
PxgArticulationBlockData& articulation = artiData[blockIndex];
PxgArticulationBlockLinkData* artiLinks = &artiLinkData[blockIndex * maxLinks];
const PxU32 artiIndexInBlock = solverBodyIndex % WARP_SIZE;
articulation.mStateDirty[artiIndexInBlock] = PxgArtiStateDirtyFlag::eHAS_IMPULSES;
const PxU32 linkID = nodeId.articulationLinkId();
PxReal denom = artiLinks[linkID].mDeltaScale[artiIndexInBlock];
PxReal ratio = 1.f / denom;
//for articulation, linVel and angVel accumulate impulse
Cm::UnAlignedSpatialVector impulse;
impulse.top = PxVec3(linVel.x, linVel.y, linVel.z);
impulse.bottom = PxVec3(angVel.x, angVel.y, angVel.z);
impulse.top *= ratio;
impulse.bottom *= ratio;
/*printf("blockIndex %i artiIndexInBlock %i linkID %i ratio %f impulse linear(%f, %f, %f) angular(%f, %f, %f)\n", blockIndex, artiIndexInBlock, linkID, ratio,
impulse.top.x, impulse.top.y, impulse.top.z, impulse.bottom.x, impulse.bottom.y, impulse.bottom.z);*/
atomicAddSpatialVector(artiLinks[linkID].mScratchImpulse, -impulse, artiIndexInBlock);
}
else
{
PxReal denom = tempDenom[solverBodyIndex];// globalRelaxationCoefficient;
PxReal ratio = 1.f / denom;
if (isTGS)
{
atomicAdd(&solverBodyDeltaVel[solverBodyIndex].x, linVel.x*ratio);
atomicAdd(&solverBodyDeltaVel[solverBodyIndex].y, linVel.y*ratio);
atomicAdd(&solverBodyDeltaVel[solverBodyIndex].z, linVel.z*ratio);
atomicAdd(&solverBodyDeltaVel[solverBodyIndex].w, angVel.x*ratio);
atomicAdd(&solverBodyDeltaVel[solverBodyIndex + numSolverBodies].x, angVel.y*ratio);
atomicAdd(&solverBodyDeltaVel[solverBodyIndex + numSolverBodies].y, angVel.z*ratio);
//The rest is the delta position buffer
}
else
{
atomicAdd(&solverBodyDeltaVel[solverBodyIndex].x, linVel.x*ratio);
atomicAdd(&solverBodyDeltaVel[solverBodyIndex].y, linVel.y*ratio);
atomicAdd(&solverBodyDeltaVel[solverBodyIndex].z, linVel.z*ratio);
atomicAdd(&solverBodyDeltaVel[solverBodyIndex + numSolverBodies].x, angVel.x*ratio);
atomicAdd(&solverBodyDeltaVel[solverBodyIndex + numSolverBodies].y, angVel.y*ratio);
atomicAdd(&solverBodyDeltaVel[solverBodyIndex + numSolverBodies].z, angVel.z*ratio);
}
}
//printf("solverBodyIndex %i\n", solverBodyIndex);
//printf("linearVelocity(%f, %f, %f, %f)\n", linearVelocity.x, linearVelocity.y, linearVelocity.z, linearVelocity.w);
//printf("angularVelocity(%f, %f, %f, %f)\n", angularVelocity.x, angularVelocity.y, angularVelocity.z, angularVelocity.w);
}
}
}
}
}