// File: XCEngine/engine/third_party/physx/source/gpusolver/src/CUDA/solverMultiBlock.cu
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "common/PxPhysXCommonConfig.h"
#include <cuda.h>
#include <sm_35_intrinsics.h>
#include "PxgSolverBody.h"
#include "PxgSolverConstraintBlock1D.h"
#include "PxgSolverConstraintDesc.h"
#include "PxgConstraint.h"
#include "PxgConstraintBlock.h"
#include "PxgIslandContext.h"
#include "PxgSolverContext.h"
#include "cutil_math.h"
#include "PxgSolverCoreDesc.h"
#include "solverBlock.cuh"
#include "PxgSolverKernelIndices.h"
#include "PxgDynamicsConfiguration.h"
#include "PxgIntrinsics.h"
#include "stdio.h"
#include "assert.h"
#include "reduction.cuh"
#include "solver.cuh"
#include "PxgArticulationCoreDesc.h"
using namespace physx;
// Empty host-side entry point. Presumably referenced from host code to force
// this translation unit (and the kernels it contains) to be linked in —
// TODO(review): confirm against the host-side kernel registration code.
extern "C" __host__ void initSolverKernels6() {}
// Maps a remapped body index to its start slot in the per-batch body-velocity
// layout. Currently an identity mapping; the commented-out expression shows a
// previous interleaved layout (2*(index & ~31) + (index & 31)) kept for reference.
PX_CUDA_CALLABLE PX_FORCE_INLINE PxU32 ComputeBodyBatchStartIndex(const PxU32 index)
{
	//return 2*(index & (~31)) + (index&31);
	return index;
}
// Clears per-body motion velocities and the solver velocity pool ahead of a
// solver pass. Launched as a 1D grid; each thread strides over the arrays so
// any launch configuration covers all elements.
extern "C" __global__ void ZeroBodies(const PxgSolverCoreDesc* constraintPrepDesc, const PxgSolverSharedDesc<IterativeSolveData>* sharedDesc)
{
	// Descriptor-derived pointers/counts are read once by thread 0 and
	// broadcast to the block through shared memory.
	__shared__ float4* velocityPool;
	__shared__ float4* motionVels;
	__shared__ uint numBodyEntries;
	__shared__ uint numBatchVelEntries;
	__shared__ uint deltaVOffset;

	if(threadIdx.x == 0)
	{
		velocityPool = sharedDesc->iterativeData.solverBodyVelPool;
		motionVels = constraintPrepDesc->motionVelocityArray;
		// Two float4 entries (linear + angular) per solver body.
		numBodyEntries = constraintPrepDesc->numSolverBodies*2;
		// Per-batch velocity slots: (batches) * 32 lanes * 2 bodies * 2 (lin/ang).
		numBatchVelEntries = (constraintPrepDesc->numBatches + constraintPrepDesc->numArticBatches) * 32 * 2 * 2;
		deltaVOffset = constraintPrepDesc->accumulatedBodyDeltaVOffset;
	}

	const uint stride = blockDim.x * gridDim.x;
	const uint globalThreadIdx = threadIdx.x + blockIdx.x * blockDim.x;

	__syncthreads();

	const float4 zero4 = make_float4(0.f);

	// (1) Zero all motion velocities.
	// (2) Zero the delta velocities in the accumulation region.
	for(uint i = globalThreadIdx; i < numBodyEntries; i += stride)
	{
		motionVels[i] = zero4;
		velocityPool[deltaVOffset + i] = zero4;
	}

	// (3) Zero every per-batch velocity slot. Strictly only the first instance
	// of each body in the solver needs clearing, but zeroing everything keeps
	// this simple.
	for(uint i = globalThreadIdx; i < numBatchVelEntries; i += stride)
	{
		velocityPool[i] = zero4;
	}
}
// Plain global-memory load of a float4. An earlier inline-PTX variant using a
// ld.global.cg cache hint was disabled; the compiler-generated load is used.
__device__ __inline__ float4 loadFloat4(const float4* PX_RESTRICT address)
{
	const float4 value = *address;
	return value;
}
// Broadcasts two float4 registers from lane `shuffleMask` across the warp via
// __shfl_sync and returns one of them per lane depending on `condition`.
// syncMask: the participating-lane mask passed through to __shfl_sync.
// Fix: the original left ret0.w / ret1.w uninitialized, so callers that stored
// the full float4 would write garbage in the .w component; shuffle .w as well
// so the result is fully defined.
PX_FORCE_INLINE __device__ float4 cudeShuffle3(const PxU32 syncMask, bool condition, float4 reg0, float4 reg1, PxU32 shuffleMask)
{
	float4 ret0, ret1;
	ret0.x = __shfl_sync(syncMask, reg0.x, shuffleMask);
	ret0.y = __shfl_sync(syncMask, reg0.y, shuffleMask);
	ret0.z = __shfl_sync(syncMask, reg0.z, shuffleMask);
	ret0.w = __shfl_sync(syncMask, reg0.w, shuffleMask);
	ret1.x = __shfl_sync(syncMask, reg1.x, shuffleMask);
	ret1.y = __shfl_sync(syncMask, reg1.y, shuffleMask);
	ret1.z = __shfl_sync(syncMask, reg1.z, shuffleMask);
	ret1.w = __shfl_sync(syncMask, reg1.w, shuffleMask);
	return condition ? ret0 : ret1;
}
// Marking active slabs loosely following "solveBlockPartition"
// Marking active slabs loosely following "solveBlockPartition".
// One warp per constraint batch (blockDim.y warps per block, 32 lanes each);
// each lane handles one constraint of its batch. For every constraint judged
// active, the slab it belongs to is recorded as a bit in
// solverEncodedReferenceCount so reference counts can later be derived by
// counting set bits.
static __device__ void markActiveSlab_rigidBodyPGS(
	const PxgSolverCoreDesc* PX_RESTRICT solverDesc,
	const PxgSolverSharedDesc<IterativeSolveData>* PX_RESTRICT sharedDesc,
	const PxU32 islandIndex, const PxU32 lastPartition)
{
	const PxgIslandContext& island = solverDesc->islandContextPool[islandIndex];
	const PxU32 startPartitionIndex = island.mStartPartitionIndex;
	const PxU32 startIndex = island.mBatchStartIndex;
	// End of the final partition's batch range for this island.
	const PxU32 endIndex = solverDesc->constraintsPerPartition[lastPartition + startPartitionIndex];
	const IterativeSolveData& iterativeData = sharedDesc->iterativeData;
	const PxU32 globalWarpIndex = blockIdx.x * blockDim.y + threadIdx.y;
	const PxU32 threadIndexInWarp = threadIdx.x;
	const PxU32 batchIndex = startIndex + globalWarpIndex;
	if (batchIndex < endIndex)
	{
		const PxgBlockConstraintBatch& batch = iterativeData.blockConstraintBatch[batchIndex];
		// Only lanes below mDescStride map to a real constraint in this batch.
		if (threadIndexInWarp < batch.mDescStride)
		{
			const PxU32 bodyOffset = island.mBodyStartIndex;
			const PxU32 slabId = batch.slabId[threadIndexInWarp];
			const PxU32 outputOffset = solverDesc->accumulatedBodyDeltaVOffset; //deltaVOffset
			// NOTE(review): the counts below read islandContextPool[0] rather than
			// islandContextPool[islandIndex] — presumably single-island usage; confirm.
			const PxU32 numDynamicBodies = solverDesc->islandContextPool->mBodyCount; //nbBodies minus offset!
			const PxU32 totalBodiesIncKinematics = numDynamicBodies + bodyOffset;
			const PxU32 bodyIdA = batch.bodyAIndex[threadIndexInWarp];
			const PxU32 bodyIdB = batch.bodyBIndex[threadIndexInWarp];
			const PxU32 numArticulations = solverDesc->islandContextPool->mArticulationCount;
			const PxU32 numTotalBodies = bodyOffset + numDynamicBodies + numArticulations;
			// 32 slabs are packed per PxU32: pick the word group for this slab
			// and the bit within the word.
			const PxU32 slabIndexOffset = (slabId / 32) * numTotalBodies;
			const PxU32 encodedSlabIndex = (1u << (slabId % 32));
			bool isActiveSlab = false;
			if (batch.constraintType == PxgSolverConstraintDesc::eCONSTRAINT_1D)
			{
				// For joint constraints, simply mark slabs as active for efficiency.
				// This does not significantly affect or increase the reference count in practice.
				isActiveSlab = true;
			}
			else
			{
				// Load the current solver velocities of both bodies: linear at the
				// body slot, angular offset by the total body count.
				const PxU32 finalIdA = outputOffset + bodyIdA;
				const PxU32 finalIdB = outputOffset + bodyIdB;
				const float4 linVel0 = Pxldcg(iterativeData.solverBodyVelPool[finalIdA]);
				const float4 angVel0 = Pxldcg(iterativeData.solverBodyVelPool[finalIdA + totalBodiesIncKinematics]);
				const float4 linVel1 = Pxldcg(iterativeData.solverBodyVelPool[finalIdB]);
				const float4 angVel1 = Pxldcg(iterativeData.solverBodyVelPool[finalIdB + totalBodiesIncKinematics]);
				PxVec3 lv0(linVel0.x, linVel0.y, linVel0.z);
				PxVec3 lv1(linVel1.x, linVel1.y, linVel1.z);
				PxVec3 av0(angVel0.x, angVel0.y, angVel0.z);
				PxVec3 av1(angVel1.x, angVel1.y, angVel1.z);
				// Check if the contact/normal constraint is active.
				isActiveSlab = checkActiveContactBlock(batch, lv0, av0, lv1, av1, threadIndexInWarp, iterativeData.blockContactHeaders,
					iterativeData.blockContactPoints);
			}
			// Encode which slab is active in a 32-bit index. When querying reference counts, count the number of active
			// slabs encoded in solverEncodedReferenceCount. solverEncodedReferenceCount contains bitwise/slab-wise
			// activation information.
			// Bodies below bodyOffset are kinematic/static and are not reference-counted.
			// (`&` on these two bools behaves the same as `&&` here.)
			if (isActiveSlab & (bodyIdA >= bodyOffset))
			{
				atomicOr(&iterativeData.solverEncodedReferenceCount[slabIndexOffset + bodyIdA], encodedSlabIndex);
			}
			if (isActiveSlab & (bodyIdB >= bodyOffset))
			{
				atomicOr(&iterativeData.solverEncodedReferenceCount[slabIndexOffset + bodyIdB], encodedSlabIndex);
			}
		}
	}
}
// Marking active slabs loosely following "artiSolveBlockPartition"
// Marking active slabs loosely following "artiSolveBlockPartition".
// One warp per articulation constraint batch; each lane handles one constraint.
// Active slabs are recorded as bits in solverEncodedReferenceCount (32 slabs
// packed per PxU32), for both articulation links and rigid bodies involved.
// Fix/cleanup: removed dead locals from the original (nodeIndexA/B,
// linkIndexA/B, and an outer vel0/vel1 pair that was shadowed by the inner
// declaration in the contact branch). No behavioral change.
// NOTE(review): artiDesc is currently unused; kept for interface stability.
static __device__ void markActiveSlab_articulationPGS(
	const PxgSolverCoreDesc* PX_RESTRICT solverDesc, const PxgSolverSharedDesc<IterativeSolveData>* PX_RESTRICT sharedDesc,
	const PxU32 islandIndex, const PxU32 lastPartition, const PxgArticulationCoreDesc* const PX_RESTRICT artiDesc)
{
	const PxgIslandContext& island = solverDesc->islandContextPool[islandIndex];
	const PxU32 startPartitionIndex = island.mStartPartitionIndex;
	const PxU32 startIndex = island.mArtiBatchStartIndex;
	// Articulation batches are laid out after the rigid-body batches.
	const PxU32 articulationBatchOffset = solverDesc->islandContextPool->mBatchCount;
	const PxU32 endIndex = solverDesc->artiConstraintsPerPartition[lastPartition + startPartitionIndex] + articulationBatchOffset;
	const IterativeSolveData& iterativeData = sharedDesc->iterativeData;
	const PxU32 globalWarpIndex = blockIdx.x * blockDim.y + threadIdx.y;
	const PxU32 threadIndexInWarp = threadIdx.x;
	const PxU32 batchIndex = startIndex + globalWarpIndex + articulationBatchOffset;
	const PxU32 bodyOffset = island.mBodyStartIndex;
	const PxU32 numDynamicBodies = solverDesc->islandContextPool->mBodyCount; //nbBodies minus offset!
	if (batchIndex < endIndex)
	{
		const PxgBlockConstraintBatch& batch = iterativeData.blockConstraintBatch[batchIndex];
		// Only lanes below mDescStride map to a real constraint in this batch.
		if (threadIndexInWarp < batch.mDescStride)
		{
			const PxNodeIndex igNodeIndexA = batch.bodyANodeIndex[threadIndexInWarp];
			const PxNodeIndex igNodeIndexB = batch.bodyBNodeIndex[threadIndexInWarp];
			const PxU32 slabId = batch.slabId[threadIndexInWarp];
			const PxU32 bodyIdA = batch.bodyAIndex[threadIndexInWarp];
			const PxU32 bodyIdB = batch.bodyBIndex[threadIndexInWarp];
			const PxU32 numArticulations = solverDesc->islandContextPool->mArticulationCount;
			const PxU32 numTotalBodies = bodyOffset + numDynamicBodies + numArticulations;
			// 32 slabs packed per PxU32: word group for this slab + bit within the word.
			const PxU32 slabIndexOffset = (slabId / 32) * numTotalBodies;
			const PxU32 encodedSlabIndex = (1u << (slabId % 32));
			bool isActiveSlab = false;
			const PxU32 constraintType = batch.constraintType;
			if (constraintType == PxgSolverConstraintDesc::eARTICULATION_CONSTRAINT_1D) // joint
			{
				// For joint constraints, simply mark slabs as active for efficiency.
				// This does not significantly affect or increase the reference count in practice.
				isActiveSlab = true;
			}
			else // contact
			{
				// Per-batch velocity layout: 128 float4 slots per batch
				// (body A lin, body A ang, body B lin, body B ang — 32 lanes each).
				const PxU32 readIndex = batchIndex * 128 + threadIndexInWarp;
				const PxU32 outputOffset = solverDesc->accumulatedBodyDeltaVOffset;
				const PxU32 totalBodiesIncKinematics = numDynamicBodies + bodyOffset;
				Cm::UnAlignedSpatialVector vel0, vel1;
				if (igNodeIndexA.isArticulation())
				{
					// For articulations, read velocities using readIndex as done in artiSolveBlockPartition.
					const float4 lin = Pxldcg(iterativeData.solverBodyVelPool[readIndex]);
					const float4 ang = Pxldcg(iterativeData.solverBodyVelPool[readIndex + 32]);
					vel0 = Cm::UnAlignedSpatialVector(PxVec3(ang.x, ang.y, ang.z), PxVec3(lin.x, lin.y, lin.z));
				}
				else
				{
					// For rigid bodies, use the original rigid body velocity, not a slab velocity, in case velocities
					// at readIndex are not set.
					const PxU32 finalIdA = outputOffset + bodyIdA;
					const float4 lin = Pxldcg(iterativeData.solverBodyVelPool[finalIdA]);
					const float4 ang = Pxldcg(iterativeData.solverBodyVelPool[finalIdA + totalBodiesIncKinematics]);
					vel0 = Cm::UnAlignedSpatialVector(PxVec3(ang.x, ang.y, ang.z), PxVec3(lin.x, lin.y, lin.z));
				}
				if (igNodeIndexB.isArticulation())
				{
					// For articulations, read velocities using readIndex as done in artiSolveBlockPartition.
					const float4 lin = Pxldcg(iterativeData.solverBodyVelPool[readIndex + 64]);
					const float4 ang = Pxldcg(iterativeData.solverBodyVelPool[readIndex + 96]);
					vel1 = Cm::UnAlignedSpatialVector(PxVec3(ang.x, ang.y, ang.z), PxVec3(lin.x, lin.y, lin.z));
				}
				else
				{
					// For rigid bodies, use the original rigid body velocity, not a slab velocity, in case velocities
					// at readIndex are not set.
					const PxU32 finalIdB = outputOffset + bodyIdB;
					const float4 lin = Pxldcg(iterativeData.solverBodyVelPool[finalIdB]);
					const float4 ang = Pxldcg(iterativeData.solverBodyVelPool[finalIdB + totalBodiesIncKinematics]);
					vel1 = Cm::UnAlignedSpatialVector(PxVec3(ang.x, ang.y, ang.z), PxVec3(lin.x, lin.y, lin.z));
				}
				// Check if the contact/normal constraint is active.
				isActiveSlab = checkExtActiveContactBlock(batch, vel0, vel1, iterativeData.blockContactHeaders,
					iterativeData.blockContactPoints, iterativeData.artiResponse, threadIndexInWarp);
			}
			if (isActiveSlab)
			{
				// Encode which slab is active in a 32-bit index. When querying reference counts, count the number of
				// active slabs encoded in solverEncodedReferenceCount. solverEncodedReferenceCount contains
				// bitwise/slab-wise activation information.
				if (igNodeIndexA.isArticulation()) // articulation
				{
					const PxU32 articulationBodyIdA = batch.remappedBodyAIndex[threadIndexInWarp];
					// Articulation IDs are at the back of rigid body IDs.
					const PxU32 globalBodyIdA = articulationBodyIdA + numDynamicBodies + bodyOffset;
					atomicOr(&iterativeData.solverEncodedReferenceCount[slabIndexOffset + globalBodyIdA], encodedSlabIndex);
				}
				else if (bodyIdA >= bodyOffset) // rigid
				{
					atomicOr(&iterativeData.solverEncodedReferenceCount[slabIndexOffset + bodyIdA], encodedSlabIndex);
				}
				if (igNodeIndexB.isArticulation()) // articulation
				{
					const PxU32 articulationBodyIdB = batch.remappedBodyBIndex[threadIndexInWarp];
					// Articulation IDs are at the back of rigid body IDs.
					const PxU32 globalBodyIdB = articulationBodyIdB + numDynamicBodies + bodyOffset;
					atomicOr(&iterativeData.solverEncodedReferenceCount[slabIndexOffset + globalBodyIdB], encodedSlabIndex);
				}
				else if (bodyIdB >= bodyOffset) // rigid
				{
					atomicOr(&iterativeData.solverEncodedReferenceCount[slabIndexOffset + bodyIdB], encodedSlabIndex);
				}
			}
		}
	}
}
// Marking active rigid body slabs, loosely following "solveBlockPartition" and "artiSolveBlockPartition"
// Marking active rigid body slabs, loosely following "solveBlockPartition"
// and "artiSolveBlockPartition". Dispatches on blockIdx.y: row 0 handles
// rigid-rigid batches; any other row handles articulation batches.
extern "C" __global__
__launch_bounds__(PxgKernelBlockDim::SOLVE_BLOCK_PARTITION, 8)
void markActiveSlabPGS(const PxgSolverCoreDesc * PX_RESTRICT solverDesc,
	const PxgSolverSharedDesc<IterativeSolveData>* PX_RESTRICT sharedDesc,
	const PxU32 islandIndex, const PxU32 lastPartition, const PxgArticulationCoreDesc* const PX_RESTRICT artiDesc)
{
	const bool rigidRow = (blockIdx.y == 0);
	if (!rigidRow)
	{
		markActiveSlab_articulationPGS(solverDesc, sharedDesc, islandIndex, lastPartition, artiDesc);
	}
	else
	{
		markActiveSlab_rigidBodyPGS(solverDesc, sharedDesc, islandIndex, lastPartition);
	}
}
// Solves one partition of rigid-body constraint batches (PGS iteration).
// One warp per batch: each lane solves one constraint of the batch, reads the
// per-batch staged body velocities, runs the contact/1D solver, and writes the
// updated velocities back to the remapped per-batch slots.
extern "C" __global__
//__launch_bounds__(PxgKernelBlockDim::SOLVE_BLOCK_PARTITION, 16)
void solveBlockPartition(
	PxgSolverCoreDesc* PX_RESTRICT solverDesc, const PxgSolverSharedDesc<IterativeSolveData>* PX_RESTRICT sharedDesc,
	const PxU32 islandIndex, const PxU32 partitionIndex, bool doFriction)
{
	const PxgIslandContext& island = solverDesc->islandContextPool[islandIndex];
	const PxU32 startPartitionIndex = island.mStartPartitionIndex;
	// Partition 0 starts at the island's first batch; later partitions start
	// where the previous partition's batch range ended.
	PxU32 startIndex = partitionIndex == 0 ? island.mBatchStartIndex : solverDesc->constraintsPerPartition[partitionIndex + startPartitionIndex - 1];
	PxU32 endIndex = solverDesc->constraintsPerPartition[partitionIndex + startPartitionIndex];
	const uint warpSize = 32;
	//This identifies which warp a specific thread is in
	const uint warpIndex = (threadIdx.x + blockIdx.x * blockDim.x)/warpSize;
	//This identifies which thread within a warp a specific thread is
	const uint threadIndexInWarp = threadIdx.x&(warpSize-1);
	// Stage the iterative-solve descriptor into shared memory, copied
	// float-by-float by the block's threads.
	// NOTE(review): the loop steps by warpSize while every thread of the block
	// participates, so with blockDim.x > 32 different threads redundantly write
	// the same values — benign but presumably only the first warp was intended.
	__shared__ IterativeSolveData iterativeData;
	PxU32 idx = threadIdx.x;
	//if(threadIdx.x < sizeof(iterativeData)/sizeof(float))
	while(idx < sizeof(iterativeData) / sizeof(float))
	{
		float* iterData = reinterpret_cast<float*>(&iterativeData);
		iterData[idx] = reinterpret_cast<const float*>(&sharedDesc->iterativeData)[idx];
		idx += warpSize;
	}
	__syncthreads();
	// A non-negative counter means residual/error accumulation is enabled.
	bool residualAccumulationEnabled = solverDesc->contactErrorAccumulator.mCounter >= 0;
	PxgErrorAccumulator error;
	//for(uint k = startIndex + warpIndex; k < endIndex; k+=blockStride)
	uint k = startIndex + warpIndex;
	if(k < endIndex)
	{
		assert(k < solverDesc->numBatches);
		const PxgBlockConstraintBatch& batch = iterativeData.blockConstraintBatch[k];
		// Per-batch velocity layout: 128 float4 slots per batch.
		const PxU32 readIndex = k*128 + threadIndexInWarp;
		//Pull out shared memory into float4 format in registers to solve constraints
		if(threadIndexInWarp < batch.mDescStride)
		{
			//The linear/angular velocity pair for body 0 for threads 0 - 15, loaded by threads 0 - 31 in the format (b0.linVel, b0.angVel, b0.linVel, b0.angVel...)
			float4 linVel0 = Pxldcg(iterativeData.solverBodyVelPool[readIndex]);
			//The linear/angular velocity pair for body 0 for threads 16-31, loaded by threads 0 - 31 in the format (b0.linVel, b0.angVel, b0.linVel, b0.angVel...)
			float4 angVel0 = Pxldcg(iterativeData.solverBodyVelPool[readIndex + 32]);
			//The linear/angular velocity pair for body 1 for threads 0 - 15, loaded by threads 0 - 31 in the format (b1.linVel, b1.angVel, b1.linVel, b1.angVel...)
			float4 linVel1 = Pxldcg(iterativeData.solverBodyVelPool[readIndex + 64]);
			//The linear/angular velocity pair for body 1 for threads 16 - 31, loaded by threads 0 - 31 in the format (b1.linVel, b1.angVel, b1.linVel, b1.angVel...)
			float4 angVel1 = Pxldcg(iterativeData.solverBodyVelPool[readIndex + 96]);
			//printf("Rigid ReadIndex = %i, linVel0=(%f, %f, %f), angVel0=(%f, %f, %f)\n", readIndex, linVel0.x, linVel0.y, linVel0.z, angVel0.x, angVel0.y, angVel0.z);
			//printf("Rigid ReadIndex = %i, linVel0=(%f, %f, %f), angVel0=(%f, %f, %f)\n", readIndex+64, linVel1.x, linVel1.y, linVel1.z, angVel1.x, angVel1.y, angVel1.z);
			PxVec3 lv0(linVel0.x, linVel0.y, linVel0.z);
			PxVec3 lv1(linVel1.x, linVel1.y, linVel1.z);
			PxVec3 av0(angVel0.x, angVel0.y, angVel0.z);
			PxVec3 av1(angVel1.x, angVel1.y, angVel1.z);
			// Reference counts to be used for the current sub-timestep or iteration.
			PxReal curRef0 = 1.0f;
			PxReal curRef1 = 1.0f;
			const PxU32 bodyOffset = solverDesc->islandContextPool->mBodyStartIndex;
			const PxU32 numDynamicBodies = solverDesc->islandContextPool->mBodyCount; //nbBodies minus offset!
			const PxU32 numArticulations = solverDesc->islandContextPool->mArticulationCount;
			const PxU32 numTotalBodies = bodyOffset + numDynamicBodies + numArticulations;
			const PxU32* const PX_RESTRICT encodedReferenceCount = sharedDesc->iterativeData.solverEncodedReferenceCount;
			// Bodies below bodyOffset are kinematic/static and keep refcount 1.
			if(batch.bodyAIndex[threadIndexInWarp] >= bodyOffset)
			{
				// Counting the number of active slabs
				curRef0 = static_cast<PxReal>(
					countActiveSlabs(batch.bodyAIndex[threadIndexInWarp], solverDesc->numSlabs, numTotalBodies, encodedReferenceCount));
			}
			if(batch.bodyBIndex[threadIndexInWarp] >= bodyOffset)
			{
				// Counting the number of active slabs
				curRef1 = static_cast<PxReal>(
					countActiveSlabs(batch.bodyBIndex[threadIndexInWarp], solverDesc->numSlabs, numTotalBodies, encodedReferenceCount));
			}
			if (batch.constraintType == PxgSolverConstraintDesc::eCONTACT)
				solveContactBlock(batch, lv0, av0, lv1, av1, doFriction, threadIndexInWarp, iterativeData.blockContactHeaders, iterativeData.blockFrictionHeaders,
					iterativeData.blockContactPoints, iterativeData.blockFrictions, residualAccumulationEnabled ? &error : NULL,
					curRef0, curRef1);
			else
				solve1DBlock(batch, lv0, av0, lv1, av1, threadIndexInWarp, iterativeData.blockJointConstraintHeaders, iterativeData.blockJointConstraintRowsCon,
					iterativeData.blockJointConstraintRowsMod, solverDesc->contactErrorAccumulator.mCounter >= 0, curRef0, curRef1);
			// Write the updated velocities back to the remapped per-batch slots
			// (angular stored 32 float4s after linear).
			const PxU32 remapA = batch.remappedBodyAIndex[threadIndexInWarp];
			const PxU32 remapB = batch.remappedBodyBIndex[threadIndexInWarp];
			const PxU32 indexA = ComputeBodyBatchStartIndex(remapA);
			const PxU32 indexB = ComputeBodyBatchStartIndex(remapB);
			Pxstcg(&iterativeData.solverBodyVelPool[indexA], make_float4(lv0.x, lv0.y, lv0.z, 0.f));
			Pxstcg(&iterativeData.solverBodyVelPool[indexA + 32], make_float4(av0.x, av0.y, av0.z, 0.f));
			Pxstcg(&iterativeData.solverBodyVelPool[indexB], make_float4(lv1.x, lv1.y, lv1.z, 0.f));
			Pxstcg(&iterativeData.solverBodyVelPool[indexB + 32], make_float4(av1.x, av1.y, av1.z, 0.f));
		}
		//KS - Even threads in a warp output the linear velocities, odd threads output the angular velocities.
		//We output for threadIndexInWarp/2 and 16 + threadIndexInWarp/2
		// Disabled alternative shuffle-based output path, kept for reference.
#if 0
		const PxU32 firstWarpOutput = threadIndexInWarp/2;
		const PxU32 secondWarpOutput = threadIndexInWarp/2 + 16;
		float4 indexAOutput0 = cudeShuffle3(threadIndexInWarp & 1, angVel0, linVel0, firstWarpOutput);
		float4 indexBOutput0 = cudeShuffle3(threadIndexInWarp & 1, angVel1, linVel1, firstWarpOutput);
		float4 indexAOutput1 = cudeShuffle3(threadIndexInWarp & 1, angVel0, linVel0, secondWarpOutput);
		float4 indexBOutput1 = cudeShuffle3(threadIndexInWarp & 1, angVel1, linVel1, secondWarpOutput);
		if(firstWarpOutput < batch.mDescStride)
		{
			const PxU32 indexA = (2*batch.remappedBodyAIndex[firstWarpOutput] + (threadIdx.x&1));
			const PxU32 indexB = (2*batch.remappedBodyBIndex[firstWarpOutput] + (threadIdx.x&1));
			iterativeData.solverBodyVelPool[indexA] = indexAOutput0;
			iterativeData.solverBodyVelPool[indexB] = indexBOutput0;
		}
		if(secondWarpOutput < batch.mDescStride)
		{
			const PxU32 indexA = (2*batch.remappedBodyAIndex[secondWarpOutput] + (threadIdx.x&1));
			const PxU32 indexB = (2*batch.remappedBodyBIndex[secondWarpOutput] + (threadIdx.x&1));
			iterativeData.solverBodyVelPool[indexA] = indexAOutput1;
			iterativeData.solverBodyVelPool[indexB] = indexBOutput1;
		}
#else
#endif
	}
	if (residualAccumulationEnabled)
	{
		// Warp-wide reduction of the accumulated residual into the global accumulator.
		error.accumulateErrorGlobalFullWarp(solverDesc->contactErrorAccumulator, threadIndexInWarp);
	}
}
// Writes solver results (applied forces, friction patches, joint writeback)
// back to the output buffers, one warp per constraint batch. Contact batches
// may append Dy::ThresholdStreamElement entries: each warp buffers up to 32
// elements in shared memory, then reserves a contiguous range in the global
// threshold stream with a single atomicAdd and copies them out.
extern "C" __global__ void writebackBlocks(
	PxgSolverCoreDesc* constraintPrepDesc,
	PxgSolverSharedDesc<IterativeSolveData>* sharedDesc, const PxU32 islandIndex)
{
	PxgIslandContext& island = constraintPrepDesc->islandContextPool[islandIndex];
	const PxU32 startIndex = island.mBatchStartIndex;
	// All batch families of this island laid out contiguously.
	const PxU32 endIndex = island.mBatchCount + island.mArtiBatchCount + island.mStaticArtiBatchCount +
		island.mSelfArtiBatchCount + island.mStaticRigidBatchCount + startIndex;
	const uint warpSize = 32;
	//const uint blockStride = (blockDim.x * gridDim.x)/warpSize;
	//This identifies which warp a specific thread is in
	const uint warpIndex = (threadIdx.x + blockIdx.x * blockDim.x)/warpSize;
	const uint warpIndexInBlock = threadIdx.x/warpSize;
	//This identifies which thread within a warp a specific thread is
	const uint threadIndexInWarp = threadIdx.x&(warpSize-1);
	PxgBlockConstraintBatch* batchHeaders = sharedDesc->iterativeData.blockConstraintBatch;
	PxgBlockSolverConstraint1DHeader* jointHeaders = sharedDesc->iterativeData.blockJointConstraintHeaders;
	PxgBlockSolverConstraint1DCon* jointRowsCon = sharedDesc->iterativeData.blockJointConstraintRowsCon;
	PxgBlockSolverConstraint1DMod* jointRowsMod = sharedDesc->iterativeData.blockJointConstraintRowsMod;
	PxgConstraintWriteback* constraintWriteBack = constraintPrepDesc->constraintWriteBack;
	PxgSolverBodyData* solverBodyDatas = constraintPrepDesc->solverBodyDataPool;
	PxgBlockSolverContactHeader* contactHeaders = sharedDesc->iterativeData.blockContactHeaders;
	PxgBlockSolverFrictionHeader* frictionHeaders = sharedDesc->iterativeData.blockFrictionHeaders;
	PxgBlockSolverContactPoint* contactPoints = sharedDesc->iterativeData.blockContactPoints;
	PxgBlockSolverContactFriction* frictions = sharedDesc->iterativeData.blockFrictions;
	PxgBlockFrictionPatch* baseFrictionPatches = sharedDesc->blockCurrentFrictionPatches;
	PxF32* baseWritebackForceBuffer = constraintPrepDesc->forceBuffer;
	PxgFrictionPatchGPU* frictionPatches = reinterpret_cast<PxgFrictionPatchGPU*>(constraintPrepDesc->frictionPatches);
	// Raw byte buffer reinterpreted as ThresholdStreamElement, presumably to
	// avoid default construction of shared-memory elements.
	//__shared__ Dy::ThresholdStreamElement elems[PxgKernelBlockDim::WRITEBACK_BLOCKS];
	__shared__ PxU8 elemsMem[sizeof(Dy::ThresholdStreamElement)*PxgKernelBlockDim::WRITEBACK_BLOCKS];
	Dy::ThresholdStreamElement* elems = reinterpret_cast<Dy::ThresholdStreamElement*>(elemsMem);
	// Per-warp count of threshold elements buffered this round.
	__shared__ PxI32 index[PxgKernelBlockDim::WRITEBACK_BLOCKS/warpSize];
	// Each warp owns a 32-element window of the shared buffer.
	Dy::ThresholdStreamElement* startAddress = &elems[32*warpIndexInBlock];
	//for(uint k = startIndex + warpIndex; k < endIndex; k+=blockStride)
	uint k = startIndex + warpIndex;
	if(k < endIndex)
	{
		if(threadIndexInWarp == 0)
			index[warpIndexInBlock] = 0;
		// Ensure the zeroed counter is visible to the whole warp before use.
		__syncwarp();
		//Get block header
		const PxgBlockConstraintBatch& batch = batchHeaders[k];
		if(threadIndexInWarp < batch.mDescStride)
		{
			if(batch.constraintType==PxgSolverConstraintDesc::eCONTACT || batch.constraintType == PxgSolverConstraintDesc::eARTICULATION_CONTACT)
			{
				writeBackContactBlock(batch, threadIndexInWarp, solverBodyDatas, startAddress, &index[warpIndexInBlock], contactHeaders, frictionHeaders, contactPoints, frictions,
					baseWritebackForceBuffer, baseFrictionPatches[batch.mConstraintBatchIndex], frictionPatches);
			}
			else
			{
				//Do nothing (for now)
				writeBack1DBlock(batch, threadIndexInWarp, jointHeaders, jointRowsCon, jointRowsMod, constraintWriteBack);
			}
		}
		//__syncthreads();
		__syncwarp();
		PxI32 ind = index[warpIndexInBlock];
		if(ind > 0)
		{
			// Lane 0 reserves `ind` slots in the shared threshold stream; the
			// warp then copies its buffered elements into the reserved range.
			// (This shared `startIndex` array intentionally shadows the outer
			// `startIndex` batch constant.)
			__shared__ PxI32 startIndex[PxgKernelBlockDim::WRITEBACK_BLOCKS/warpSize];
			if(threadIndexInWarp == 0)
			{
				startIndex[warpIndexInBlock] = atomicAdd(&constraintPrepDesc->sharedThresholdStreamIndex, ind);
			}
			__syncwarp();
			if(threadIndexInWarp < ind)
			{
				//((float4*)constraintPrepDesc->thresholdStream)[startIndex[warpIndexInBlock]+threadIndexInWarp] = ((float4*)startAddress)[threadIndexInWarp];
				constraintPrepDesc->thresholdStream[startIndex[warpIndexInBlock]+threadIndexInWarp] = startAddress[threadIndexInWarp];
			}
		}
	}
}
// Concludes all constraint batches of an island at the end of the solve:
// contact batches go through concludeContactBlock, joint (1D) batches through
// conclude1DBlock. One warp per batch; each lane handles one constraint.
extern "C" __global__ void concludeBlocks(
	PxgSolverCoreDesc* constraintPrepDesc,
	PxgSolverSharedDesc<IterativeSolveData>* sharedDesc, const PxU32 islandIndex)
{
	const uint warpSize = 32;

	PxgIslandContext& island = constraintPrepDesc->islandContextPool[islandIndex];

	// All batch families of this island (rigid, articulation, static-arti,
	// self-arti, static-rigid) are laid out contiguously after mBatchStartIndex.
	const PxU32 firstBatch = island.mBatchStartIndex;
	const PxU32 batchEnd = firstBatch + island.mBatchCount + island.mArtiBatchCount +
		island.mStaticArtiBatchCount + island.mSelfArtiBatchCount + island.mStaticRigidBatchCount;

	// One warp per constraint batch; laneId is this thread's slot in its warp.
	const uint globalWarpId = (threadIdx.x + blockIdx.x * blockDim.x) / warpSize;
	const uint laneId = threadIdx.x & (warpSize - 1);

	const IterativeSolveData& iterData = sharedDesc->iterativeData;

	const uint batchId = firstBatch + globalWarpId;
	if (batchId < batchEnd)
	{
		// Fetch this warp's batch header.
		const PxgBlockConstraintBatch& batch = iterData.blockConstraintBatch[batchId];
		if (laneId < batch.mDescStride)
		{
			const bool isContact = batch.constraintType == PxgSolverConstraintDesc::eCONTACT ||
				batch.constraintType == PxgSolverConstraintDesc::eARTICULATION_CONTACT;
			if (isContact)
			{
				concludeContactBlock(batch, laneId, iterData.blockContactHeaders, iterData.blockFrictionHeaders,
					iterData.blockContactPoints, iterData.blockFrictions);
			}
			else
			{
				conclude1DBlock(batch, laneId, iterData.blockJointConstraintHeaders, iterData.blockJointConstraintRowsMod);
			}
		}
	}
}
// Copies the final solver body velocities (from the delta-V output region of
// the solver velocity pool) into the motion velocity array: linear velocity at
// the body slot, angular velocity offset by the total body count. One thread
// per body.
extern "C" __global__ void writeBackBodies(
	const PxgSolverCoreDesc* constraintPrepDesc,
	const PxgSolverSharedDesc<IterativeSolveData>* sharedDesc, const PxU32 islandIndex)
{
	// Thread 0 reads the descriptors once and broadcasts via shared memory.
	__shared__ float4* bodyVelocities;
	__shared__ float4* motionVelocities;
	__shared__ PxU32 bodyStartIndex;
	__shared__ PxU32 bodyEndIndex;
	__shared__ PxU32 totalBodyCount;
	__shared__ PxU32 outputOffsetIndex;
	if(threadIdx.x == 0)
	{
		PxgIslandContext& island = constraintPrepDesc->islandContextPool[islandIndex];
		bodyVelocities = sharedDesc->iterativeData.solverBodyVelPool;
		motionVelocities = constraintPrepDesc->motionVelocityArray;
		bodyStartIndex = island.mBodyStartIndex;
		totalBodyCount = constraintPrepDesc->numSolverBodies;
		bodyEndIndex = totalBodyCount;
		outputOffsetIndex = sharedDesc->deltaOutOffset;
	}
	__syncthreads();
	//const uint blockStride = (blockDim.x * gridDim.x);
	//This identifies which warp a specific thread is in
	const uint threadIndex = (threadIdx.x + blockIdx.x * blockDim.x);
	//for(uint a = threadIndex+bodyStartIndex; a < bodyEndIndex; a+=blockStride)
	uint a = threadIndex+bodyStartIndex;
	if(a < bodyEndIndex)
	{
		motionVelocities[a] = bodyVelocities[a + outputOffsetIndex]; //Linear velocity
		motionVelocities[a+totalBodyCount] = bodyVelocities[a + outputOffsetIndex + totalBodyCount]; //Angular velocity stored after all linear velocities
	}
}
// Averages per-slab velocity deltas back onto each solver body. Each body is
// processed by a sub-warp of numThreadsPerBody lanes (a power of two, capped
// at 32): lanes accumulate deltas from the slabs they cover, a shuffle-XOR
// reduction combines them within the sub-warp, and lane 0 of the sub-warp
// divides by the active-slab reference count, resets the encoded slab bits,
// and writes the averaged velocity back.
extern "C" __global__ void computeAverageSolverBodyVelocity(
	const PxgSolverCoreDesc* const PX_RESTRICT solverDesc,
	const PxgSolverSharedDesc<IterativeSolveData>* PX_RESTRICT sharedDesc)
{
	//we need to fill in the writeIndex for the solver body data
	const PxgSolverReferences* const PX_RESTRICT solverReferences = solverDesc->solverBodyReferences;
	float4* bodyVelocities = sharedDesc->iterativeData.solverBodyVelPool;
	PxU32* encodedReferenceCount = sharedDesc->iterativeData.solverEncodedReferenceCount;
	const PxU32 totalBodyCount = solverDesc->islandContextPool->mBodyCount; //nbBodies minus offset!
	const PxU32 bodyOffset = solverDesc->islandContextPool->mBodyStartIndex;
	const PxU32 numSlabs = solverDesc->numSlabs;
	const PxU32 numBatches = solverDesc->numBatches;
	const PxU32 numArticBatches = solverDesc->numArticBatches;
	const PxU32 deltaVOffset = solverDesc->accumulatedBodyDeltaVOffset;
	// Start of the per-slab averaged-output region: all batch velocity slots
	// come first (PXG_BATCH_SIZE lanes * 2 bodies * 2 lin/ang per batch).
	const PxU32 averageOutputOffsets = (numBatches + numArticBatches) * PXG_BATCH_SIZE * 2 * 2;
	// Lanes per body: numSlabs rounded up to a power of two, capped at 32.
	const PxU32 numThreadsPerBody = PxMin(isPowerOfTwo(numSlabs) ? numSlabs : nextPowerOfTwo(numSlabs), 32u);
	const PxU32 numBodiesPerWarp = WARP_SIZE / numThreadsPerBody;
	const uint warpIndex = (blockIdx.x * blockDim.y + threadIdx.y);
	const uint warpBodyStartIndex = warpIndex * numBodiesPerWarp;
	const PxU32 threadIdInWorkUnit = threadIdx.x & (numThreadsPerBody - 1);
	const PxU32 subWarpIndex = threadIdx.x / numThreadsPerBody;
	const PxU32 bodyInWarp = threadIdx.x / numThreadsPerBody;
	// Contiguous lane mask covering exactly this body's sub-warp.
	const PxU32 maskHigh = ((1 << ((subWarpIndex + 1) * numThreadsPerBody)) - 1);
	const PxU32 maskLow = ((1 << ((subWarpIndex)*numThreadsPerBody)) - 1);
	const PxU32 mask = maskHigh - maskLow;
	const uint bodyId = warpBodyStartIndex + bodyInWarp;
	if (bodyId < totalBodyCount)
	{
		float4 linDelta = make_float4(0.f);
		float4 angDelta = make_float4(0.f);
		const PxU32 outputBody = deltaVOffset + bodyId + bodyOffset;
		//Store linear and angular velocity!!!
		const float4 lastLinearVel = bodyVelocities[outputBody];
		const float4 lastAngularVel = bodyVelocities[outputBody + totalBodyCount + bodyOffset];
		bool hasRef = false;
		//get out velocity for the correponding body in a slab and put on weight to calculate the final velocity for that body
		for (uint b = threadIdInWorkUnit; b < numSlabs; b += numThreadsPerBody)
		{
			const uint bodyIndex = bodyId * numSlabs + b;
			PxgSolverReferences reference = solverReferences[bodyIndex];
			PxU32 remappedBodyIndex = reference.mRemappedBodyIndex;
			// 0xFFFFFFFF marks a slab with no reference to this body.
			if (remappedBodyIndex != 0xFFFFFFFF)
			{
				//Outputs are grouped into little clusters of 32...
				const uint velIndex = averageOutputOffsets + ComputeAverageBodyBatchStartIndex(bodyIndex);
				const float4 curLinearDeltaV = bodyVelocities[velIndex];
				const float4 curAngularDeltaV = bodyVelocities[velIndex + 32];
				float4 lDelta = (curLinearDeltaV - lastLinearVel);
				float4 aDelta = (curAngularDeltaV - lastAngularVel);
				{
					linDelta += lDelta;
					angDelta += aDelta;
				}
				hasRef = true;
			}
		}
		//Now do the reduction...
		// Butterfly (XOR) reduction within the sub-warp; after this every lane
		// of the sub-warp holds the summed deltas.
#pragma unroll
		for (PxU32 reductionRadius = numThreadsPerBody >> 1; reductionRadius > 0; reductionRadius >>= 1)
		{
			linDelta.x += __shfl_xor_sync(mask, linDelta.x, reductionRadius, numThreadsPerBody);
			linDelta.y += __shfl_xor_sync(mask, linDelta.y, reductionRadius, numThreadsPerBody);
			linDelta.z += __shfl_xor_sync(mask, linDelta.z, reductionRadius, numThreadsPerBody);
			angDelta.x += __shfl_xor_sync(mask, angDelta.x, reductionRadius, numThreadsPerBody);
			angDelta.y += __shfl_xor_sync(mask, angDelta.y, reductionRadius, numThreadsPerBody);
			angDelta.z += __shfl_xor_sync(mask, angDelta.z, reductionRadius, numThreadsPerBody);
		}
		// True if any lane of this body's sub-warp found a slab reference.
		hasRef = (__ballot_sync(mask, hasRef) & mask);
		if (hasRef)
		{
			if (threadIdInWorkUnit == 0)
			{
				// Counting the number of active slabs
				const PxU32 numDynamicBodies = solverDesc->islandContextPool->mBodyCount; //nbBodies minus offset!
				const PxU32 numArticulations = solverDesc->islandContextPool->mArticulationCount;
				const PxU32 numTotalBodies = bodyOffset + numDynamicBodies + numArticulations;
				const PxU32 referenceCount =
					countActiveSlabs(bodyOffset + bodyId, solverDesc->numSlabs, numTotalBodies, encodedReferenceCount);
				const PxReal recipRefs = 1.f / static_cast<PxReal>(referenceCount);
				linDelta = linDelta * recipRefs;
				angDelta = angDelta * recipRefs;
				// Resetting rigid body reference count
				resetSlabCount(bodyOffset + bodyId, solverDesc->numSlabs, numTotalBodies, encodedReferenceCount);
				//Store linear and angular velocity!!!
				bodyVelocities[outputBody] = lastLinearVel + linDelta;
				bodyVelocities[outputBody + totalBodyCount + bodyOffset] = lastAngularVel + angDelta;
			}
		}
	}
}
// Broadcasts each rigid body's accumulated solver velocity back out to every
// per-slab velocity slot that still references the body, so subsequent solver
// passes read a consistent velocity regardless of which slab they work on.
//
// Thread mapping: a sub-warp of numThreadsPerBody lanes cooperates on one
// body; each lane scatters to a strided subset of that body's slabs.
// Assumes blockDim.x is a multiple of 32.
extern "C" __global__ void propagateSolverBodyVelocity(
	const PxgSolverCoreDesc* solverDesc,
	const PxgSolverSharedDesc<IterativeSolveData>* sharedDesc)
{
	PxgSolverReferences* references = solverDesc->solverBodyReferences;
	float4* velocities = sharedDesc->iterativeData.solverBodyVelPool;

	const PxU32 nbDynamicBodies = solverDesc->islandContextPool->mBodyCount;
	const PxU32 kinematicOffset = solverDesc->islandContextPool->mBodyStartIndex;
	const PxU32 nbSlabs = solverDesc->numSlabs;
	const PxU32 accumDeltaVOffset = solverDesc->accumulatedBodyDeltaVOffset;

	// Lanes assigned per body: nbSlabs rounded up to a power of two, capped at a full warp.
	const PxU32 lanesPerBody = PxMin(isPowerOfTwo(nbSlabs) ? nbSlabs : nextPowerOfTwo(nbSlabs), 32u);
	const PxU32 bodiesPerWarp = WARP_SIZE / lanesPerBody;

	const uint globalThread = threadIdx.x + blockIdx.x * blockDim.x;
	const uint firstBodyInWarp = (globalThread / WARP_SIZE) * bodiesPerWarp;
	const PxU32 laneInSubWarp = threadIdx.x & (lanesPerBody - 1);
	const uint body = firstBodyInWarp + ((threadIdx.x & 31) / lanesPerBody);

	if (body >= nbDynamicBodies)
		return;

	// Read the body's accumulated linear/angular velocity once per lane.
	const PxU32 accumIndex = accumDeltaVOffset + body + kinematicOffset;
	float4 linVel = velocities[accumIndex];
	const float4 angVel = velocities[accumIndex + nbDynamicBodies + kinematicOffset];
	linVel.w = 0.f; // clear the packed .w scalar before broadcasting

	// Scatter the velocity into every slab slot holding a live reference to this body.
	for (uint slab = laneInSubWarp; slab < nbSlabs; slab += lanesPerBody)
	{
		const PxU32 remapped = references[body * nbSlabs + slab].mRemappedBodyIndex;
		if (remapped != 0xFFFFFFFF)
		{
			const uint slot = ComputeBodyBatchStartIndex(remapped);
			velocities[slot] = linVel;
			velocities[slot + 32] = angVel;
		}
	}
}
// DMAs the force-change threshold-element stream back to host-visible memory,
// copying one 32-bit word per thread with a grid-stride loop.
// Assumes sizeof(Dy::ThresholdStreamElement) is a multiple of sizeof(PxU32).
extern "C" __global__ void dmaBackChangedElems(const PxgSolverCoreDesc* solverDesc, Dy::ThresholdStreamElement* hostChangedElems)
{
	const PxU32 nbElems = solverDesc->nbForceChangeElements;
	// Total number of 32-bit words to move.
	const PxU32 nbWords = (sizeof(Dy::ThresholdStreamElement) * nbElems) / sizeof(PxU32);

	const PxU32* srcWords = reinterpret_cast<const PxU32*>(solverDesc->forceChangeThresholdElements);
	PxU32* dstWords = reinterpret_cast<PxU32*>(hostChangedElems);

	const PxU32 stride = blockDim.x * gridDim.x;
	for (PxU32 w = threadIdx.x + blockIdx.x * blockDim.x; w < nbWords; w += stride)
		dstWords[w] = srcWords[w];
}
// Extracts the per-constraint solver residual (stored in the .w component of
// the angular-impulse writeback) into a flat array, one constraint per thread.
extern "C" __global__ void dmaConstraintResidual(const PxgConstraintWriteback* writebacks, PxReal* residuals, PxU32 count)
{
	const PxU32 idx = threadIdx.x + blockIdx.x * blockDim.x;
	if (idx >= count)
		return;
	residuals[idx] = writebacks[idx].angularImpulse_residual.w;
}
// Solves the rigid-vs-static constraint batches (contacts and 1D joints) that
// are excluded from the main partitioned solve. Each dynamic body's static
// constraints are split into "static slabs" of up to maxStaticPartitions
// constraints per type; one thread solves one (body, slab) pair and writes the
// resulting velocities to tempStaticBodyOutputPool, which
// propagateStaticSolverBodyVelocities later averages back into the main pool.
extern "C" __global__
//__launch_bounds__(PxgKernelBlockDim::SOLVE_BLOCK_PARTITION, 16)
void solveStaticBlock(
	PxgSolverCoreDesc* PX_RESTRICT solverDesc, const PxgSolverSharedDesc<IterativeSolveData>* PX_RESTRICT sharedDesc,
	const PxU32 islandIndex, const PxU32 nbStaticSlabs, const PxU32 maxStaticPartitions, bool doFriction)
{
	const PxgIslandContext& island = solverDesc->islandContextPool[islandIndex];
	const IterativeSolveData& iterativeData = sharedDesc->iterativeData;

	float4* PX_RESTRICT bodyVelocities = iterativeData.solverBodyVelPool;
	// Per-slab scratch output; consumed by propagateStaticSolverBodyVelocities.
	float4* PX_RESTRICT bodyOutVelocities = iterativeData.tempStaticBodyOutputPool;

	const PxU32 numDynamicBodies = island.mBodyCount; //nbBodies minus offset!
	const PxU32 bodyOffset = island.mBodyStartIndex; // index of the first dynamic body (kinematics precede it)
	const PxU32 deltaVOffset = solverDesc->accumulatedBodyDeltaVOffset;
	const PxU32 totalBodiesIncKinematics = numDynamicBodies + bodyOffset;

	// Round the body count up to a warp multiple so each static slab begins on
	// a warp boundary (the warp scans over batch masks below rely on this).
	const PxU32 numDynamicBodiesRounded = (island.mBodyCount + 31)&(~31); //Rounded to a multiple of 32

	const uint warpSize = 32;

	//This identifies which warp a specific thread is in
	const uint globalThreadIdx = (threadIdx.x + blockIdx.x * blockDim.x);

	//This identifies which thread within a warp a specific thread is
	const uint threadIndexInWarp = threadIdx.x&(warpSize - 1);

	const PxU32 nbDynamicBodiesToSolve = numDynamicBodiesRounded * nbStaticSlabs;

	// Decompose the flat thread index into (body, static slab).
	const uint bodyIndex = globalThreadIdx % numDynamicBodiesRounded;
	const PxU32 slabIdx = (globalThreadIdx / numDynamicBodiesRounded);
	// First constraint (of each type) this slab is responsible for.
	const PxU32 startIndex = slabIdx * maxStaticPartitions;

	// A negative counter means contact-error accumulation is disabled.
	bool residualAccumulationEnabled = solverDesc->contactErrorAccumulator.mCounter >= 0;
	PxgErrorAccumulator error;
	if (globalThreadIdx < nbDynamicBodiesToSolve)
	{
		if (bodyIndex < numDynamicBodies)
		{
			// Number of constraints of each type this slab handles: the body's total,
			// clamped to this slab's [startIndex, startIndex + maxStaticPartitions) window.
			PxU32 contactCount = PxMin(maxStaticPartitions, PxMax(startIndex, solverDesc->mRigidStaticContactCounts[bodyIndex]) - startIndex);
			PxU32 jointCount = PxMin(maxStaticPartitions, PxMax(startIndex, solverDesc->mRigidStaticJointCounts[bodyIndex]) - startIndex);

			// Where this slab's output velocities go in the scratch pool.
			const PxU32 outputBody = slabIdx * totalBodiesIncKinematics * 2;

			//printf("BodyIndex = %i, globalThreadIdx = %i, nbDynamicBodiesToSolve = %i, nbStaticSlabs = %i, contactCount = %i, startIndex = %i, outputBody = %i\n", bodyIndex, globalThreadIdx,
			//	nbDynamicBodiesToSolve, nbStaticSlabs, contactCount, startIndex, outputBody);

			if (contactCount != 0 || jointCount != 0)
			{
				//We have some constraints to solve...
				const PxU32 startContactIndex = solverDesc->mRigidStaticContactStartIndices[bodyIndex] + startIndex;
				const PxU32 startJointIndex = solverDesc->mRigidStaticJointStartIndices[bodyIndex] + startIndex;

				assert(startContactIndex >= solverDesc->numBatches);

				const PxU32 inputBody = deltaVOffset + bodyIndex + bodyOffset;

				//Load in velocity data...
				float4 linVel = bodyVelocities[inputBody];
				float4 angVel = bodyVelocities[inputBody + totalBodiesIncKinematics];

				// Body 0 is the dynamic body; body 1 stands in for the static body
				// and stays at zero velocity.
				PxVec3 lv0(linVel.x, linVel.y, linVel.z);
				PxVec3 lv1(0.f);
				PxVec3 av0(angVel.x, angVel.y, angVel.z);
				PxVec3 av1(0.f);

				// Solve joints first, then contacts, sequentially within the slab.
				for (PxU32 i = 0; i < jointCount; ++i)
				{
					const PxgBlockConstraintBatch& batch = iterativeData.blockConstraintBatch[startJointIndex + i];
					assert(batch.constraintType == PxgSolverConstraintDesc::eCONSTRAINT_1D);
					// This thread's element within the 32-wide batch.
					PxU32 idx = warpScanExclusive(batch.mask, threadIndexInWarp);

					// For interaction with static objects, mass-splitting is not used; thus, reference counts are 1.
					solve1DBlock(batch, lv0, av0, lv1, av1, idx, iterativeData.blockJointConstraintHeaders, iterativeData.blockJointConstraintRowsCon,
						iterativeData.blockJointConstraintRowsMod, solverDesc->contactErrorAccumulator.mCounter >= 0, 1.f, 1.f);
				}

				for (PxU32 i = 0; i < contactCount; ++i)
				{
					const PxgBlockConstraintBatch& batch = iterativeData.blockConstraintBatch[startContactIndex + i];
					assert(batch.constraintType == PxgSolverConstraintDesc::eCONTACT);
					PxU32 idx = warpScanExclusive(batch.mask, threadIndexInWarp);

					// For interaction with static objects, mass-splitting is not used; thus, reference counts are 1.
					solveContactBlock(batch, lv0, av0, lv1, av1, doFriction, idx, iterativeData.blockContactHeaders, iterativeData.blockFrictionHeaders,
						iterativeData.blockContactPoints, iterativeData.blockFrictions, residualAccumulationEnabled ? &error : NULL,
						1.f, 1.f);
				}

				//if (globalThreadIdx == 33)
				////if(startContactIndex == (solverDesc->numBatches))
				////if(warpScanExclusive(iterativeData.blockConstraintBatch[startContactIndex].mask, threadIndexInWarp) == 0)
				//{
				//	printf("%i: NumContacts = %i, beforeVel = (%f, %f, %f), afterlinVel = (%f, %f, %f), lv1 (%f, %f, %f), av1(%f, %f, %f), startContactIndex = %i, numBatches = %i\n",
				//		globalThreadIdx, contactCount, linVel.x, linVel.y, linVel.z, lv0.x, lv0.y, lv0.z, lv1.x, lv1.y, lv1.z, av1.x, av1.y, av1.z, startContactIndex, solverDesc->numBatches);
				//}

				// Pack the solved velocities back, preserving the original .w components.
				linVel.x = lv0.x; linVel.y = lv0.y; linVel.z = lv0.z;
				angVel.x = av0.x; angVel.y = av0.y; angVel.z = av0.z;

				/*printf("%i: BodyOutVelocities[%i] = (%f, %f, %f, %f), bodyOutVelocities[%i] = (%f, %f, %f, %f)\n",
					globalThreadIdx, bodyIndex + outputBody, linVel.x, linVel.y, linVel.z, linVel.w,
					bodyIndex + outputBody + totalBodiesIncKinematics, angVel.x, angVel.y, angVel.z, angVel.w);*/

				bodyOutVelocities[bodyIndex + outputBody] = linVel;
				bodyOutVelocities[bodyIndex + outputBody + totalBodiesIncKinematics] = angVel;
			}
		}
	}

	// Note: deliberately outside the bounds guard — every thread in the warp
	// (including those that solved nothing) must join the warp-wide reduction.
	if (residualAccumulationEnabled)
	{
		error.accumulateErrorGlobalFullWarp(solverDesc->contactErrorAccumulator, threadIndexInWarp);
	}
}
// Folds the per-slab velocities produced by solveStaticBlock back into the
// accumulated velocity pool by averaging them. One thread handles one dynamic
// body; bodies with no static contacts or joints are left untouched.
extern "C" __global__
//__launch_bounds__(PxgKernelBlockDim::SOLVE_BLOCK_PARTITION, 16)
void propagateStaticSolverBodyVelocities(
	const PxgSolverCoreDesc* PX_RESTRICT solverDesc, const PxgSolverSharedDesc<IterativeSolveData>* PX_RESTRICT sharedDesc,
	const PxU32 islandIndex, const PxU32 nbStaticSlabs, const PxU32 maxStaticPartitions)
{
	const PxgIslandContext& island = solverDesc->islandContextPool[islandIndex];
	const IterativeSolveData& iterativeData = sharedDesc->iterativeData;

	float4* PX_RESTRICT accumVelocities = iterativeData.solverBodyVelPool;
	float4* PX_RESTRICT slabVelocities = iterativeData.tempStaticBodyOutputPool;

	const PxU32 nbDynamicBodies = island.mBodyCount;
	const PxU32 kinematicOffset = island.mBodyStartIndex;
	const PxU32 accumDeltaVOffset = solverDesc->accumulatedBodyDeltaVOffset;
	const PxU32 totalBodies = nbDynamicBodies + kinematicOffset; // dynamics + kinematics

	const uint bodyIdx = threadIdx.x + blockIdx.x * blockDim.x;
	if (bodyIdx >= nbDynamicBodies)
		return;

	const PxU32 nbContacts = solverDesc->mRigidStaticContactCounts[bodyIdx];
	const PxU32 nbJoints = solverDesc->mRigidStaticJointCounts[bodyIdx];
	if (nbContacts == 0 && nbJoints == 0)
		return; // no static interactions for this body

	// Each static slab covers maxStaticPartitions constraints of each type;
	// the slab count is driven by whichever type has more constraints.
	const PxU32 maxOutputs = PxMax(nbContacts, nbJoints);

	// Sum the partial (linear, angular) velocities written by each slab.
	float4 linSum = make_float4(0.f);
	float4 angSum = make_float4(0.f);
	PxReal nbSlabsUsed = 0.f;
	PxU32 slot = bodyIdx;
	for (PxU32 i = 0; i < maxOutputs; i += maxStaticPartitions)
	{
		linSum += slabVelocities[slot];
		angSum += slabVelocities[slot + totalBodies];
		slot += totalBodies * 2; // each slab's output occupies a (linear, angular) pair of stripes
		nbSlabsUsed += 1.0f;
	}

	/*printf("BodyOutVelocities[%i] = (%f, %f, %f, %f), bodyOutVelocities[%i] = (%f, %f, %f, %f), scale = %f\n",
		globalThreadIdx + bodyOffset, vel0.x, vel0.y, vel0.z, vel0.w,
		globalThreadIdx + bodyOffset + totalBodiesIncKinematics, vel1.x, vel1.y, vel1.z, vel1.w, scale);*/

	// Average and write back into the accumulated-velocity pool.
	const PxReal invSlabs = 1.f / nbSlabsUsed;
	const PxU32 dst = accumDeltaVOffset + bodyIdx + kinematicOffset;
	accumVelocities[dst] = linSum * invSlabs;
	accumVelocities[dst + totalBodies] = angSum * invSlabs;
}