// Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved. // Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. // Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
#include "common/PxPhysXCommonConfig.h"
// NOTE(review): two '#include <...>' directives here lost their targets during text extraction
// (angle-bracket content was stripped file-wide). Restore them from the original source.
#include "PxgSolverBody.h"
#include "PxgSolverConstraintBlock1D.h"
#include "PxgSolverConstraintDesc.h"
#include "PxgConstraint.h"
#include "PxgConstraintBlock.h"
#include "PxgIslandContext.h"
#include "PxgSolverContext.h"
#include "cutil_math.h"
#include "PxgSolverCoreDesc.h"
#include "solverBlock.cuh"
#include "PxgSolverKernelIndices.h"
#include "PxgDynamicsConfiguration.h"
#include "PxgIntrinsics.h"
#include "stdio.h"
#include "assert.h"
#include "reduction.cuh"
#include "solver.cuh"
#include "PxgArticulationCoreDesc.h"

using namespace physx;

// Host-side anchor so the module containing these kernels gets linked/initialized.
extern "C" __host__ void initSolverKernels6() {}

// Maps a remapped body index to its slot in the per-batch velocity output area.
// The interleaved layout (commented out) is currently disabled — identity mapping is used.
PX_CUDA_CALLABLE PX_FORCE_INLINE PxU32 ComputeBodyBatchStartIndex(const PxU32 index)
{
	//return 2*(index & (~31)) + (index&31);
	return index;
}

// Clears motion velocities, the accumulated delta-V region, and all per-batch slab
// velocities before a solver pass. Launch: 1D grid-stride over the largest of the
// cleared ranges; all float4 entries are zeroed.
extern "C" __global__ void ZeroBodies(const PxgSolverCoreDesc* constraintPrepDesc, const PxgSolverSharedDesc<IterativeSolveData>* sharedDesc)
{
	__shared__ float4* bodyVelocities;
	__shared__ float4* motionVelocities;
	__shared__ uint totalNumBodies;
	__shared__ uint totalNumBodiesConstraints;
	__shared__ uint offset;

	if(threadIdx.x == 0)
	{
		bodyVelocities = sharedDesc->iterativeData.solverBodyVelPool;
		motionVelocities = constraintPrepDesc->motionVelocityArray;
		// *2: linear and angular velocity are stored as two separate float4 runs.
		totalNumBodies = constraintPrepDesc->numSolverBodies*2;
		// Each batch owns 32 lanes * 2 bodies * 2 (lin/ang) velocity slots.
		totalNumBodiesConstraints = (constraintPrepDesc->numBatches + constraintPrepDesc->numArticBatches) * 32 * 2 *2;
		offset = constraintPrepDesc->accumulatedBodyDeltaVOffset;
	}

	const uint blockStride = (blockDim.x * gridDim.x);
	//This identifies which warp a specific thread is in
	const uint threadIndex = (threadIdx.x + blockIdx.x * blockDim.x);

	__syncthreads();

	const float4 zero = make_float4(0.f);

	//(1) Set all motion velocities to zero.
	//(2) Set the delta velocities in the accumulation offset to zero
	for(uint a = threadIndex; a < totalNumBodies; a+=blockStride)
	{
		motionVelocities[a] = zero;
		bodyVelocities[offset + a] = zero;
	}

	//(2) Set all velocities to zero. Strictly, we only need to set the first instance of each body
	//in the solver to 0 but, for now, we'll just initialize them all to zero...
	for(uint a = threadIndex; a < totalNumBodiesConstraints; a+= blockStride)
	{
		bodyVelocities[a] = zero;
	}
}

// Plain global load; the cache-hinted inline-asm variant is kept for reference.
__device__ __inline__ float4 loadFloat4(const float4* PX_RESTRICT address)
{
	/*float4 ret;
	asm("ld.global.cg.v4.f32 {%0, %1, %2, %3}, [%4];" : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : "r"(address));
	return ret;*/
	return *address;
}

// Warp-shuffles the xyz components of two float4 registers from lane 'shuffleMask'
// and selects between the two results per-lane. Only used by the disabled (#if 0)
// output path in solveBlockPartition.
PX_FORCE_INLINE __device__ float4 cudeShuffle3(const PxU32 syncMask, bool condition, float4 reg0, float4 reg1, PxU32 shuffleMask)
{
	float4 ret0, ret1;
	ret0.x = __shfl_sync(syncMask, reg0.x, shuffleMask);
	ret0.y = __shfl_sync(syncMask, reg0.y, shuffleMask);
	ret0.z = __shfl_sync(syncMask, reg0.z, shuffleMask);
	ret1.x = __shfl_sync(syncMask, reg1.x, shuffleMask);
	ret1.y = __shfl_sync(syncMask, reg1.y, shuffleMask);
	ret1.z = __shfl_sync(syncMask, reg1.z, shuffleMask);
	return condition ? ret0 : ret1;
}

// Marking active slabs loosely following "solveBlockPartition".
// One warp per constraint batch (blockDim.y warps per block); lane == desc index.
// Sets a bit per active slab in solverEncodedReferenceCount via atomicOr.
static __device__ void markActiveSlab_rigidBodyPGS(
	const PxgSolverCoreDesc* PX_RESTRICT solverDesc,
	const PxgSolverSharedDesc<IterativeSolveData>* PX_RESTRICT sharedDesc,
	const PxU32 islandIndex, const PxU32 lastPartition)
{
	const PxgIslandContext& island = solverDesc->islandContextPool[islandIndex];
	const PxU32 startPartitionIndex = island.mStartPartitionIndex;
	const PxU32 startIndex = island.mBatchStartIndex;
	const PxU32 endIndex = solverDesc->constraintsPerPartition[lastPartition + startPartitionIndex];

	const IterativeSolveData& iterativeData = sharedDesc->iterativeData;

	const PxU32 globalWarpIndex = blockIdx.x * blockDim.y + threadIdx.y;
	const PxU32 threadIndexInWarp = threadIdx.x;
	const PxU32 batchIndex = startIndex + globalWarpIndex;

	if (batchIndex < endIndex)
	{
		const PxgBlockConstraintBatch& batch = iterativeData.blockConstraintBatch[batchIndex];

		if (threadIndexInWarp < batch.mDescStride)
		{
			const PxU32 bodyOffset = island.mBodyStartIndex;
			const PxU32 slabId = batch.slabId[threadIndexInWarp];
			const PxU32 outputOffset = solverDesc->accumulatedBodyDeltaVOffset; //deltaVOffset
			const PxU32 numDynamicBodies = solverDesc->islandContextPool->mBodyCount; //nbBodies minus offset!
			const PxU32 totalBodiesIncKinematics = numDynamicBodies + bodyOffset;

			const PxU32 bodyIdA = batch.bodyAIndex[threadIndexInWarp];
			const PxU32 bodyIdB = batch.bodyBIndex[threadIndexInWarp];

			const PxU32 numArticulations = solverDesc->islandContextPool->mArticulationCount;
			const PxU32 numTotalBodies = bodyOffset + numDynamicBodies + numArticulations;

			// 32 slabs share one PxU32; slabIndexOffset selects the word group, encodedSlabIndex the bit.
			const PxU32 slabIndexOffset = (slabId / 32) * numTotalBodies;
			const PxU32 encodedSlabIndex = (1u << (slabId % 32));

			bool isActiveSlab = false;

			if (batch.constraintType == PxgSolverConstraintDesc::eCONSTRAINT_1D)
			{
				// For joint constraints, simply mark slabs as active for efficiency.
				// This does not significantly affect or increase the reference count in practice.
				isActiveSlab = true;
			}
			else
			{
				const PxU32 finalIdA = outputOffset + bodyIdA;
				const PxU32 finalIdB = outputOffset + bodyIdB;

				const float4 linVel0 = Pxldcg(iterativeData.solverBodyVelPool[finalIdA]);
				const float4 angVel0 = Pxldcg(iterativeData.solverBodyVelPool[finalIdA + totalBodiesIncKinematics]);
				const float4 linVel1 = Pxldcg(iterativeData.solverBodyVelPool[finalIdB]);
				const float4 angVel1 = Pxldcg(iterativeData.solverBodyVelPool[finalIdB + totalBodiesIncKinematics]);

				PxVec3 lv0(linVel0.x, linVel0.y, linVel0.z);
				PxVec3 lv1(linVel1.x, linVel1.y, linVel1.z);
				PxVec3 av0(angVel0.x, angVel0.y, angVel0.z);
				PxVec3 av1(angVel1.x, angVel1.y, angVel1.z);

				// Check if the contact/normal constraint is active.
				isActiveSlab = checkActiveContactBlock(batch, lv0, av0, lv1, av1, threadIndexInWarp,
					iterativeData.blockContactHeaders, iterativeData.blockContactPoints);
			}

			// Encode which slab is active in a 32-bit index. When querying reference counts, count the
			// number of active slabs encoded in solverEncodedReferenceCount. solverEncodedReferenceCount
			// contains bitwise/slab-wise activation information.
			if (isActiveSlab & (bodyIdA >= bodyOffset))
			{
				atomicOr(&iterativeData.solverEncodedReferenceCount[slabIndexOffset + bodyIdA], encodedSlabIndex);
			}
			if (isActiveSlab & (bodyIdB >= bodyOffset))
			{
				atomicOr(&iterativeData.solverEncodedReferenceCount[slabIndexOffset + bodyIdB], encodedSlabIndex);
			}
		}
	}
}

// Marking active slabs loosely following "artiSolveBlockPartition".
// Same scheme as the rigid-body variant, but articulation bodies read their velocities
// from the per-batch slots (readIndex) and their reference-count IDs sit after all rigid IDs.
static __device__ void markActiveSlab_articulationPGS(
	const PxgSolverCoreDesc* PX_RESTRICT solverDesc,
	const PxgSolverSharedDesc<IterativeSolveData>* PX_RESTRICT sharedDesc,
	const PxU32 islandIndex, const PxU32 lastPartition,
	const PxgArticulationCoreDesc* const PX_RESTRICT artiDesc)
{
	const PxgIslandContext& island = solverDesc->islandContextPool[islandIndex];
	const PxU32 startPartitionIndex = island.mStartPartitionIndex;
	const PxU32 startIndex = island.mArtiBatchStartIndex;

	const PxU32 articulationBatchOffset = solverDesc->islandContextPool->mBatchCount;
	const PxU32 endIndex = solverDesc->artiConstraintsPerPartition[lastPartition + startPartitionIndex] + articulationBatchOffset;

	const IterativeSolveData& iterativeData = sharedDesc->iterativeData;

	const PxU32 globalWarpIndex = blockIdx.x * blockDim.y + threadIdx.y;
	const PxU32 threadIndexInWarp = threadIdx.x;
	// Articulation batches are appended after the rigid-body batches.
	const PxU32 batchIndex = startIndex + globalWarpIndex + articulationBatchOffset;

	const PxU32 bodyOffset = island.mBodyStartIndex;
	const PxU32 numDynamicBodies = solverDesc->islandContextPool->mBodyCount; //nbBodies minus offset!

	if (batchIndex < endIndex)
	{
		const PxgBlockConstraintBatch& batch = iterativeData.blockConstraintBatch[batchIndex];

		if (threadIndexInWarp < batch.mDescStride)
		{
			const PxNodeIndex igNodeIndexA = batch.bodyANodeIndex[threadIndexInWarp];
			const PxNodeIndex igNodeIndexB = batch.bodyBNodeIndex[threadIndexInWarp];

			const PxU32 slabId = batch.slabId[threadIndexInWarp];

			const PxU32 bodyIdA = batch.bodyAIndex[threadIndexInWarp];
			const PxU32 bodyIdB = batch.bodyBIndex[threadIndexInWarp];

			// NOTE(review): removed unused locals (nodeIndexA/B, linkIndexA/B) and an outer
			// vel0/vel1 pair that was shadowed by the declarations in the contact branch below.

			const PxU32 numArticulations = solverDesc->islandContextPool->mArticulationCount;
			const PxU32 numTotalBodies = bodyOffset + numDynamicBodies + numArticulations;

			const PxU32 slabIndexOffset = (slabId / 32) * numTotalBodies;
			const PxU32 encodedSlabIndex = (1u << (slabId % 32));

			bool isActiveSlab = false;

			const PxU32 constraintType = batch.constraintType;
			if (constraintType == PxgSolverConstraintDesc::eARTICULATION_CONSTRAINT_1D) // joint
			{
				// For joint constraints, simply mark slabs as active for efficiency.
				// This does not significantly affect or increase the reference count in practice.
				isActiveSlab = true;
			}
			else // contact
			{
				const PxU32 readIndex = batchIndex * 128 + threadIndexInWarp;
				const PxU32 outputOffset = solverDesc->accumulatedBodyDeltaVOffset;
				const PxU32 totalBodiesIncKinematics = numDynamicBodies + bodyOffset;

				Cm::UnAlignedSpatialVector vel0, vel1;

				if (igNodeIndexA.isArticulation())
				{
					// For articulations, read velocities using readIndex as done in artiSolveBlockPartition.
					const float4 lin = Pxldcg(iterativeData.solverBodyVelPool[readIndex]);
					const float4 ang = Pxldcg(iterativeData.solverBodyVelPool[readIndex + 32]);
					vel0 = Cm::UnAlignedSpatialVector(PxVec3(ang.x, ang.y, ang.z), PxVec3(lin.x, lin.y, lin.z));
				}
				else
				{
					// For rigid bodies, use the original rigid body velocity, not a slab velocity, in case
					// velocities at readIndex are not set.
					const PxU32 finalIdA = outputOffset + bodyIdA;
					const float4 lin = Pxldcg(iterativeData.solverBodyVelPool[finalIdA]);
					const float4 ang = Pxldcg(iterativeData.solverBodyVelPool[finalIdA + totalBodiesIncKinematics]);
					vel0 = Cm::UnAlignedSpatialVector(PxVec3(ang.x, ang.y, ang.z), PxVec3(lin.x, lin.y, lin.z));
				}

				if (igNodeIndexB.isArticulation())
				{
					// For articulations, read velocities using readIndex as done in artiSolveBlockPartition.
					const float4 lin = Pxldcg(iterativeData.solverBodyVelPool[readIndex + 64]);
					const float4 ang = Pxldcg(iterativeData.solverBodyVelPool[readIndex + 96]);
					vel1 = Cm::UnAlignedSpatialVector(PxVec3(ang.x, ang.y, ang.z), PxVec3(lin.x, lin.y, lin.z));
				}
				else
				{
					// For rigid bodies, use the original rigid body velocity, not a slab velocity, in case
					// velocities at readIndex are not set.
					const PxU32 finalIdB = outputOffset + bodyIdB;
					const float4 lin = Pxldcg(iterativeData.solverBodyVelPool[finalIdB]);
					const float4 ang = Pxldcg(iterativeData.solverBodyVelPool[finalIdB + totalBodiesIncKinematics]);
					vel1 = Cm::UnAlignedSpatialVector(PxVec3(ang.x, ang.y, ang.z), PxVec3(lin.x, lin.y, lin.z));
				}

				// Check if the contact/normal constraint is active.
				isActiveSlab = checkExtActiveContactBlock(batch, vel0, vel1,
					iterativeData.blockContactHeaders, iterativeData.blockContactPoints,
					iterativeData.artiResponse, threadIndexInWarp);
			}

			if (isActiveSlab)
			{
				// Encode which slab is active in a 32-bit index. When querying reference counts, count the
				// number of active slabs encoded in solverEncodedReferenceCount. solverEncodedReferenceCount
				// contains bitwise/slab-wise activation information.
				if (igNodeIndexA.isArticulation()) // articulation
				{
					const PxU32 articulationBodyIdA = batch.remappedBodyAIndex[threadIndexInWarp];
					// Articulation IDs are at the back of rigid body IDs.
					const PxU32 globalBodyIdA = articulationBodyIdA + numDynamicBodies + bodyOffset;
					atomicOr(&iterativeData.solverEncodedReferenceCount[slabIndexOffset + globalBodyIdA], encodedSlabIndex);
				}
				else if (bodyIdA >= bodyOffset) // rigid
				{
					atomicOr(&iterativeData.solverEncodedReferenceCount[slabIndexOffset + bodyIdA], encodedSlabIndex);
				}

				if (igNodeIndexB.isArticulation()) // articulation
				{
					const PxU32 articulationBodyIdB = batch.remappedBodyBIndex[threadIndexInWarp];
					// Articulation IDs are at the back of rigid body IDs.
					const PxU32 globalBodyIdB = articulationBodyIdB + numDynamicBodies + bodyOffset;
					atomicOr(&iterativeData.solverEncodedReferenceCount[slabIndexOffset + globalBodyIdB], encodedSlabIndex);
				}
				else if (bodyIdB >= bodyOffset) // rigid
				{
					atomicOr(&iterativeData.solverEncodedReferenceCount[slabIndexOffset + bodyIdB], encodedSlabIndex);
				}
			}
		}
	}
}

// Marking active rigid body slabs, loosely following "solveBlockPartition" and "artiSolveBlockPartition".
// blockIdx.y selects the workload: y==0 handles rigid-body batches, y==1 handles articulation batches.
extern "C" __global__
__launch_bounds__(PxgKernelBlockDim::SOLVE_BLOCK_PARTITION, 8)
void markActiveSlabPGS(const PxgSolverCoreDesc * PX_RESTRICT solverDesc,
	const PxgSolverSharedDesc<IterativeSolveData>* PX_RESTRICT sharedDesc,
	const PxU32 islandIndex, const PxU32 lastPartition,
	const PxgArticulationCoreDesc* const PX_RESTRICT artiDesc)
{
	if (blockIdx.y == 0)
	{
		markActiveSlab_rigidBodyPGS(solverDesc, sharedDesc, islandIndex, lastPartition);
	}
	else
	{
		markActiveSlab_articulationPGS(solverDesc, sharedDesc, islandIndex, lastPartition, artiDesc);
	}
}

// Solves one partition of rigid-body constraint batches: one warp per batch, one lane
// per constraint descriptor. Reads per-batch slab velocities, solves contact/1D rows
// scaled by active-slab reference counts, and writes the updated velocities back.
extern "C" __global__
//__launch_bounds__(PxgKernelBlockDim::SOLVE_BLOCK_PARTITION, 16)
void solveBlockPartition(PxgSolverCoreDesc* PX_RESTRICT solverDesc,
	const PxgSolverSharedDesc<IterativeSolveData>* PX_RESTRICT sharedDesc,
	const PxU32 islandIndex, const PxU32 partitionIndex, bool doFriction)
{
	const PxgIslandContext& island = solverDesc->islandContextPool[islandIndex];
	const PxU32 startPartitionIndex = island.mStartPartitionIndex;
	PxU32 startIndex = partitionIndex == 0 ? island.mBatchStartIndex : solverDesc->constraintsPerPartition[partitionIndex + startPartitionIndex - 1];
	PxU32 endIndex = solverDesc->constraintsPerPartition[partitionIndex + startPartitionIndex];

	const uint warpSize = 32;

	//This identifies which warp a specific thread is in
	const uint warpIndex = (threadIdx.x + blockIdx.x * blockDim.x)/warpSize;

	//This identifies which thread within a warp a specific thread is
	const uint threadIndexInWarp = threadIdx.x&(warpSize-1);

	// Stage the iterative-solve descriptor in shared memory, copied float-by-float.
	// The warpSize stride means threads of different warps write overlapping elements
	// with identical values — redundant but benign.
	__shared__ IterativeSolveData iterativeData;

	PxU32 idx = threadIdx.x;
	//if(threadIdx.x < sizeof(iterativeData)/sizeof(float))
	while(idx < sizeof(iterativeData) / sizeof(float))
	{
		float* iterData = reinterpret_cast<float*>(&iterativeData);
		iterData[idx] = reinterpret_cast<const float*>(&sharedDesc->iterativeData)[idx];
		idx += warpSize;
	}

	__syncthreads();

	bool residualAccumulationEnabled = solverDesc->contactErrorAccumulator.mCounter >= 0;
	PxgErrorAccumulator error;

	//for(uint k = startIndex + warpIndex; k < endIndex; k+=blockStride)
	uint k = startIndex + warpIndex;
	if(k < endIndex)
	{
		assert(k < solverDesc->numBatches);
		const PxgBlockConstraintBatch& batch = iterativeData.blockConstraintBatch[k];

		const PxU32 readIndex = k*128 + threadIndexInWarp;

		//Pull out shared memory into float4 format in registers to solve constraints
		if(threadIndexInWarp < batch.mDescStride)
		{
			//The linear/angular velocity pair for body 0 for threads 0 - 15, loaded by threads 0 - 31 in the format (b0.linVel, b0.angVel, b0.linVel, b0.angVel...)
			float4 linVel0 = Pxldcg(iterativeData.solverBodyVelPool[readIndex]);
			//The linear/angular velocity pair for body 0 for threads 16-31, loaded by threads 0 - 31 in the format (b0.linVel, b0.angVel, b0.linVel, b0.angVel...)
			float4 angVel0 = Pxldcg(iterativeData.solverBodyVelPool[readIndex + 32]);
			//The linear/angular velocity pair for body 1 for threads 0 - 15, loaded by threads 0 - 31 in the format (b1.linVel, b1.angVel, b1.linVel, b1.angVel...)
			float4 linVel1 = Pxldcg(iterativeData.solverBodyVelPool[readIndex + 64]);
			//The linear/angular velocity pair for body 1 for threads 16 - 31, loaded by threads 0 - 31 in the format (b1.linVel, b1.angVel, b1.linVel, b1.angVel...)
			float4 angVel1 = Pxldcg(iterativeData.solverBodyVelPool[readIndex + 96]);

			//printf("Rigid ReadIndex = %i, linVel0=(%f, %f, %f), angVel0=(%f, %f, %f)\n", readIndex, linVel0.x, linVel0.y, linVel0.z, angVel0.x, angVel0.y, angVel0.z);
			//printf("Rigid ReadIndex = %i, linVel0=(%f, %f, %f), angVel0=(%f, %f, %f)\n", readIndex+64, linVel1.x, linVel1.y, linVel1.z, angVel1.x, angVel1.y, angVel1.z);

			PxVec3 lv0(linVel0.x, linVel0.y, linVel0.z);
			PxVec3 lv1(linVel1.x, linVel1.y, linVel1.z);
			PxVec3 av0(angVel0.x, angVel0.y, angVel0.z);
			PxVec3 av1(angVel1.x, angVel1.y, angVel1.z);

			// Reference counts to be used for the current sub-timestep or iteration.
			PxReal curRef0 = 1.0f;
			PxReal curRef1 = 1.0f;

			const PxU32 bodyOffset = solverDesc->islandContextPool->mBodyStartIndex;
			const PxU32 numDynamicBodies = solverDesc->islandContextPool->mBodyCount; //nbBodies minus offset!
			const PxU32 numArticulations = solverDesc->islandContextPool->mArticulationCount;
			const PxU32 numTotalBodies = bodyOffset + numDynamicBodies + numArticulations;

			const PxU32* const PX_RESTRICT encodedReferenceCount = sharedDesc->iterativeData.solverEncodedReferenceCount;

			if(batch.bodyAIndex[threadIndexInWarp] >= bodyOffset)
			{
				// Counting the number of active slabs
				curRef0 = static_cast<PxReal>(countActiveSlabs(batch.bodyAIndex[threadIndexInWarp],
					solverDesc->numSlabs, numTotalBodies, encodedReferenceCount));
			}
			if(batch.bodyBIndex[threadIndexInWarp] >= bodyOffset)
			{
				// Counting the number of active slabs
				curRef1 = static_cast<PxReal>(countActiveSlabs(batch.bodyBIndex[threadIndexInWarp],
					solverDesc->numSlabs, numTotalBodies, encodedReferenceCount));
			}

			if (batch.constraintType == PxgSolverConstraintDesc::eCONTACT)
				solveContactBlock(batch, lv0, av0, lv1, av1, doFriction, threadIndexInWarp,
					iterativeData.blockContactHeaders, iterativeData.blockFrictionHeaders,
					iterativeData.blockContactPoints, iterativeData.blockFrictions,
					residualAccumulationEnabled ? &error : NULL, curRef0, curRef1);
			else
				solve1DBlock(batch, lv0, av0, lv1, av1, threadIndexInWarp,
					iterativeData.blockJointConstraintHeaders, iterativeData.blockJointConstraintRowsCon,
					iterativeData.blockJointConstraintRowsMod,
					solverDesc->contactErrorAccumulator.mCounter >= 0, curRef0, curRef1);

			const PxU32 remapA = batch.remappedBodyAIndex[threadIndexInWarp];
			const PxU32 remapB = batch.remappedBodyBIndex[threadIndexInWarp];

			const PxU32 indexA = ComputeBodyBatchStartIndex(remapA);
			const PxU32 indexB = ComputeBodyBatchStartIndex(remapB);

			Pxstcg(&iterativeData.solverBodyVelPool[indexA], make_float4(lv0.x, lv0.y, lv0.z, 0.f));
			Pxstcg(&iterativeData.solverBodyVelPool[indexA + 32], make_float4(av0.x, av0.y, av0.z, 0.f));
			Pxstcg(&iterativeData.solverBodyVelPool[indexB], make_float4(lv1.x, lv1.y, lv1.z, 0.f));
			Pxstcg(&iterativeData.solverBodyVelPool[indexB + 32], make_float4(av1.x, av1.y, av1.z, 0.f));
		}

		//KS - Even threads in a warp output the linear velocities, odd threads output the angular velocities.
		//We output for threadIndexInWarp/2 and 16 + threadIndexInWarp/2
#if 0
		const PxU32 firstWarpOutput = threadIndexInWarp/2;
		const PxU32 secondWarpOutput = threadIndexInWarp/2 + 16;

		float4 indexAOutput0 = cudeShuffle3(threadIndexInWarp & 1, angVel0, linVel0, firstWarpOutput);
		float4 indexBOutput0 = cudeShuffle3(threadIndexInWarp & 1, angVel1, linVel1, firstWarpOutput);
		float4 indexAOutput1 = cudeShuffle3(threadIndexInWarp & 1, angVel0, linVel0, secondWarpOutput);
		float4 indexBOutput1 = cudeShuffle3(threadIndexInWarp & 1, angVel1, linVel1, secondWarpOutput);

		if(firstWarpOutput < batch.mDescStride)
		{
			const PxU32 indexA = (2*batch.remappedBodyAIndex[firstWarpOutput] + (threadIdx.x&1));
			const PxU32 indexB = (2*batch.remappedBodyBIndex[firstWarpOutput] + (threadIdx.x&1));
			iterativeData.solverBodyVelPool[indexA] = indexAOutput0;
			iterativeData.solverBodyVelPool[indexB] = indexBOutput0;
		}

		if(secondWarpOutput < batch.mDescStride)
		{
			const PxU32 indexA = (2*batch.remappedBodyAIndex[secondWarpOutput] + (threadIdx.x&1));
			const PxU32 indexB = (2*batch.remappedBodyBIndex[secondWarpOutput] + (threadIdx.x&1));
			iterativeData.solverBodyVelPool[indexA] = indexAOutput1;
			iterativeData.solverBodyVelPool[indexB] = indexBOutput1;
		}
#else
#endif
	}
	if (residualAccumulationEnabled)
	{
		error.accumulateErrorGlobalFullWarp(solverDesc->contactErrorAccumulator, threadIndexInWarp);
	}
}

// Writes solved constraint results (applied forces, joint writeback) back to API-visible
// buffers and appends force-threshold events to the shared threshold stream.
// One warp per batch; per-warp staging of threshold elements in shared memory.
extern "C" __global__ void writebackBlocks(PxgSolverCoreDesc* constraintPrepDesc,
	PxgSolverSharedDesc<IterativeSolveData>* sharedDesc, const PxU32 islandIndex)
{
	PxgIslandContext& island = constraintPrepDesc->islandContextPool[islandIndex];
	const PxU32 startIndex = island.mBatchStartIndex;
	const PxU32 endIndex = island.mBatchCount + island.mArtiBatchCount + island.mStaticArtiBatchCount
		+ island.mSelfArtiBatchCount + island.mStaticRigidBatchCount + startIndex;

	const uint warpSize = 32;

	//const uint blockStride = (blockDim.x * gridDim.x)/warpSize;

	//This identifies which warp a specific thread is in
	const uint warpIndex = (threadIdx.x + blockIdx.x * blockDim.x)/warpSize;
	const uint warpIndexInBlock = threadIdx.x/warpSize;

	//This identifies which thread within a warp a specific thread is
	const uint threadIndexInWarp = threadIdx.x&(warpSize-1);

	PxgBlockConstraintBatch* batchHeaders = sharedDesc->iterativeData.blockConstraintBatch;
	PxgBlockSolverConstraint1DHeader* jointHeaders = sharedDesc->iterativeData.blockJointConstraintHeaders;
	PxgBlockSolverConstraint1DCon* jointRowsCon = sharedDesc->iterativeData.blockJointConstraintRowsCon;
	PxgBlockSolverConstraint1DMod* jointRowsMod = sharedDesc->iterativeData.blockJointConstraintRowsMod;
	PxgConstraintWriteback* constraintWriteBack = constraintPrepDesc->constraintWriteBack;
	PxgSolverBodyData* solverBodyDatas = constraintPrepDesc->solverBodyDataPool;

	PxgBlockSolverContactHeader* contactHeaders = sharedDesc->iterativeData.blockContactHeaders;
	PxgBlockSolverFrictionHeader* frictionHeaders = sharedDesc->iterativeData.blockFrictionHeaders;
	PxgBlockSolverContactPoint* contactPoints = sharedDesc->iterativeData.blockContactPoints;
	PxgBlockSolverContactFriction* frictions = sharedDesc->iterativeData.blockFrictions;
	PxgBlockFrictionPatch* baseFrictionPatches = sharedDesc->blockCurrentFrictionPatches;
	PxF32* baseWritebackForceBuffer = constraintPrepDesc->forceBuffer;
	PxgFrictionPatchGPU* frictionPatches = reinterpret_cast<PxgFrictionPatchGPU*>(constraintPrepDesc->frictionPatches);

	//__shared__ Dy::ThresholdStreamElement elems[PxgKernelBlockDim::WRITEBACK_BLOCKS];
	__shared__ PxU8 elemsMem[sizeof(Dy::ThresholdStreamElement)*PxgKernelBlockDim::WRITEBACK_BLOCKS];
	Dy::ThresholdStreamElement* elems = reinterpret_cast<Dy::ThresholdStreamElement*>(elemsMem);

	__shared__ PxI32 index[PxgKernelBlockDim::WRITEBACK_BLOCKS/warpSize];

	Dy::ThresholdStreamElement* startAddress = &elems[32*warpIndexInBlock];

	//for(uint k = startIndex + warpIndex; k < endIndex; k+=blockStride)
	uint k = startIndex + warpIndex;
	if(k < endIndex)
	{
		if(threadIndexInWarp == 0)
			index[warpIndexInBlock] = 0;

		__syncwarp();

		//Get block header
		const PxgBlockConstraintBatch& batch = batchHeaders[k];

		if(threadIndexInWarp < batch.mDescStride)
		{
			if(batch.constraintType==PxgSolverConstraintDesc::eCONTACT || batch.constraintType == PxgSolverConstraintDesc::eARTICULATION_CONTACT)
			{
				writeBackContactBlock(batch, threadIndexInWarp, solverBodyDatas, startAddress, &index[warpIndexInBlock],
					contactHeaders, frictionHeaders, contactPoints, frictions, baseWritebackForceBuffer,
					baseFrictionPatches[batch.mConstraintBatchIndex], frictionPatches);
			}
			else
			{
				//Do nothing (for now)
				writeBack1DBlock(batch, threadIndexInWarp, jointHeaders, jointRowsCon, jointRowsMod, constraintWriteBack);
			}
		}

		//__syncthreads();
		__syncwarp();

		PxI32 ind = index[warpIndexInBlock];

		if(ind > 0)
		{
			// Lane 0 reserves a contiguous range in the global threshold stream; the warp then
			// copies its staged elements into that range.
			__shared__ PxI32 startIndex[PxgKernelBlockDim::WRITEBACK_BLOCKS/warpSize];
			if(threadIndexInWarp == 0)
			{
				startIndex[warpIndexInBlock] = atomicAdd(&constraintPrepDesc->sharedThresholdStreamIndex, ind);
			}
			__syncwarp();
			if(threadIndexInWarp < ind)
			{
				//((float4*)constraintPrepDesc->thresholdStream)[startIndex[warpIndexInBlock]+threadIndexInWarp] = ((float4*)startAddress)[threadIndexInWarp];
				constraintPrepDesc->thresholdStream[startIndex[warpIndexInBlock]+threadIndexInWarp] = startAddress[threadIndexInWarp];
			}
		}
	}
}

// Finalizes constraints at the end of the solve (e.g. clamping/locking accumulated
// impulses) so the final position iterations see concluded values. One warp per batch.
extern "C" __global__ void concludeBlocks(PxgSolverCoreDesc* constraintPrepDesc,
	PxgSolverSharedDesc<IterativeSolveData>* sharedDesc, const PxU32 islandIndex)
{
	PxgIslandContext& island = constraintPrepDesc->islandContextPool[islandIndex];
	const PxU32 startIndex = island.mBatchStartIndex;
	const PxU32 endIndex = island.mBatchCount + island.mArtiBatchCount + island.mStaticArtiBatchCount
		+ island.mSelfArtiBatchCount + island.mStaticRigidBatchCount + startIndex;

	const uint warpSize = 32;

	//const uint blockStride = (blockDim.x * gridDim.x)/warpSize;

	//This identifies which warp a specific thread is in
	const uint warpIndex = (threadIdx.x + blockIdx.x * blockDim.x)/warpSize;

	//This identifies which thread within a warp a specific thread is
	const uint threadIndexInWarp = threadIdx.x&(warpSize-1);

	PxgBlockConstraintBatch* batchHeaders = sharedDesc->iterativeData.blockConstraintBatch;
	PxgBlockSolverConstraint1DHeader* jointHeaders = sharedDesc->iterativeData.blockJointConstraintHeaders;
	PxgBlockSolverConstraint1DMod* jointRowsMod = sharedDesc->iterativeData.blockJointConstraintRowsMod;

	PxgBlockSolverContactHeader* contactHeaders = sharedDesc->iterativeData.blockContactHeaders;
	PxgBlockSolverFrictionHeader* frictionHeaders = sharedDesc->iterativeData.blockFrictionHeaders;
	PxgBlockSolverContactPoint* contactPoints = sharedDesc->iterativeData.blockContactPoints;
	PxgBlockSolverContactFriction* frictions = sharedDesc->iterativeData.blockFrictions;

	//for(uint k = startIndex + warpIndex; k < endIndex; k+=blockStride)
	uint k = startIndex + warpIndex;
	if(k < endIndex)
	{
		//Get block header
		const PxgBlockConstraintBatch& batch = batchHeaders[k];

		if(threadIndexInWarp < batch.mDescStride)
		{
			if(batch.constraintType==PxgSolverConstraintDesc::eCONTACT || batch.constraintType == PxgSolverConstraintDesc::eARTICULATION_CONTACT)
			{
				concludeContactBlock(batch, threadIndexInWarp, contactHeaders, frictionHeaders, contactPoints, frictions);
			}
			else
			{
				conclude1DBlock(batch, threadIndexInWarp, jointHeaders, jointRowsMod);
			}
		}
	}
}

// Copies final solver velocities (at the delta-V output offset) into the motion-velocity
// array consumed by integration. Linear velocities first, angular after all linear.
extern "C" __global__ void writeBackBodies(const PxgSolverCoreDesc* constraintPrepDesc,
	const PxgSolverSharedDesc<IterativeSolveData>* sharedDesc, const PxU32 islandIndex)
{
	__shared__ float4* bodyVelocities;
	__shared__ float4* motionVelocities;
	__shared__ PxU32 bodyStartIndex;
	__shared__ PxU32 bodyEndIndex;
	__shared__ PxU32 totalBodyCount;
	__shared__ PxU32 outputOffsetIndex;

	if(threadIdx.x == 0)
	{
		PxgIslandContext& island = constraintPrepDesc->islandContextPool[islandIndex];
		bodyVelocities = sharedDesc->iterativeData.solverBodyVelPool;
		motionVelocities = constraintPrepDesc->motionVelocityArray;
		bodyStartIndex = island.mBodyStartIndex;
		totalBodyCount = constraintPrepDesc->numSolverBodies;
		bodyEndIndex = totalBodyCount;
		outputOffsetIndex = sharedDesc->deltaOutOffset;
	}

	__syncthreads();

	//const uint blockStride = (blockDim.x * gridDim.x);

	//This identifies which warp a specific thread is in
	const uint threadIndex = (threadIdx.x + blockIdx.x * blockDim.x);

	//for(uint a = threadIndex+bodyStartIndex; a < bodyEndIndex; a+=blockStride)
	uint a = threadIndex+bodyStartIndex;
	if(a < bodyEndIndex)
	{
		motionVelocities[a] = bodyVelocities[a + outputOffsetIndex]; //Linear velocity
		motionVelocities[a+totalBodyCount] = bodyVelocities[a + outputOffsetIndex + totalBodyCount]; //Angular velocity stored after all linear velocities
	}
}

// Averages the per-slab delta velocities of each body into a single velocity.
// numThreadsPerBody lanes (a power of two <= 32) cooperate per body: each lane
// accumulates a subset of slabs, a shfl_xor butterfly reduces within the sub-warp,
// and lane 0 divides by the active-slab reference count and resets it.
extern "C" __global__ void computeAverageSolverBodyVelocity(
	const PxgSolverCoreDesc* const PX_RESTRICT solverDesc,
	const PxgSolverSharedDesc<IterativeSolveData>* PX_RESTRICT sharedDesc)
{
	//we need to fill in the writeIndex for the solver body data
	const PxgSolverReferences* const PX_RESTRICT solverReferences = solverDesc->solverBodyReferences;
	float4* bodyVelocities = sharedDesc->iterativeData.solverBodyVelPool;
	PxU32* encodedReferenceCount = sharedDesc->iterativeData.solverEncodedReferenceCount;

	const PxU32 totalBodyCount = solverDesc->islandContextPool->mBodyCount; //nbBodies minus offset!
	const PxU32 bodyOffset = solverDesc->islandContextPool->mBodyStartIndex;

	const PxU32 numSlabs = solverDesc->numSlabs;
	const PxU32 numBatches = solverDesc->numBatches;
	const PxU32 numArticBatches = solverDesc->numArticBatches;
	const PxU32 deltaVOffset = solverDesc->accumulatedBodyDeltaVOffset;
	const PxU32 averageOutputOffsets = (numBatches + numArticBatches) * PXG_BATCH_SIZE * 2 * 2;

	const PxU32 numThreadsPerBody = PxMin(isPowerOfTwo(numSlabs) ? numSlabs : nextPowerOfTwo(numSlabs), 32u);
	const PxU32 numBodiesPerWarp = WARP_SIZE / numThreadsPerBody;

	const uint warpIndex = (blockIdx.x * blockDim.y + threadIdx.y);
	const uint warpBodyStartIndex = warpIndex * numBodiesPerWarp;

	const PxU32 threadIdInWorkUnit = threadIdx.x & (numThreadsPerBody - 1);
	const PxU32 subWarpIndex = threadIdx.x / numThreadsPerBody;
	const PxU32 bodyInWarp = threadIdx.x / numThreadsPerBody;

	// Per-sub-warp participation mask for the shuffle reduction below.
	const PxU32 maskHigh = ((1 << ((subWarpIndex + 1) * numThreadsPerBody)) - 1);
	const PxU32 maskLow = ((1 << ((subWarpIndex)*numThreadsPerBody)) - 1);
	const PxU32 mask = maskHigh - maskLow;

	const uint bodyId = warpBodyStartIndex + bodyInWarp;

	if (bodyId < totalBodyCount)
	{
		float4 linDelta = make_float4(0.f);
		float4 angDelta = make_float4(0.f);

		const PxU32 outputBody = deltaVOffset + bodyId + bodyOffset;

		//Store linear and angular velocity!!!
		const float4 lastLinearVel = bodyVelocities[outputBody];
		const float4 lastAngularVel = bodyVelocities[outputBody + totalBodyCount + bodyOffset];

		bool hasRef = false;

		//get out velocity for the correponding body in a slab and put on weight to calculate the final velocity for that body
		for (uint b = threadIdInWorkUnit; b < numSlabs; b += numThreadsPerBody)
		{
			const uint bodyIndex = bodyId * numSlabs + b;

			PxgSolverReferences reference = solverReferences[bodyIndex];
			PxU32 remappedBodyIndex = reference.mRemappedBodyIndex;

			if (remappedBodyIndex != 0xFFFFFFFF)
			{
				//Outputs are grouped into little clusters of 32...
				const uint velIndex = averageOutputOffsets + ComputeAverageBodyBatchStartIndex(bodyIndex);
				const float4 curLinearDeltaV = bodyVelocities[velIndex];
				const float4 curAngularDeltaV = bodyVelocities[velIndex + 32];

				float4 lDelta = (curLinearDeltaV - lastLinearVel);
				float4 aDelta = (curAngularDeltaV - lastAngularVel);
				{
					linDelta += lDelta;
					angDelta += aDelta;
				}
				hasRef = true;
			}
		}

		//Now do the reduction...
#pragma unroll
		for (PxU32 reductionRadius = numThreadsPerBody >> 1; reductionRadius > 0; reductionRadius >>= 1)
		{
			linDelta.x += __shfl_xor_sync(mask, linDelta.x, reductionRadius, numThreadsPerBody);
			linDelta.y += __shfl_xor_sync(mask, linDelta.y, reductionRadius, numThreadsPerBody);
			linDelta.z += __shfl_xor_sync(mask, linDelta.z, reductionRadius, numThreadsPerBody);
			angDelta.x += __shfl_xor_sync(mask, angDelta.x, reductionRadius, numThreadsPerBody);
			angDelta.y += __shfl_xor_sync(mask, angDelta.y, reductionRadius, numThreadsPerBody);
			angDelta.z += __shfl_xor_sync(mask, angDelta.z, reductionRadius, numThreadsPerBody);
		}

		// True if any lane of this body's sub-warp saw a valid slab reference.
		hasRef = (__ballot_sync(mask, hasRef) & mask);

		if (hasRef)
		{
			if (threadIdInWorkUnit == 0)
			{
				// Counting the number of active slabs
				const PxU32 numDynamicBodies = solverDesc->islandContextPool->mBodyCount; //nbBodies minus offset!
				const PxU32 numArticulations = solverDesc->islandContextPool->mArticulationCount;
				const PxU32 numTotalBodies = bodyOffset + numDynamicBodies + numArticulations;
				const PxU32 referenceCount = countActiveSlabs(bodyOffset + bodyId, solverDesc->numSlabs,
					numTotalBodies, encodedReferenceCount);

				const PxReal recipRefs = 1.f / static_cast<PxReal>(referenceCount);
				linDelta = linDelta * recipRefs;
				angDelta = angDelta * recipRefs;

				// Resetting rigid body reference count
				resetSlabCount(bodyOffset + bodyId, solverDesc->numSlabs, numTotalBodies, encodedReferenceCount);

				//Store linear and angular velocity!!!
				bodyVelocities[outputBody] = lastLinearVel + linDelta;
				bodyVelocities[outputBody + totalBodyCount + bodyOffset] = lastAngularVel + angDelta;
			}
		}
	}
}

// Propagates averaged body velocities back to each slab's velocity slots.
// NOTE(review): definition continues beyond this chunk — body is incomplete here.
extern "C" __global__ void propagateSolverBodyVelocity(
	const PxgSolverCoreDesc* solverDesc,
	const PxgSolverSharedDesc<IterativeSolveData>* sharedDesc)
{
	//we need to fill in the writeIndex for the solver body data
	PxgSolverReferences* solverReferences = solverDesc->solverBodyReferences;
	float4* bodyVelocities = sharedDesc->iterativeData.solverBodyVelPool;

	const PxU32 totalBodyCount = solverDesc->islandContextPool->mBodyCount; //nbBodies minus offset!
	const PxU32 bodyOffset = solverDesc->islandContextPool->mBodyStartIndex; //nbBodies minus offset!

	const PxU32 numSlabs = solverDesc->numSlabs;
	const PxU32 deltaVOffset = solverDesc->accumulatedBodyDeltaVOffset;

	//This identifies which warp a specific thread is in
	const uint threadIndex = (threadIdx.x + blockIdx.x * blockDim.x);

	const PxU32 numThreadsPerBody = PxMin(isPowerOfTwo(numSlabs) ? numSlabs : nextPowerOfTwo(numSlabs), 32u);
	const PxU32 numBodiesPerWarp = WARP_SIZE / numThreadsPerBody;

	const PxU32 warpIndex = threadIndex / WARP_SIZE;
	const uint warpBodyStartIndex = warpIndex * numBodiesPerWarp;

	const PxU32 threadIdInWorkUnit = threadIdx.x&(numThreadsPerBody - 1);
	const PxU32 bodyInWarp = (threadIdx.x&31) / numThreadsPerBody;

	const uint bodyId = warpBodyStartIndex + bodyInWarp;

	if(bodyId < totalBodyCount)
	{
		const PxU32 outputBody = deltaVOffset + bodyId + bodyOffset;
		//Store linear and angular velocity!!!
float4 linVel = bodyVelocities[outputBody]; float4 angVel = bodyVelocities[outputBody + totalBodyCount + bodyOffset]; linVel.w = 0.f; for(uint b=threadIdInWorkUnit; bforceChangeThresholdElements; PxU32 nbElemsChanges = solverDesc->nbForceChangeElements; PxU32 nbThreadsRequired = (sizeof(Dy::ThresholdStreamElement) * nbElemsChanges)/sizeof(PxU32); PxU32* src = reinterpret_cast(changeElems); PxU32* dst = reinterpret_cast(hostChangedElems); PxU32 globalThreadIdx = threadIdx.x + blockIdx.x * blockDim.x; for(PxU32 i = globalThreadIdx; i < nbThreadsRequired; i+= blockDim.x * gridDim.x) { dst[i] = src[i]; } } extern "C" __global__ void dmaConstraintResidual(const PxgConstraintWriteback* writebacks, PxReal* residuals, PxU32 count) { PxU32 globalThreadIdx = threadIdx.x + blockIdx.x * blockDim.x; if (globalThreadIdx < count) { residuals[globalThreadIdx] = writebacks[globalThreadIdx].angularImpulse_residual.w; } } extern "C" __global__ //__launch_bounds__(PxgKernelBlockDim::SOLVE_BLOCK_PARTITION, 16) void solveStaticBlock( PxgSolverCoreDesc* PX_RESTRICT solverDesc, const PxgSolverSharedDesc* PX_RESTRICT sharedDesc, const PxU32 islandIndex, const PxU32 nbStaticSlabs, const PxU32 maxStaticPartitions, bool doFriction) { const PxgIslandContext& island = solverDesc->islandContextPool[islandIndex]; const IterativeSolveData& iterativeData = sharedDesc->iterativeData; float4* PX_RESTRICT bodyVelocities = iterativeData.solverBodyVelPool; float4* PX_RESTRICT bodyOutVelocities = iterativeData.tempStaticBodyOutputPool; const PxU32 numDynamicBodies = island.mBodyCount; //nbBodies minus offset! 
const PxU32 bodyOffset = island.mBodyStartIndex; const PxU32 deltaVOffset = solverDesc->accumulatedBodyDeltaVOffset; const PxU32 totalBodiesIncKinematics = numDynamicBodies + bodyOffset; const PxU32 numDynamicBodiesRounded = (island.mBodyCount + 31)&(~31); //Rounded to a multiple of 32 const uint warpSize = 32; //This identifies which warp a specific thread is in const uint globalThreadIdx = (threadIdx.x + blockIdx.x * blockDim.x); //This identifies which thread within a warp a specific thread is const uint threadIndexInWarp = threadIdx.x&(warpSize - 1); const PxU32 nbDynamicBodiesToSolve = numDynamicBodiesRounded * nbStaticSlabs; const uint bodyIndex = globalThreadIdx % numDynamicBodiesRounded; const PxU32 slabIdx = (globalThreadIdx / numDynamicBodiesRounded); const PxU32 startIndex = slabIdx * maxStaticPartitions; bool residualAccumulationEnabled = solverDesc->contactErrorAccumulator.mCounter >= 0; PxgErrorAccumulator error; if (globalThreadIdx < nbDynamicBodiesToSolve) { if (bodyIndex < numDynamicBodies) { PxU32 contactCount = PxMin(maxStaticPartitions, PxMax(startIndex, solverDesc->mRigidStaticContactCounts[bodyIndex]) - startIndex); PxU32 jointCount = PxMin(maxStaticPartitions, PxMax(startIndex, solverDesc->mRigidStaticJointCounts[bodyIndex]) - startIndex); const PxU32 outputBody = slabIdx * totalBodiesIncKinematics * 2; //printf("BodyIndex = %i, globalThreadIdx = %i, nbDynamicBodiesToSolve = %i, nbStaticSlabs = %i, contactCount = %i, startIndex = %i, outputBody = %i\n", bodyIndex, globalThreadIdx, // nbDynamicBodiesToSolve, nbStaticSlabs, contactCount, startIndex, outputBody); if (contactCount != 0 || jointCount != 0) { //We have some constraints to solve... 
const PxU32 startContactIndex = solverDesc->mRigidStaticContactStartIndices[bodyIndex] + startIndex; const PxU32 startJointIndex = solverDesc->mRigidStaticJointStartIndices[bodyIndex] + startIndex; assert(startContactIndex >= solverDesc->numBatches); const PxU32 inputBody = deltaVOffset + bodyIndex + bodyOffset; //Load in velocity data... float4 linVel = bodyVelocities[inputBody]; float4 angVel = bodyVelocities[inputBody + totalBodiesIncKinematics]; PxVec3 lv0(linVel.x, linVel.y, linVel.z); PxVec3 lv1(0.f); PxVec3 av0(angVel.x, angVel.y, angVel.z); PxVec3 av1(0.f); for (PxU32 i = 0; i < jointCount; ++i) { const PxgBlockConstraintBatch& batch = iterativeData.blockConstraintBatch[startJointIndex + i]; assert(batch.constraintType == PxgSolverConstraintDesc::eCONSTRAINT_1D); PxU32 idx = warpScanExclusive(batch.mask, threadIndexInWarp); // For interaction with static objects, mass-splitting is not used; thus, reference counts are 1. solve1DBlock(batch, lv0, av0, lv1, av1, idx, iterativeData.blockJointConstraintHeaders, iterativeData.blockJointConstraintRowsCon, iterativeData.blockJointConstraintRowsMod, solverDesc->contactErrorAccumulator.mCounter >= 0, 1.f, 1.f); } for (PxU32 i = 0; i < contactCount; ++i) { const PxgBlockConstraintBatch& batch = iterativeData.blockConstraintBatch[startContactIndex + i]; assert(batch.constraintType == PxgSolverConstraintDesc::eCONTACT); PxU32 idx = warpScanExclusive(batch.mask, threadIndexInWarp); // For interaction with static objects, mass-splitting is not used; thus, reference counts are 1. solveContactBlock(batch, lv0, av0, lv1, av1, doFriction, idx, iterativeData.blockContactHeaders, iterativeData.blockFrictionHeaders, iterativeData.blockContactPoints, iterativeData.blockFrictions, residualAccumulationEnabled ? 
&error : NULL, 1.f, 1.f); } //if (globalThreadIdx == 33) ////if(startContactIndex == (solverDesc->numBatches)) ////if(warpScanExclusive(iterativeData.blockConstraintBatch[startContactIndex].mask, threadIndexInWarp) == 0) //{ // printf("%i: NumContacts = %i, beforeVel = (%f, %f, %f), afterlinVel = (%f, %f, %f), lv1 (%f, %f, %f), av1(%f, %f, %f), startContactIndex = %i, numBatches = %i\n", // globalThreadIdx, contactCount, linVel.x, linVel.y, linVel.z, lv0.x, lv0.y, lv0.z, lv1.x, lv1.y, lv1.z, av1.x, av1.y, av1.z, startContactIndex, solverDesc->numBatches); //} linVel.x = lv0.x; linVel.y = lv0.y; linVel.z = lv0.z; angVel.x = av0.x; angVel.y = av0.y; angVel.z = av0.z; /*printf("%i: BodyOutVelocities[%i] = (%f, %f, %f, %f), bodyOutVelocities[%i] = (%f, %f, %f, %f)\n", globalThreadIdx, bodyIndex + outputBody, linVel.x, linVel.y, linVel.z, linVel.w, bodyIndex + outputBody + totalBodiesIncKinematics, angVel.x, angVel.y, angVel.z, angVel.w);*/ bodyOutVelocities[bodyIndex + outputBody] = linVel; bodyOutVelocities[bodyIndex + outputBody + totalBodiesIncKinematics] = angVel; } } } if (residualAccumulationEnabled) { error.accumulateErrorGlobalFullWarp(solverDesc->contactErrorAccumulator, threadIndexInWarp); } } extern "C" __global__ //__launch_bounds__(PxgKernelBlockDim::SOLVE_BLOCK_PARTITION, 16) void propagateStaticSolverBodyVelocities( const PxgSolverCoreDesc* PX_RESTRICT solverDesc, const PxgSolverSharedDesc* PX_RESTRICT sharedDesc, const PxU32 islandIndex, const PxU32 nbStaticSlabs, const PxU32 maxStaticPartitions) { const PxgIslandContext& island = solverDesc->islandContextPool[islandIndex]; const IterativeSolveData& iterativeData = sharedDesc->iterativeData; float4* PX_RESTRICT bodyVelocities = iterativeData.solverBodyVelPool; float4* PX_RESTRICT bodyOutVelocities = iterativeData.tempStaticBodyOutputPool; const PxU32 numDynamicBodies = island.mBodyCount; //nbBodies minus offset! 
const PxU32 bodyOffset = island.mBodyStartIndex; const PxU32 deltaVOffset = solverDesc->accumulatedBodyDeltaVOffset; const PxU32 totalBodiesIncKinematics = numDynamicBodies + bodyOffset; //This identifies which warp a specific thread is in const uint globalThreadIdx = (threadIdx.x + blockIdx.x * blockDim.x); PxU32 contactCount = 0, jointCount = 0; if (globalThreadIdx < numDynamicBodies) { contactCount = solverDesc->mRigidStaticContactCounts[globalThreadIdx]; jointCount = solverDesc->mRigidStaticJointCounts[globalThreadIdx]; if (contactCount || jointCount) { const PxU32 maxOutputs = PxMax(contactCount, jointCount); //We have velocity changes we need to propagate! const PxU32 outputBody = deltaVOffset + globalThreadIdx + bodyOffset; PxReal scale = 0.f; float4 vel0 = make_float4(0.f); float4 vel1 = make_float4(0.f); for (PxU32 i = 0, index = globalThreadIdx; i < maxOutputs; i += maxStaticPartitions, index += totalBodiesIncKinematics * 2, scale += 1.0f) { //We have velocity changes we need to propagate! vel0 += bodyOutVelocities[index]; vel1 += bodyOutVelocities[index + totalBodiesIncKinematics]; } scale = 1.f / scale; /*printf("BodyOutVelocities[%i] = (%f, %f, %f, %f), bodyOutVelocities[%i] = (%f, %f, %f, %f), scale = %f\n", globalThreadIdx + bodyOffset, vel0.x, vel0.y, vel0.z, vel0.w, globalThreadIdx + bodyOffset + totalBodiesIncKinematics, vel1.x, vel1.y, vel1.z, vel1.w, scale);*/ bodyVelocities[outputBody] = vel0* scale; bodyVelocities[outputBody + totalBodiesIncKinematics] = vel1* scale; } } }