// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "foundation/PxPreprocessor.h"
|
|
|
|
#include "foundation/PxSimpleTypes.h"
|
|
#include "PxgSolverCoreDesc.h"
|
|
#include "PxgRadixSortDesc.h"
|
|
#include "DyThresholdTable.h"
|
|
#include "RadixSort.cuh"
|
|
#include "PxgRadixSortDesc.h"
|
|
#include "PxgCommonDefines.h"
|
|
|
|
#include "reduction.cuh"
|
|
#include <stdio.h>
|
|
#include "PxgSolverKernelIndices.h"
|
|
|
|
using namespace physx;
|
|
|
|
extern "C" __host__ void initSolverKernels0() {}
|
|
|
|
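
// Overview (summarizing the kernels below): the per-contact threshold stream
// is radix sorted by nodeIndexB and then by nodeIndexA so that elements of
// the same body pair become contiguous; a segmented prefix sum accumulates
// each pair's normal forces; pairs whose accumulated force exceeds
// threshold * dt are compacted into exceededForceElements; finally the
// current and previous frames' exceeded pairs are compared to emit lost,
// found and persistent force-change events.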
extern "C" __global__ void bodyInputAndRanksSingleBlockLaunch(const PxgSolverCoreDesc* solverDesc, const PxgRadixSortDesc* desc, const PxU32 gStartBit)
|
|
{
|
|
uint4* gInputKeys = reinterpret_cast<uint4*>(desc->inputKeys);
|
|
uint4* gInputRanks = reinterpret_cast<uint4*>(desc->inputRanks);
|
|
PxU32* gRadixCount = desc->radixBlockCounts;
|
|
const PxU32 gNumOfKeys = solverDesc->sharedThresholdStreamIndex;
|
|
|
|
radixSortSingleBlock<PxgKernelBlockDim::RADIXSORT/WARP_SIZE>(gInputKeys, gInputRanks, gNumOfKeys, gStartBit, gRadixCount);
|
|
}
|
|
|
|
extern "C" __global__ void bodyInputAndRanksBlocksLaunch(const PxgSolverCoreDesc* solverDesc, PxgRadixSortDesc* desc, const PxU32 gStartBit)
|
|
{
|
|
uint4* gInputKeys = reinterpret_cast<uint4*>(desc->inputKeys);
|
|
uint4* gInputRanks = reinterpret_cast<uint4*>(desc->inputRanks);
|
|
PxU32* gOutputKeys = desc->outputKeys;
|
|
PxU32* gOutputRanks = desc->outputRanks;
|
|
|
|
PxU32* gRadixCount = desc->radixBlockCounts;
|
|
const PxU32 gNumOfKeys = solverDesc->sharedThresholdStreamIndex;
|
|
|
|
radixSortCalculateRanks<PxgKernelBlockDim::RADIXSORT / WARP_SIZE>( gInputKeys, gInputRanks, gNumOfKeys, gStartBit, gRadixCount, gOutputKeys, gOutputRanks);
|
|
}
|
|
|
|
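// Usage sketch (an assumption based on the gStartBit parameter; the host-side
// driver is not part of this file): a full 32-bit LSD radix sort of the body
// index keys would run the two kernels above once per digit, with the
// key/rank buffers in the descriptor ping-ponged between passes, e.g.
//
//   for(PxU32 startBit = 0; startBit < 32; startBit += RADIX_BITS)
//   {
//       // histogram pass over the current keys
//       // rank/scatter pass writing outputKeys/outputRanks
//   }
//
// where RADIX_BITS is whatever digit width RadixSort.cuh implements.
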
extern "C" __global__ void initialRanksAndBodyIndexB(const PxgSolverCoreDesc* solverDesc, const PxgRadixSortDesc* rsDesc)
|
|
{
|
|
Dy::ThresholdStreamElement* thresholdStream = solverDesc->thresholdStream;
|
|
const PxU32 nbThresholdElements = solverDesc->sharedThresholdStreamIndex;
|
|
|
|
PxU32* gInputKeys = rsDesc->inputKeys;
|
|
PxU32* gInputRanks = rsDesc->inputRanks;
|
|
|
|
const PxU32 globalThreadIndex = threadIdx.x + blockDim.x*blockIdx.x;
|
|
|
|
for(PxU32 i=globalThreadIndex; i<nbThresholdElements; i+=blockDim.x*gridDim.x)
|
|
{
|
|
Dy::ThresholdStreamElement& elements = thresholdStream[i];
|
|
gInputKeys[i] = elements.nodeIndexB.index();
|
|
gInputRanks[i] = i;
|
|
}
|
|
|
|
//we need to pad the handles to the multiply of 4 for the radix sort
|
|
const PxU32 remainingThresholdElements = (4 - (nbThresholdElements & 3)) & 3;
|
|
|
|
for(PxU32 i=globalThreadIndex; i < remainingThresholdElements; i+=blockDim.x*gridDim.x)
|
|
{
|
|
const PxU32 index = i + nbThresholdElements;
|
|
gInputKeys[index] = 0xffffffff;
|
|
gInputRanks[index] = index;
|
|
}
|
|
|
|
}
|
|
|
|
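// Both padding loops (above and in initialRanksAndBodyIndexA below) use the
// identity that (4 - (n & 3)) & 3 is the count needed to round n up to a
// multiple of 4: n == 5 gives 3 pad slots (indices 5..7), n == 8 gives 0.
// Pad keys are 0xffffffff so they sort to the end of the stream. A tiny
// illustrative helper capturing the rule (nothing in this file calls it):
static __host__ __device__ PxU32 computeRadixSortPadding(PxU32 n)
{
	//number of sentinel keys required to reach the next multiple of 4
	return (4 - (n & 3)) & 3;
}
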
extern "C" __global__ void initialRanksAndBodyIndexA(const PxgSolverCoreDesc* solverDesc, const PxgRadixSortDesc* rsDesc)
|
|
{
|
|
Dy::ThresholdStreamElement* thresholdStream = solverDesc->thresholdStream;
|
|
const PxU32 nbThresholdElements = solverDesc->sharedThresholdStreamIndex;
|
|
|
|
//we need to use the inputRanks from the bodyAIndex to reorganize the threshold stream
|
|
PxU32* gInputKeys = rsDesc->inputKeys;
|
|
PxU32* gInputRanks = rsDesc->inputRanks;
|
|
|
|
const PxU32 globalThreadIndex = threadIdx.x + blockDim.x*blockIdx.x;
|
|
|
|
for(PxU32 i=globalThreadIndex; i<nbThresholdElements; i+=blockDim.x*gridDim.x)
|
|
{
|
|
Dy::ThresholdStreamElement& elements = thresholdStream[gInputRanks[i]];
|
|
gInputKeys[i] = elements.nodeIndexA.index();
|
|
}
|
|
|
|
//we need to pad the handles to the multiply of 4 for the radix sort
|
|
const PxU32 remainingThresholdElements = (4 - (nbThresholdElements & 3)) & 3;
|
|
|
|
for(PxU32 i=globalThreadIndex; i < remainingThresholdElements; i+=blockDim.x*gridDim.x)
|
|
{
|
|
const PxU32 index = i + nbThresholdElements;
|
|
gInputKeys[index] = 0xffffffff;
|
|
}
|
|
|
|
}
|
|
|
|
extern "C" __global__ void reorganizeThresholdElements(const PxgSolverCoreDesc* solverDesc, const PxgRadixSortDesc* rsDesc)
|
|
{
|
|
Dy::ThresholdStreamElement* thresholdStream = solverDesc->thresholdStream;
|
|
Dy::ThresholdStreamElement* tmpThresholdStream = solverDesc->tmpThresholdStream;
|
|
|
|
const PxU32 nbThresholdElements = solverDesc->sharedThresholdStreamIndex;
|
|
|
|
const PxU32* gInputRanks = rsDesc->inputRanks;
|
|
PxU32 globalThreadIdx = threadIdx.x + blockIdx.x * blockDim.x;
|
|
|
|
for(PxU32 i = globalThreadIdx/8; i < nbThresholdElements; i+=((blockDim.x*gridDim.x)/8))
|
|
{
|
|
PxU32* dest = reinterpret_cast<PxU32*>(thresholdStream + i);
|
|
PxU32* src = reinterpret_cast<PxU32*>(tmpThresholdStream + gInputRanks[i]);
|
|
|
|
dest[threadIdx.x&7] = src[threadIdx.x&7];
|
|
}
|
|
|
|
}
|
|
|
|
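// Copy-pattern sketch for reorganizeThresholdElements: threads 0..7 move
// element 0, threads 8..15 move element 1, and so on, so consecutive threads
// write consecutive 32-bit words and the stores coalesce. This relies on
// sizeof(Dy::ThresholdStreamElement) being exactly 8 * sizeof(PxU32); a guard
// such as
//   PX_COMPILE_TIME_ASSERT(sizeof(Dy::ThresholdStreamElement) == 8 * sizeof(PxU32));
// would make that assumption explicit.
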
extern "C" __global__ void computeAccumulateThresholdStream(PxgSolverCoreDesc* solverDesc)
|
|
{
|
|
const PxU32 nbBlocks = PxgKernelGridDim::COMPUTE_ACCUMULATED_THRESHOLDSTREAM;
|
|
PX_COMPILE_TIME_ASSERT(nbBlocks == 32);
|
|
|
|
const PxU32 WARP_PERBLOCK_SIZE = PxgKernelBlockDim::COMPUTE_ACCUMULATED_THRESHOLDSTREAM / WARP_SIZE;
|
|
|
|
const PxU32 LOG2_WARP_PERBLOCK_SIZE = 3;
|
|
|
|
assert(WARP_PERBLOCK_SIZE == (1 << LOG2_WARP_PERBLOCK_SIZE));
|
|
|
|
__shared__ PxReal sWarpAccumulator[WARP_PERBLOCK_SIZE];
|
|
__shared__ PxReal sBlockAccumulator;
|
|
|
|
__shared__ PxU32 sWarpPairsAccumulator[WARP_PERBLOCK_SIZE];
|
|
__shared__ PxU32 sBlockPairsAccumulator;
|
|
|
|
//Each body can be made of multiple shapes, therefore, we need to accumulated difference forces from the shapes to the body pairs. In this case, we will have thresholdStreams have
|
|
//same bodyAIndex and bodyBIndex
|
|
|
|
//The threshold stream has been sorted based on the bodyAIndex and bodyBIndex, therefore, if pairs have the same bodyAIndex and bodyBIndex, they will laied in continuously memory
|
|
Dy::ThresholdStreamElement* gThresholdStream = solverDesc->thresholdStream;
|
|
|
|
PxReal* gThresholdStreamAccumulatedForce = solverDesc->thresholdStreamAccumulatedForce;
|
|
PxReal* gThresholdStreamAccumulatedForceBetweenBlocks = solverDesc->thresholdStreamAccumulatedForceBetweenBlocks;
|
|
|
|
PxU32* gThresholdStreamWriteIndex = solverDesc->thresholdStreamWriteIndex;
|
|
PxU32* gThresholdStreamWriteIndexBetweenBlocks = solverDesc->thresholdStreamWriteIndexBetweenBlocks;
|
|
|
|
bool* gThresholdStreamWriteable = solverDesc->thresholdStreamWriteable;
|
|
|
|
const PxU32 nbThresholdElements = solverDesc->sharedThresholdStreamIndex;
|
|
|
|
const PxU32 totalBlockRequired = (nbThresholdElements + (blockDim.x-1))/ blockDim.x;
|
|
|
|
const PxU32 numIterationPerBlock = (totalBlockRequired + (nbBlocks-1))/ nbBlocks;
|
|
|
|
const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE-1);
|
|
const PxU32 warpIndex = threadIdx.x/(WARP_SIZE);
|
|
|
|
const PxU32 idx = threadIdx.x;
|
|
|
|
if(threadIdx.x == 0)
|
|
{
|
|
sBlockAccumulator = 0;
|
|
sBlockPairsAccumulator = 0;
|
|
}
|
|
|
|
__syncthreads();
|
|
|
|
for(PxU32 i=0; i<numIterationPerBlock; ++i)
|
|
{
|
|
const PxU32 workIndex = idx + i*blockDim.x + blockDim.x * blockIdx.x *numIterationPerBlock;
|
|
|
|
PxReal val = 0.f;
|
|
bool isNewPair = false;
|
|
PxNodeIndex nodeIndexA(PX_INVALID_NODE);
|
|
PxNodeIndex nodeIndexB(PX_INVALID_NODE);
|
|
|
|
if(workIndex < nbThresholdElements)
|
|
{
|
|
val = gThresholdStream[workIndex].normalForce;
|
|
nodeIndexA = gThresholdStream[workIndex].nodeIndexA;
|
|
nodeIndexB = gThresholdStream[workIndex].nodeIndexB;
|
|
|
|
if(workIndex+1 < nbThresholdElements)
|
|
{
|
|
Dy::ThresholdStreamElement& nElement = gThresholdStream[workIndex+1];
|
|
if(!(nodeIndexA == nElement.nodeIndexA && nodeIndexB == nElement.nodeIndexB))
|
|
{
|
|
isNewPair = true;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
isNewPair = true;
|
|
}
|
|
}
|
|
|
|
//warpScan is inclusive add but the accumVal is exclusive add result
|
|
const PxReal accumVal = warpScan<AddOpPxReal, PxReal>(FULL_MASK, val) - val;
|
|
|
|
const PxU32 threadMask = (1<<threadIndexInWarp)-1;
|
|
|
|
const PxU32 accumPairs = __popc(__ballot_sync(FULL_MASK, isNewPair)&threadMask);
|
|
|
|
if(threadIndexInWarp == (WARP_SIZE-1))
|
|
{
|
|
sWarpAccumulator[warpIndex] = accumVal + val;
|
|
sWarpPairsAccumulator[warpIndex] = accumPairs + isNewPair;
|
|
}
|
|
|
|
const PxReal prevBlockAccumulator = sBlockAccumulator;
|
|
const PxU32 prevsBlockPairsAccumulator = sBlockPairsAccumulator;
|
|
|
|
__syncthreads();
|
|
|
|
unsigned mask_idx = __ballot_sync(FULL_MASK, idx < WARP_PERBLOCK_SIZE);
|
|
if(idx < WARP_PERBLOCK_SIZE)
|
|
{
|
|
PxReal forceVal = sWarpAccumulator[idx];
|
|
|
|
const PxReal accumulatedForce = warpScan<AddOpPxReal, PxReal, LOG2_WARP_PERBLOCK_SIZE>(mask_idx, forceVal) - forceVal;
|
|
sWarpAccumulator[idx] = accumulatedForce;
|
|
|
|
PxU32 pairVal = sWarpPairsAccumulator[idx];
|
|
const PxU32 accumulatedPairs = warpScan<AddOpPxU32, PxU32, LOG2_WARP_PERBLOCK_SIZE>(mask_idx, pairVal) - pairVal;
|
|
sWarpPairsAccumulator[idx] = accumulatedPairs;
|
|
|
|
if(threadIndexInWarp == (WARP_PERBLOCK_SIZE-1))
|
|
{
|
|
sBlockAccumulator += (accumulatedForce + forceVal);
|
|
sBlockPairsAccumulator +=(accumulatedPairs + pairVal);
|
|
}
|
|
|
|
}
|
|
|
|
__syncthreads();
|
|
|
|
if(workIndex < nbThresholdElements)
|
|
{
|
|
//accumVal is exclusive result within a warp and sWarpAccumulator is the exclusive result within a block
|
|
gThresholdStreamAccumulatedForce[workIndex] = val + accumVal + prevBlockAccumulator + sWarpAccumulator[warpIndex]; //this is inclusive
|
|
gThresholdStreamWriteIndex[workIndex] = accumPairs + prevsBlockPairsAccumulator + sWarpPairsAccumulator[warpIndex];
|
|
gThresholdStreamWriteable[workIndex] = isNewPair;
|
|
}
|
|
|
|
}
|
|
|
|
if(threadIdx.x == 0)
|
|
{
|
|
gThresholdStreamAccumulatedForceBetweenBlocks[blockIdx.x] = sBlockAccumulator;
|
|
gThresholdStreamWriteIndexBetweenBlocks[blockIdx.x] = sBlockPairsAccumulator;
|
|
}
|
|
|
|
}
|
|
|
|
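// Scan sketch: every prefix sum in this file derives an exclusive result from
// the inclusive warpScan by subtracting the lane's own contribution. Worked
// example with forces {3, 1, 4} in consecutive lanes: inclusive scan gives
// {3, 4, 8}; subtracting each lane's value gives the exclusive {0, 3, 4}.
// An illustrative helper equivalent to the inline pattern used above (nothing
// in this file calls it):
static __device__ PxReal exclusiveWarpPrefixSum(unsigned mask, PxReal v)
{
	//warpScan is inclusive; removing the lane's own value makes it exclusive
	return warpScan<AddOpPxReal, PxReal>(mask, v) - v;
}
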
extern "C" __global__ void outputAccumulateThresholdStream(PxgSolverCoreDesc* solverDesc)
|
|
{
|
|
const PxU32 nbBlocks = PxgKernelGridDim::OUTPUT_ACCUMULATED_THRESHOLDSTREAM;
|
|
PX_COMPILE_TIME_ASSERT(nbBlocks == 32);
|
|
|
|
PxReal* gThresholdStreamAccumulatedForce = solverDesc->thresholdStreamAccumulatedForce;
|
|
PxReal* gThresholdStreamAccumulatedForceBetweenBlocks = solverDesc->thresholdStreamAccumulatedForceBetweenBlocks;
|
|
|
|
PxU32* gThresholdStreamWriteIndex = solverDesc->thresholdStreamWriteIndex;
|
|
PxU32* gThresholdStreamWriteIndexBetweenBlocks = solverDesc->thresholdStreamWriteIndexBetweenBlocks;
|
|
|
|
|
|
const PxU32 nbThresholdElements = solverDesc->sharedThresholdStreamIndex;
|
|
|
|
__shared__ PxReal sBlockForceAccum[nbBlocks];
|
|
__shared__ PxU32 sBlockWriteIndexAccum[nbBlocks];
|
|
|
|
const PxU32 idx = threadIdx.x;
|
|
// const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE-1);
|
|
|
|
PxReal val = 0;
|
|
PxReal res = 0;
|
|
PxU32 pairIndex = 0;
|
|
PxU32 pairRes = 0;
|
|
|
|
unsigned mask_idx = __ballot_sync(FULL_MASK, idx < nbBlocks);
|
|
if(idx < nbBlocks)
|
|
{
|
|
val = gThresholdStreamAccumulatedForceBetweenBlocks[idx];
|
|
pairIndex = gThresholdStreamWriteIndexBetweenBlocks[idx];
|
|
|
|
res = warpScan<AddOpPxReal, PxReal>(mask_idx, val) - val;
|
|
pairRes = warpScan<AddOpPxU32, PxU32>(mask_idx, pairIndex) - pairIndex;
|
|
|
|
sBlockForceAccum[idx] = res;
|
|
sBlockWriteIndexAccum[idx] = pairRes;
|
|
}
|
|
|
|
__syncthreads();
|
|
|
|
const PxU32 totalBlockRequired = (nbThresholdElements + (blockDim.x-1))/ blockDim.x;
|
|
|
|
const PxU32 numIterationPerBlock = (totalBlockRequired + (nbBlocks-1))/ nbBlocks;
|
|
|
|
const PxReal blockForceAccum = sBlockForceAccum[blockIdx.x];
|
|
|
|
const PxU32 blockWriteIndexAccum = sBlockWriteIndexAccum[blockIdx.x];
|
|
|
|
//accumulate normal force between blocks
|
|
for(PxU32 i=0; i<numIterationPerBlock; ++i)
|
|
{
|
|
const PxU32 workIndex = i * blockDim.x + idx + numIterationPerBlock * blockIdx.x * blockDim.x;
|
|
|
|
if(workIndex < nbThresholdElements)
|
|
{
|
|
gThresholdStreamWriteIndex[workIndex] = gThresholdStreamWriteIndex[workIndex] + blockWriteIndexAccum;
|
|
|
|
gThresholdStreamAccumulatedForce[workIndex] = gThresholdStreamAccumulatedForce[workIndex] + blockForceAccum;
|
|
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
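// Fix-up sketch for outputAccumulateThresholdStream: if
// computeAccumulateThresholdStream produced per-block totals {10, 7, 5}, the
// warp scan above yields exclusive block offsets {0, 10, 17}; every element
// of blocks 0, 1 and 2 then adds its block's offset to its block-local
// prefix, completing the two-level global scan.
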
extern "C" __global__ void writeoutAccumulatedForcePerObject(PxgSolverCoreDesc* solverDesc)
|
|
{
|
|
|
|
PxReal* gAccumulatedForces = solverDesc->thresholdStreamAccumulatedForce;
|
|
PxU32* gThresholdStreamWriteIndex = solverDesc->thresholdStreamWriteIndex;
|
|
bool* gThresholdStreamWriteable = solverDesc->thresholdStreamWriteable;
|
|
|
|
PxReal* gAccumulatedForceObjectPairs = solverDesc->accumulatedForceObjectPairs;
|
|
|
|
const PxU32 nbThresholdElements = solverDesc->sharedThresholdStreamIndex;
|
|
|
|
const PxU32 globalThreadIdx = threadIdx.x + blockIdx.x * blockDim.x;
|
|
|
|
//accumulate normal force between blocks
|
|
for(PxU32 workIndex = globalThreadIdx; workIndex < nbThresholdElements; workIndex+=(blockDim.x*gridDim.x))
|
|
{
|
|
const PxU32 writeIndex = gThresholdStreamWriteIndex[workIndex];
|
|
|
|
bool isNewPairs = gThresholdStreamWriteable[workIndex];
|
|
|
|
if(isNewPairs)
|
|
{
|
|
gAccumulatedForceObjectPairs[writeIndex] = gAccumulatedForces[workIndex];
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
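// Compaction sketch for writeoutAccumulatedForcePerObject: only the last
// element of each (nodeIndexA, nodeIndexB) run has its writeable flag set,
// and its write index is the exclusive count of runs completed before it.
// For flags {0, 0, 1, 0, 1} the scanned write indices are {0, 0, 0, 1, 1},
// so the inclusive accumulated forces at indices 2 and 4 land in slots 0 and
// 1 of gAccumulatedForceObjectPairs.
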
extern "C" __global__ void computeExceededForceThresholdElementIndice(PxgSolverCoreDesc* solverDesc,
|
|
PxgSolverSharedDesc<IterativeSolveData>* sharedDesc)
|
|
{
|
|
const PxU32 nbBlocks = PxgKernelGridDim::COMPUTE_EXCEEDEDFORCE_THRESHOLDELEMENT_INDICE;
|
|
PX_COMPILE_TIME_ASSERT(nbBlocks == 32);
|
|
|
|
const PxU32 WARP_PERBLOCK_SIZE = PxgKernelBlockDim::COMPUTE_EXCEEDEDFORCE_THRESHOLDELEMENT_INDICE / WARP_SIZE;
|
|
|
|
const PxU32 LOG2_WARP_PERBLOCK_SIZE = 3;
|
|
|
|
assert((1 << LOG2_WARP_PERBLOCK_SIZE) == WARP_PERBLOCK_SIZE);
|
|
|
|
__shared__ PxU32 sWarpPairsAccumulator[WARP_PERBLOCK_SIZE];
|
|
__shared__ PxU32 sBlockPairsAccumulator;
|
|
|
|
const PxReal dt = sharedDesc->dt;
|
|
|
|
Dy::ThresholdStreamElement* gThresholdStream = solverDesc->thresholdStream;
|
|
PxReal* gAccumulatedForceObjectPairs = solverDesc->accumulatedForceObjectPairs;
|
|
|
|
PxU32* gThresholdStreamWriteIndex = solverDesc->thresholdStreamWriteIndex;
|
|
PxU32* gThresholdStreamWriteIndexBetweenBlocks = solverDesc->thresholdStreamWriteIndexBetweenBlocks;
|
|
bool* gThresholdStreamWriteable = solverDesc->thresholdStreamWriteable;
|
|
|
|
const PxU32 nbThresholdElements = solverDesc->sharedThresholdStreamIndex;
|
|
|
|
const PxU32 totalBlockRequired = (nbThresholdElements + (blockDim.x-1))/ blockDim.x;
|
|
|
|
const PxU32 numIterationPerBlock = (totalBlockRequired + (nbBlocks-1))/ nbBlocks;
|
|
|
|
const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE-1);
|
|
const PxU32 warpIndex = threadIdx.x/(WARP_SIZE);
|
|
|
|
const PxU32 idx = threadIdx.x;
|
|
|
|
if(threadIdx.x == 0)
|
|
{
|
|
sBlockPairsAccumulator = 0;
|
|
}
|
|
|
|
__syncthreads();
|
|
|
|
for(PxU32 i=0; i<numIterationPerBlock; ++i)
|
|
{
|
|
const PxU32 workIndex = idx + i*blockDim.x + blockDim.x * blockIdx.x *numIterationPerBlock;
|
|
|
|
bool isExceededForce = false;
|
|
|
|
if(workIndex < nbThresholdElements)
|
|
{
|
|
Dy::ThresholdStreamElement& element = gThresholdStream[workIndex];
|
|
|
|
//we are reusing the write index buffer. However, because the work index is the same, so as long as we read before we write, it should be safe
|
|
const PxU32 writeIndex = gThresholdStreamWriteIndex[workIndex];
|
|
PxReal accumulatedForce = gAccumulatedForceObjectPairs[writeIndex];
|
|
|
|
if(writeIndex > 0)
|
|
{
|
|
accumulatedForce -= gAccumulatedForceObjectPairs[writeIndex-1];
|
|
}
|
|
|
|
//write back the accumulated force
|
|
element.accumulatedForce = accumulatedForce;
|
|
|
|
isExceededForce = accumulatedForce > (element.threshold * dt);
|
|
}
|
|
|
|
const PxU32 threadMask = (1<<threadIndexInWarp)-1;
|
|
|
|
const PxU32 accumPairs = __popc(__ballot_sync(FULL_MASK, isExceededForce)&threadMask);
|
|
|
|
if(threadIndexInWarp == (WARP_SIZE-1))
|
|
{
|
|
sWarpPairsAccumulator[warpIndex] = accumPairs + isExceededForce;
|
|
}
|
|
|
|
const PxU32 prevsBlockPairsAccumulator = sBlockPairsAccumulator;
|
|
|
|
__syncthreads();
|
|
|
|
unsigned mask_idx = __ballot_sync(FULL_MASK, idx < WARP_PERBLOCK_SIZE);
|
|
if(idx < WARP_PERBLOCK_SIZE)
|
|
{
|
|
|
|
PxU32 pairVal = sWarpPairsAccumulator[idx];
|
|
const PxU32 accumulatedPairs = warpScan<AddOpPxU32, PxU32, LOG2_WARP_PERBLOCK_SIZE>(mask_idx, pairVal) - pairVal;
|
|
sWarpPairsAccumulator[idx] = accumulatedPairs;
|
|
|
|
if(threadIndexInWarp == (WARP_PERBLOCK_SIZE-1))
|
|
{
|
|
sBlockPairsAccumulator +=(accumulatedPairs + pairVal);
|
|
}
|
|
|
|
}
|
|
|
|
__syncthreads();
|
|
|
|
if(workIndex < nbThresholdElements)
|
|
{
|
|
gThresholdStreamWriteIndex[workIndex] = accumPairs + prevsBlockPairsAccumulator + sWarpPairsAccumulator[warpIndex];
|
|
gThresholdStreamWriteable[workIndex] = isExceededForce;
|
|
}
|
|
|
|
}
|
|
|
|
if(threadIdx.x == 0)
|
|
{
|
|
gThresholdStreamWriteIndexBetweenBlocks[blockIdx.x] = sBlockPairsAccumulator;
|
|
}
|
|
|
|
}
|
|
|
|
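// Per-pair force recovery sketch for the kernel above:
// gAccumulatedForceObjectPairs holds the inclusive force total at the end of
// each body-pair run, so adjacent differences recover each pair's force, e.g.
// totals {5, 9, 16} give pair forces {5, 4, 7}. The comparison against
// element.threshold * dt reads as a per-unit-time threshold scaled to this
// step (an inference from the expression; the units are defined elsewhere).
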
extern "C" __global__ void outputExceededForceThresholdElementIndice(PxgSolverCoreDesc* solverDesc)
|
|
{
|
|
const PxU32 nbBlocks = PxgKernelGridDim::OUTPUT_EXCEEDEDFORCE_THRESHOLDELEMENT_INDICE;
|
|
PX_COMPILE_TIME_ASSERT(nbBlocks == 32);
|
|
|
|
const PxU32 WARP_PERBLOCK_SIZE = PxgKernelBlockDim::OUTPUT_EXCEEDEDFORCE_THRESHOLDELEMENT_INDICE/WARP_SIZE;
|
|
|
|
PxU32* gThresholdStreamWriteIndex = solverDesc->thresholdStreamWriteIndex;
|
|
PxU32* gThresholdStreamWriteIndexBetweenBlocks = solverDesc->thresholdStreamWriteIndexBetweenBlocks;
|
|
|
|
const PxU32 nbThresholdElements = solverDesc->sharedThresholdStreamIndex;
|
|
bool* gThresholdStreamWriteable = solverDesc->thresholdStreamWriteable;
|
|
Dy::ThresholdStreamElement* gThresholdElements = solverDesc->thresholdStream;
|
|
Dy::ThresholdStreamElement* gExceededForceElements = solverDesc->exceededForceElements;
|
|
|
|
__shared__ PxU32 sBlockWriteIndexAccum[nbBlocks];
|
|
|
|
const PxU32 idx = threadIdx.x;
|
|
// const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE-1);
|
|
|
|
PxU32 pairIndex = 0;
|
|
PxU32 pairRes = 0;
|
|
|
|
unsigned mask_idx = __ballot_sync(FULL_MASK, idx < nbBlocks);
|
|
if(idx < nbBlocks)
|
|
{
|
|
pairIndex = gThresholdStreamWriteIndexBetweenBlocks[idx];
|
|
pairRes = warpScan<AddOpPxU32, PxU32>(mask_idx, pairIndex) - pairIndex;
|
|
|
|
sBlockWriteIndexAccum[idx] = pairRes;
|
|
}
|
|
|
|
__syncthreads();
|
|
|
|
const PxU32 totalBlockRequired = (nbThresholdElements + (blockDim.x-1))/ blockDim.x;
|
|
|
|
const PxU32 numIterationPerBlock = (totalBlockRequired + (nbBlocks-1))/ nbBlocks;
|
|
|
|
const PxU32 blockWriteIndexAccum = sBlockWriteIndexAccum[blockIdx.x];
|
|
|
|
//accumulate normal force between blocks
|
|
for(PxU32 i=0; i<numIterationPerBlock; ++i)
|
|
{
|
|
const PxU32 workIndex = i*WARP_SIZE*WARP_PERBLOCK_SIZE + idx + numIterationPerBlock * blockIdx.x * blockDim.x;
|
|
|
|
if(workIndex < nbThresholdElements)
|
|
{
|
|
///gThresholdStreamWriteIndex[workIndex] = gThresholdStreamWriteIndex[workIndex] + blockWriteIndexAccum;
|
|
|
|
const PxU32 writeIndex = gThresholdStreamWriteIndex[workIndex] + blockWriteIndexAccum;
|
|
|
|
bool isExceededForce = gThresholdStreamWriteable[workIndex];
|
|
|
|
if(isExceededForce)
|
|
{
|
|
Dy::ThresholdStreamElement& element = gThresholdElements[workIndex];
|
|
Dy::ThresholdStreamElement tempElement;
|
|
tempElement.shapeInteraction = element.shapeInteraction;
|
|
tempElement.nodeIndexA = element.nodeIndexA;
|
|
tempElement.nodeIndexB = element.nodeIndexB;
|
|
tempElement.normalForce = element.normalForce;
|
|
tempElement.accumulatedForce = element.accumulatedForce;
|
|
tempElement.threshold = element.threshold;
|
|
gExceededForceElements[writeIndex] = tempElement;
|
|
}
|
|
|
|
//last element
|
|
if(workIndex == nbThresholdElements -1)
|
|
{
|
|
solverDesc->nbExceededThresholdElements = isExceededForce ? (writeIndex + 1) : writeIndex;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
//The CPU ensures nbPrevExceededThresholdElements > 0 before launching this kernel, and all pair masks
//have been initialized to 1.
//gExceededForceElementMask lays out the previous exceeded threshold element masks first, the current
//exceeded threshold element masks second and the persistent exceeded threshold element masks last:
//  [0, nbPrev) | [nbPrev, nbPrev + nbCurrent) | [nbPrev + nbCurrent, 2*nbPrev + nbCurrent)
//A persistent exceeded element must appear in both the previous and the current exceeded threshold
//element arrays, so the number of persistent elements is at most the number of previous elements.
//The persistent exceeded threshold element mask therefore has the same size as the previous mask, and
//each of its entries corresponds to the same element as in the previous exceeded force element array.
extern "C" __global__ void setThresholdElementsMask(PxgSolverCoreDesc* solverDesc)
{
	Dy::ThresholdStreamElement* gExceededForceElements = solverDesc->exceededForceElements;
	Dy::ThresholdStreamElement* gPrevExceededForceElements = solverDesc->prevExceededForceElements;
	PxU32* gExceededForceElementMask = solverDesc->thresholdStreamWriteIndex;

	const PxU32 nbExceededThresholdElements = solverDesc->nbExceededThresholdElements;
	const PxU32 nbPrevExceededThresholdElements = solverDesc->nbPrevExceededThresholdElements;

	const PxU32 globalThreadIdx = threadIdx.x + blockIdx.x * blockDim.x;

	for(PxU32 workIndex = globalThreadIdx; workIndex < nbExceededThresholdElements; workIndex+=(blockDim.x*gridDim.x))
	{
		Dy::ThresholdStreamElement& element = gExceededForceElements[workIndex];

		//binarySearch finds the last element in the array that matches, if the value exists
		PxU32 pos = binarySearch<Dy::ThresholdStreamElement>(gPrevExceededForceElements, nbPrevExceededThresholdElements, element);
		Dy::ThresholdStreamElement* prevPair = &gPrevExceededForceElements[pos];

		bool done = false;

		while (!done)
		{
			done = true;
			if (prevPair->nodeIndexA == element.nodeIndexA && prevPair->nodeIndexB == element.nodeIndexB)
			{
				if (prevPair->shapeInteraction == element.shapeInteraction)
				{
					//found a matching pair: clear the masks so that no force-change event is generated.
					//Because the mask array stores both previous and current exceeded force pairs, we
					//clear two entries: one in the previous mask range and one in the current mask range
					gExceededForceElementMask[pos] = 0;
					gExceededForceElementMask[nbPrevExceededThresholdElements + workIndex] = 0;
				}
				else if (pos > 1)
				{
					//same body pair but different shape pair: walk backwards over the run of equal body pairs
					pos = pos - 1;
					prevPair = &gPrevExceededForceElements[pos];
					done = false;
				}
			}
		}
	}
}

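// Matching sketch for setThresholdElementsMask: elements sharing a body pair
// form a contiguous run in the sorted previous-frame array, so after
// binarySearch lands on the last entry of the run, the loop walks backwards
// comparing shapeInteraction until it finds the exact shape pair or leaves
// the run. On a match, both the previous-mask slot (pos) and the
// current-mask slot (nbPrevExceededThresholdElements + workIndex) are
// cleared, marking the pair persistent rather than found or lost.
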
__device__ void setPersistentForceElementMask(PxgSolverCoreDesc* solverDesc)
{
	PxU32* gExceededForceElementMask = solverDesc->thresholdStreamWriteIndex;

	const PxU32 nbExceededThresholdElements = solverDesc->nbExceededThresholdElements;
	const PxU32 nbPrevExceededThresholdElements = solverDesc->nbPrevExceededThresholdElements;

	const PxU32 globalThreadIdx = threadIdx.x + blockIdx.x * blockDim.x;

	const PxU32 persistentExceededStart = nbPrevExceededThresholdElements + nbExceededThresholdElements;

	for (PxU32 workIndex = globalThreadIdx; workIndex < nbPrevExceededThresholdElements; workIndex += (blockDim.x*gridDim.x))
	{
		//a pair is persistent exactly when its previous-mask entry was cleared by setThresholdElementsMask
		gExceededForceElementMask[persistentExceededStart + workIndex] = !gExceededForceElementMask[workIndex];
	}
}

extern "C" __global__ void computeThresholdElementMaskIndices(PxgSolverCoreDesc* solverDesc)
|
|
{
|
|
//this function should be called in setThresholdElementsMask. However, if there are no preExceededThresholdElements(which we will know in the CPU code), we don't
|
|
//kick of setThresholdElementMask kernel at all so the persistentExceededThresholdElementMask is still set to be one. Therefore, we need to call the setPersistentForceElement
|
|
//method in here
|
|
|
|
const PxU32 nbBlocks = PxgKernelGridDim::COMPUTE_THRESHOLDELEMENT_MASK_INDICES;
|
|
PX_COMPILE_TIME_ASSERT(nbBlocks == 32);
|
|
|
|
const PxU32 WARP_PERBLOCK_SIZE = PxgKernelBlockDim::COMPUTE_THRESHOLDELEMENT_MASK_INDICES / WARP_SIZE;
|
|
|
|
const PxU32 LOG2_WARP_PERBLOCK_SIZE = 3;
|
|
|
|
assert((1 << LOG2_WARP_PERBLOCK_SIZE) == WARP_PERBLOCK_SIZE);
|
|
|
|
setPersistentForceElementMask(solverDesc);
|
|
|
|
__shared__ PxU32 sWarpPairsAccumulator[WARP_PERBLOCK_SIZE];
|
|
__shared__ PxU32 sBlockPairsAccumulator;
|
|
|
|
PxU32* gExceededForceElementMask = solverDesc->thresholdStreamWriteIndex;
|
|
PxU32* gExceededForceElementMaskBetweenBlocks = solverDesc->thresholdStreamWriteIndexBetweenBlocks;
|
|
bool* gThresholdStreamWriteable = solverDesc->thresholdStreamWriteable;
|
|
|
|
const PxU32 nbExceededThresholdElements = solverDesc->nbExceededThresholdElements;
|
|
const PxU32 nbPrevExceededThresholdElements = solverDesc->nbPrevExceededThresholdElements;
|
|
|
|
//prev, current and persistent
|
|
const PxU32 totalNbExceededThresholdElements = nbExceededThresholdElements + nbPrevExceededThresholdElements*2;
|
|
|
|
const PxU32 totalBlockRequired = (totalNbExceededThresholdElements + (blockDim.x-1))/ blockDim.x;
|
|
|
|
const PxU32 numIterationPerBlock = (totalBlockRequired + (nbBlocks-1))/ nbBlocks;
|
|
|
|
const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE-1);
|
|
const PxU32 warpIndex = threadIdx.x/(WARP_SIZE);
|
|
|
|
const PxU32 idx = threadIdx.x;
|
|
|
|
if(threadIdx.x == 0)
|
|
{
|
|
sBlockPairsAccumulator = 0;
|
|
}
|
|
|
|
__syncthreads();
|
|
|
|
|
|
for(PxU32 i=0; i<numIterationPerBlock; ++i)
|
|
{
|
|
const PxU32 workIndex = idx + i*blockDim.x + blockDim.x * blockIdx.x *numIterationPerBlock;
|
|
|
|
PxU32 forceChangeMask = 0;
|
|
|
|
if(workIndex < totalNbExceededThresholdElements)
|
|
{
|
|
forceChangeMask = gExceededForceElementMask[workIndex];
|
|
}
|
|
|
|
const PxU32 threadMask = (1<<threadIndexInWarp)-1;
|
|
|
|
const PxU32 accumPairs = __popc(__ballot_sync(FULL_MASK, forceChangeMask)&threadMask);
|
|
|
|
if(threadIndexInWarp == (WARP_SIZE-1))
|
|
{
|
|
sWarpPairsAccumulator[warpIndex] = accumPairs + forceChangeMask;
|
|
}
|
|
|
|
const PxU32 prevsBlockPairsAccumulator = sBlockPairsAccumulator;
|
|
|
|
__syncthreads();
|
|
|
|
unsigned mask_idx = __ballot_sync(FULL_MASK, idx < WARP_PERBLOCK_SIZE);
|
|
if(idx < WARP_PERBLOCK_SIZE)
|
|
{
|
|
PxU32 pairVal = sWarpPairsAccumulator[idx];
|
|
const PxU32 accumulatedPairs = warpScan<AddOpPxU32, PxU32, LOG2_WARP_PERBLOCK_SIZE>(mask_idx, pairVal) - pairVal;
|
|
sWarpPairsAccumulator[idx] = accumulatedPairs;
|
|
|
|
if(threadIndexInWarp == (WARP_PERBLOCK_SIZE-1))
|
|
{
|
|
sBlockPairsAccumulator +=(accumulatedPairs + pairVal);
|
|
}
|
|
|
|
}
|
|
|
|
__syncthreads();
|
|
|
|
if(workIndex < totalNbExceededThresholdElements)
|
|
{
|
|
gExceededForceElementMask[workIndex] = accumPairs + prevsBlockPairsAccumulator + sWarpPairsAccumulator[warpIndex];
|
|
gThresholdStreamWriteable[workIndex] = !!(forceChangeMask);
|
|
}
|
|
|
|
}
|
|
|
|
if(threadIdx.x == 0)
|
|
{
|
|
gExceededForceElementMaskBetweenBlocks[blockIdx.x] = sBlockPairsAccumulator;
|
|
}
|
|
}
|
|
|
|
extern "C" __global__ void outputThresholdPairsMaskIndices(PxgSolverCoreDesc* solverDesc)
|
|
{
|
|
const PxU32 nbBlocks = PxgKernelGridDim::OUTPUT_THRESHOLDELEMENT_MASK_INDICES;
|
|
PX_COMPILE_TIME_ASSERT(nbBlocks == 32);
|
|
|
|
__shared__ PxU32 sBlockWriteIndexAccum[nbBlocks];
|
|
|
|
PxU32* gExceededForceElementMask = solverDesc->thresholdStreamWriteIndex;
|
|
PxU32* gExceededForceElementMaskBetweenBlocks = solverDesc->thresholdStreamWriteIndexBetweenBlocks;
|
|
|
|
|
|
const PxU32 nbExceededThresholdElements = solverDesc->nbExceededThresholdElements;
|
|
const PxU32 nbPrevExceededThresholdElements = solverDesc->nbPrevExceededThresholdElements;
|
|
|
|
//previous, current and persistent
|
|
const PxU32 totalNbExceededThresholdElements = nbExceededThresholdElements + nbPrevExceededThresholdElements*2;
|
|
|
|
const PxU32 idx = threadIdx.x;
|
|
|
|
|
|
PxU32 pairIndex = 0;
|
|
PxU32 pairRes = 0;
|
|
|
|
unsigned mask_idx = __ballot_sync(FULL_MASK, idx < nbBlocks);
|
|
if(idx < nbBlocks)
|
|
{
|
|
pairIndex = gExceededForceElementMaskBetweenBlocks[idx];
|
|
pairRes = warpScan<AddOpPxU32, PxU32>(mask_idx, pairIndex) - pairIndex;
|
|
|
|
sBlockWriteIndexAccum[idx] = pairRes;
|
|
}
|
|
|
|
__syncthreads();
|
|
|
|
const PxU32 totalBlockRequired = (totalNbExceededThresholdElements + (blockDim.x-1))/ blockDim.x;
|
|
|
|
const PxU32 numIterationPerBlock = (totalBlockRequired + (nbBlocks-1))/ nbBlocks;
|
|
|
|
const PxU32 blockWriteIndexAccum = sBlockWriteIndexAccum[blockIdx.x];
|
|
|
|
|
|
//accumulate normal force between blocks
|
|
for(PxU32 i=0; i<numIterationPerBlock; ++i)
|
|
{
|
|
const PxU32 workIndex = i*PxgKernelBlockDim::OUTPUT_THRESHOLDELEMENT_MASK_INDICES + idx + numIterationPerBlock * blockIdx.x * blockDim.x;
|
|
|
|
if(workIndex < totalNbExceededThresholdElements)
|
|
{
|
|
gExceededForceElementMask[workIndex] = gExceededForceElementMask[workIndex] + blockWriteIndexAccum;
|
|
}
|
|
}
|
|
}
|
|
|
|
extern "C" __global__ void createForceChangeThresholdElements(PxgSolverCoreDesc* solverDesc)
|
|
{
|
|
PxU32* gExceededForceElementMask = solverDesc->thresholdStreamWriteIndex;
|
|
Dy::ThresholdStreamElement* gExceededForceElements = solverDesc->exceededForceElements;
|
|
Dy::ThresholdStreamElement* gPrevExceededForceElements = solverDesc->prevExceededForceElements;
|
|
Dy::ThresholdStreamElement* gForceChangeElements = solverDesc->forceChangeThresholdElements;
|
|
|
|
//we copy the original mask value to thresholdStreamWriteable in computeThresholdElementMaskIndices so it corresponding with mask
|
|
bool* gThresholdStreamWriteable = solverDesc->thresholdStreamWriteable;
|
|
const PxU32 globalThreadIdx = threadIdx.x + blockIdx.x * blockDim.x;
|
|
const PxU32 nbExceededThresholdElements = solverDesc->nbExceededThresholdElements;
|
|
const PxU32 nbPrevExceededThresholdElements = solverDesc->nbPrevExceededThresholdElements;
|
|
|
|
//previous, current and persistent
|
|
const PxU32 totalNbExceededThresholdElements = nbExceededThresholdElements + nbPrevExceededThresholdElements*2;
|
|
const PxU32 persistentExceededStart = nbPrevExceededThresholdElements + nbExceededThresholdElements;
|
|
|
|
for(PxU32 workIndex = globalThreadIdx; workIndex < totalNbExceededThresholdElements; workIndex+=(blockDim.x*gridDim.x))
|
|
{
|
|
const bool hasForceChangeOrPersistent = gThresholdStreamWriteable[workIndex];
|
|
|
|
const PxU32 writeIndex = gExceededForceElementMask[workIndex];
|
|
|
|
if (hasForceChangeOrPersistent)
|
|
{
|
|
bool lostPair = workIndex < nbPrevExceededThresholdElements;
|
|
bool foundPair = (workIndex < persistentExceededStart) && !lostPair;
|
|
|
|
Dy::ThresholdStreamElement* pair = NULL;
|
|
if (lostPair)
|
|
{
|
|
pair = &gPrevExceededForceElements[workIndex];
|
|
}
|
|
else if (foundPair)
|
|
{
|
|
pair = &gExceededForceElements[workIndex - nbPrevExceededThresholdElements];
|
|
}
|
|
else
|
|
{
|
|
//persistent pair
|
|
pair = &gPrevExceededForceElements[workIndex - persistentExceededStart];
|
|
}
|
|
|
|
//Dy::ThresholdStreamElement& pair = lostPair ? gPrevExceededForceElements[workIndex] : gExceededForceElements[workIndex - nbPrevExceededThresholdElements];
|
|
|
|
Dy::ThresholdStreamElement tempPair;
|
|
tempPair.shapeInteraction = pair->shapeInteraction;
|
|
tempPair.nodeIndexA = pair->nodeIndexA;
|
|
tempPair.nodeIndexB = pair->nodeIndexB;
|
|
tempPair.normalForce = pair->normalForce;
|
|
tempPair.accumulatedForce = lostPair ? 0.f : pair->accumulatedForce;
|
|
tempPair.threshold = pair->threshold;
|
|
|
|
gForceChangeElements[writeIndex] = tempPair;
|
|
}
|
|
|
|
if(workIndex == totalNbExceededThresholdElements-1)
|
|
{
|
|
solverDesc->nbForceChangeElements = hasForceChangeOrPersistent ? (writeIndex + 1) : writeIndex;
|
|
}
|
|
}
|
|
}
|
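
// Index classification sketch for createForceChangeThresholdElements, with
// nbPrevExceededThresholdElements == 2 and nbExceededThresholdElements == 3
// (so persistentExceededStart == 5 and the total range is 7 entries):
//   workIndex 0..1 -> lost pairs, read from gPrevExceededForceElements,
//                     reported with accumulatedForce forced to 0
//   workIndex 2..4 -> found pairs, read from gExceededForceElements at
//                     workIndex - 2
//   workIndex 5..6 -> persistent pairs, read from gPrevExceededForceElements
//                     at workIndex - 5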