// XCEngine/engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/sparseGridStandalone.cu
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "vector_types.h"
#include "foundation/PxVec3.h"
#include "foundation/PxVec4.h"
#include "stdio.h"
#include "assert.h"
#include "cuda.h"
#include "sparseGridStandalone.cuh"
#define ENABLE_KERNEL_LAUNCH_ERROR_CHECK 0
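
//Sentinel values written into the subgrid order maps: NEW_SUBGRID marks a subgrid that still needs
//a memory slot assigned, REUSED_SUBGRID marks a previous-update entry whose slot was carried over.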
#define NEW_SUBGRID 0xfffffffe
#define REUSED_SUBGRID 0xfffffffd
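
//Empty host-side stub; presumably present so the module containing these kernels can be referenced and loaded.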
extern "C" __host__ void initSparseGridStandaloneKernels0() {}
extern "C" __global__ void sg_SparseGridCalcSubgridHashes(
PxSparseGridParams sparseGridParams,
PxU32* PX_RESTRICT indices,
PxU32* PX_RESTRICT hashkeyPerParticle,
const PxVec4* const PX_RESTRICT positions,
const PxU32 numParticles,
const PxU32* PX_RESTRICT phases,
const PxU32 validPhaseMask,
const PxU32* PX_RESTRICT activeIndices)
{
PxU32 p = threadIdx.x + blockIdx.x * blockDim.x;
if (p >= numParticles)
return;
if (activeIndices)
p = activeIndices[p];
const PxVec3 subgridDomainSize = getSubgridDomainSize(sparseGridParams, 0/*sparseGridParams.haloSize*/);
const PxVec3 pos = positions[p].getXYZ();
const int3 subgridId = calcSubgridId(pos, subgridDomainSize);
bool isValidPhase = phases == NULL || (phases[p] & validPhaseMask);
indices[p] = p;
hashkeyPerParticle[p] = isValidPhase ? calcSubgridHash(subgridId) : EMPTY_SUBGRID;
}
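
//Marks the entry for hashkey in the sorted hash array, including any duplicate entries on either side
//of the index found by binary search (the 27-neighbor collector can contain the same hash key several times).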
__device__ void applyMask(PxU32* mask, const PxU32* PX_RESTRICT uniqueSortedHashkey, PxU32 hashkey, PxU32 maxNumSubgrids)
{
	if (hashkey == EMPTY_SUBGRID)
		return;

	PxU32 sortedIdx = 0;
	const bool hashFound = tryFindHashkey(uniqueSortedHashkey, 27 * maxNumSubgrids, hashkey, sortedIdx);
	if (hashFound)
	{
		if (mask[sortedIdx] == 1)
			return; //Was already marked by another thread

		mask[sortedIdx] = 1;

		int i = sortedIdx - 1;
		while (i >= 0 && uniqueSortedHashkey[i] == hashkey)
			mask[i--] = 1;

		i = sortedIdx + 1;
		while (i < 27 * maxNumSubgrids && uniqueSortedHashkey[i] == hashkey)
			mask[i++] = 1;
	}
}
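
//For every fluid particle, determines which of the up to 8 subgrids touched by its neighborhood
//must exist and marks them in requiredNeighborMask.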
extern "C" __global__ void sg_SparseGridMarkRequiredNeighbors(
PxU32* requiredNeighborMask,
const PxU32* PX_RESTRICT uniqueSortedHashkey,
const PxSparseGridParams sparseGridParams,
PxU32 neighborhoodSize,
const PxVec4* particlePositions,
const PxU32 numParticles,
const PxU32* PX_RESTRICT phases,
const PxU32 validPhaseMask,
const PxU32* PX_RESTRICT activeIndices)
{
PxU32 i = threadIdx.x + blockIdx.x * blockDim.x;
if (i >= numParticles)
return;
if (activeIndices)
i = activeIndices[i];
if (phases && !(phases[i] & validPhaseMask))
return; //Avoid to allocate sparse grids in regions of non-fluid particles
const PxVec3 xp = particlePositions[i].getXYZ();
const PxU32 haloSize = 0; // sparseGridParams.haloSize;
const PxVec3 subgridDomainSize = getSubgridDomainSize(sparseGridParams, haloSize);
const int3 subgridId = calcSubgridId(xp, subgridDomainSize); //subgridIdsPerParticle[i]; // flipSubgridHashToId(hashkey);
const PxReal dx = sparseGridParams.gridSpacing;
const PxReal invDx = 1.0f / dx;
const PxVec3 subgridOrigin = PxVec3(
subgridId.x * dx * (sparseGridParams.subgridSizeX - 2 * haloSize),
subgridId.y * dx * (sparseGridParams.subgridSizeY - 2 * haloSize),
subgridId.z * dx * (sparseGridParams.subgridSizeZ - 2 * haloSize));
const PxVec3 localXp = xp - subgridOrigin;
int3 gridBaseCoord;
gridBaseCoord.x = PxClamp(int(floor(localXp.x * invDx)), 0, int(int(sparseGridParams.subgridSizeX) - 2 * haloSize - 1));
gridBaseCoord.y = PxClamp(int(floor(localXp.y * invDx)), 0, int(int(sparseGridParams.subgridSizeY) - 2 * haloSize - 1));
gridBaseCoord.z = PxClamp(int(floor(localXp.z * invDx)), 0, int(int(sparseGridParams.subgridSizeZ) - 2 * haloSize - 1));
//Find the neighboring subgrids (step has values -1/0/1 for x/y/z) that need to exist
int3 step;
step.x = gridBaseCoord.x < neighborhoodSize ? -1 : (gridBaseCoord.x >= sparseGridParams.subgridSizeX - 2 * haloSize - neighborhoodSize ? 1 : 0);
step.y = gridBaseCoord.y < neighborhoodSize ? -1 : (gridBaseCoord.y >= sparseGridParams.subgridSizeY - 2 * haloSize - neighborhoodSize ? 1 : 0);
step.z = gridBaseCoord.z < neighborhoodSize ? -1 : (gridBaseCoord.z >= sparseGridParams.subgridSizeZ - 2 * haloSize - neighborhoodSize ? 1 : 0);
//Mark the neighbor subgrids that need to exist such that particles with a radius >0 near the subgrid boundary can transfer their density to the grid
PxU32 buffer[8];
int indexer = 0;
buffer[indexer++] = calcSubgridHash(subgridId);
if (step.x != 0 && step.y != 0 && step.z != 0) buffer[indexer++] = subgridHashOffset(subgridId, step.x, step.y, step.z);
if (step.x != 0 && step.y != 0) buffer[indexer++] = subgridHashOffset(subgridId, step.x, step.y, 0);
if (step.x != 0 && step.z != 0) buffer[indexer++] = subgridHashOffset(subgridId, step.x, 0, step.z);
if (step.y != 0 && step.z != 0) buffer[indexer++] = subgridHashOffset(subgridId, 0, step.y, step.z);
if (step.x != 0) buffer[indexer++] = subgridHashOffset(subgridId, step.x, 0, 0);
if (step.y != 0) buffer[indexer++] = subgridHashOffset(subgridId, 0, step.y, 0);
if (step.z != 0) buffer[indexer++] = subgridHashOffset(subgridId, 0, 0, step.z);
for (int j = 0; j < indexer; ++j)
applyMask(requiredNeighborMask, uniqueSortedHashkey, buffer[j], sparseGridParams.maxNumSubgrids);
}
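
//Writes a 0/1 flag per element of a sorted array: 1 at the last element of every run of equal values
//(gated by the optional mask), 0 elsewhere, so that a prefix scan over the output enumerates the unique values.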
extern "C" __global__ void sg_SparseGridSortedArrayToDelta(
const PxU32* in,
const PxU32* mask,
PxU32* out,
PxU32 n)
{
const PxU32 i = threadIdx.x + blockIdx.x * blockDim.x;
if (i < n)
{
if (i < n - 1 && in[i] != in[i + 1])
out[i] = mask ? mask[i] : 1;
else
out[i] = 0;
if (i == n - 1)
out[i] = mask ? mask[i] : 1;
}
}
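
//Compacts every run of equal sorted hash keys into uniqueValues using the scanned run indices and,
//if a collector is provided, emits the 27 neighbor hash keys of each unique subgrid.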
extern "C" __global__ void sg_SparseGridGetUniqueValues(
const PxU32* sortedData,
const PxU32* indices,
PxU32* uniqueValues,
const PxU32 n,
PxU32* subgridNeighborCollector,
const PxU32 uniqueValuesSize)
{
const PxU32 i = threadIdx.x + blockIdx.x * blockDim.x;
if (i < n)
{
if (i == n - 1 || indices[i] != indices[i + 1])
{
if (indices[i] < uniqueValuesSize)
{
uniqueValues[indices[i]] = sortedData[i];
if (subgridNeighborCollector)
{
int4 id = subgridHashToId(sortedData[i]);
int indexer = 27 * indices[i];
for (int i = -1; i <= 1; ++i) for (int j = -1; j <= 1; ++j) for (int k = -1; k <= 1; ++k)
subgridNeighborCollector[indexer++] = calcSubgridHash(make_int3(id.x + i, id.y + j, id.z + k));
}
}
}
}
}
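
//Resets the density values of all cells belonging to active subgrids to clearValue.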
extern "C" __global__ void sg_SparseGridClearDensity(
PxReal* PX_RESTRICT density,
const PxReal clearValue,
const PxU32* numActiveSubgrids,
const PxU32 subgridSize
)
{
const PxU32 idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= (numActiveSubgrids[0]) * subgridSize)
return;
density[idx] = clearValue;
}
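
//For every active subgrid, looks up the sorted index of each of its 27 neighbors via binary search
//over the unique sorted hash keys and stores it in subgridNeighbors, or EMPTY_SUBGRID if the neighbor does not exist.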
extern "C" __global__ void sg_SparseGridBuildSubgridNeighbors(
const PxU32* PX_RESTRICT uniqueSortedHashkey,
const PxU32* PX_RESTRICT numActiveSubgrids,
const PxU32 maxNumSubgrids,
PxU32* PX_RESTRICT subgridNeighbors
)
{
const PxU32 si = blockIdx.x * blockDim.x + threadIdx.x;
if (si >= maxNumSubgrids)
return;
const PxU32 hash = uniqueSortedHashkey[si];
int4 sID = subgridHashToId(hash);
subgridNeighbors[27 * si + SUBGRID_CENTER_IDX] = si;
for (int z = -1; z <= 1; ++z) for (int y = -1; y <= 1; ++y) for (int x = -1; x <= 1; ++x)
{
const int3 nID = make_int3(sID.x + x, sID.y + y, sID.z + z);
const PxU32 nHash = calcSubgridHash(nID);
PxU32 n = EMPTY_SUBGRID;
if (isSubgridInsideRange(nID))
{
PxU32 nSortedIdx = 0;
if (tryFindHashkey(uniqueSortedHashkey, numActiveSubgrids[0]/* + 1*/, nHash, nSortedIdx))
n = nSortedIdx;
}
subgridNeighbors[27 * si + subgridNeighborIndex(x, y, z)] = n;
}
}
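
//Stores, for every subgrid, the exclusive end index of its particle range within the sorted particle array.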
extern "C" __global__ void sg_MarkSubgridEndIndices(const PxU32* sortedParticleToSubgrid, PxU32 numParticles, PxU32* subgridEndIndices)
{
PxI32 threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
if (threadIndex >= numParticles)
return;
if (threadIndex < numParticles - 1)
{
if (sortedParticleToSubgrid[threadIndex] != sortedParticleToSubgrid[threadIndex + 1])
subgridEndIndices[sortedParticleToSubgrid[threadIndex]] = threadIndex + 1;
}
else
subgridEndIndices[sortedParticleToSubgrid[threadIndex]] = numParticles;
}
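
//Matches this update's unique hash keys against the previous update's: subgrids found in both keep
//their previous memory slot, unmatched ones are tagged NEW_SUBGRID, and matched entries of the
//previous map are tagged REUSED_SUBGRID so that sg_AddReleasedSubgridsToUnusedStack skips them.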
extern "C" __global__ void sg_ReuseSubgrids(
const PxSparseGridParams sparseGridParams,
const PxU32* uniqueHashkeysPerSubgridPreviousUpdate,
const PxU32* numActiveSubgridsPreviousUpdate,
PxU32* subgridOrderMapPreviousUpdate,
const PxU32* uniqueHashkeysPerSubgrid,
const PxU32* numActiveSubgrids,
PxU32* subgridOrderMap)
{
PxI32 threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
if (threadIndex >= sparseGridParams.maxNumSubgrids)
return;
if (threadIndex >= numActiveSubgrids[0])
{
subgridOrderMap[threadIndex] = EMPTY_SUBGRID;
return;
}
const PxU32 hashkey = uniqueHashkeysPerSubgrid[threadIndex];
PxU32 sortedIdx = 0;
const bool hashFound = tryFindHashkey(uniqueHashkeysPerSubgridPreviousUpdate, numActiveSubgridsPreviousUpdate[0], hashkey, sortedIdx);
if (!hashFound)
{
subgridOrderMap[threadIndex] = NEW_SUBGRID;
return;
}
subgridOrderMap[threadIndex] = subgridOrderMapPreviousUpdate[sortedIdx];
subgridOrderMapPreviousUpdate[sortedIdx] = REUSED_SUBGRID;
}

PX_FORCE_INLINE __device__ void addIdToUnusedSubgridStack(PxU32 idToAddToStack, PxU32* unusedSubgridStackSize, PxU32* unusedSubgridStack)
{
	const PxU32 id = atomicAdd(unusedSubgridStackSize, 1);
	unusedSubgridStack[id] = idToAddToStack;
}

PX_FORCE_INLINE __device__ PxU32 getSubgridIdFromUnusedStack(PxU32* unusedSubgridStackSize, PxU32* unusedSubgridStack)
{
	const PxU32 id = PxU32(atomicAdd(reinterpret_cast<PxI32*>(unusedSubgridStackSize), -1));
	return unusedSubgridStack[id - 1];
}

//TODO: This method uses atomics. For better debugging, it might be worth offering a slower variant that generates 100% reproducible results
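//Pushes every previous-update subgrid slot that was not reused onto the unused stack so its memory can be recycled.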
extern "C" __global__ void sg_AddReleasedSubgridsToUnusedStack(
const PxU32* numActiveSubgridsPreviousUpdate,
const PxU32* subgridOrderMapPreviousUpdate,
PxU32* unusedSubgridStackSize,
PxU32* unusedSubgridStack)
{
PxI32 threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
if (threadIndex >= numActiveSubgridsPreviousUpdate[0])
return;
if (subgridOrderMapPreviousUpdate[threadIndex] != REUSED_SUBGRID)
addIdToUnusedSubgridStack(subgridOrderMapPreviousUpdate[threadIndex], unusedSubgridStackSize, unusedSubgridStack);
}

//TODO: This method uses atomics. For better debugging, it might be worth offering a slower variant that generates 100% reproducible results
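//Pops an unused memory slot from the stack for every subgrid tagged NEW_SUBGRID; if nothing was
//active in the previous update, all slots are assigned deterministically instead.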
extern "C" __global__ void sg_AllocateNewSubgrids(
const PxU32* numActiveSubgrids,
PxU32* subgridOrderMap,
PxU32* unusedSubgridStackSize,
PxU32* unusedSubgridStack,
const PxU32* numActiveSubgridsPreviousUpdate,
const PxU32 maxNumSubgrids)
{
PxI32 threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
if (threadIndex >= numActiveSubgrids[0])
return;
if (numActiveSubgridsPreviousUpdate[0] == 0)
{
PxU32 numActiveSubgridsClamped = PxMin(maxNumSubgrids, numActiveSubgrids[0]);
//Special case to simplify debugging: If no subgrids were active in the previous frame, then all subgrids present now must be new
//Make sure that the subgrid indices in the first frame are always identical. But the order might change in subsequent frames due to the use of atomics
//subgridOrderMap[threadIndex] = unusedSubgridStack[maxNumSubgrids - threadIndex - 1];
subgridOrderMap[threadIndex] = unusedSubgridStack[maxNumSubgrids - numActiveSubgridsClamped + threadIndex]; //Use this line to test with non-default subgrid order to ensure that the code does not only work with the default order
if (threadIndex == 0)
unusedSubgridStackSize[0] -= numActiveSubgridsClamped;
//If launched with 1024 threads per block, one could do per block scan and support 100% reproducible subgrid allocations using a block scan if maxNumSubgrids<=1024
}
else
{
if (subgridOrderMap[threadIndex] == NEW_SUBGRID)
{
subgridOrderMap[threadIndex] = getSubgridIdFromUnusedStack(unusedSubgridStackSize, unusedSubgridStack);
}
}
}
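
//Illustrative sketch, not part of the original PhysX source: a minimal runtime-API launch of the
//hash kernel with a common 256-thread block size. PhysX launches these kernels through its own GPU
//kernel infrastructure, so this only documents the expected launch dimensions. The function name is
//hypothetical and the block is compiled out.
#if 0
extern "C" __host__ void exampleLaunchCalcSubgridHashes(
	const PxSparseGridParams& sparseGridParams, PxU32* indices, PxU32* hashkeyPerParticle,
	const PxVec4* positions, PxU32 numParticles)
{
	const PxU32 blockSize = 256;
	const PxU32 numBlocks = (numParticles + blockSize - 1) / blockSize; //One thread per particle, rounded up
	sg_SparseGridCalcSubgridHashes<<<numBlocks, blockSize>>>(
		sparseGridParams, indices, hashkeyPerParticle, positions, numParticles,
		/*phases*/ NULL, /*validPhaseMask*/ 0, /*activeIndices*/ NULL);
}
#endif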