// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#include "vector_types.h"
#include "foundation/PxVec3.h"
#include "foundation/PxVec4.h"
#include "stdio.h"
#include "assert.h"
#include "cuda.h"

#include "sparseGridStandalone.cuh"

#define ENABLE_KERNEL_LAUNCH_ERROR_CHECK 0
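
// Sentinel values used by the subgrid reuse pass (see sg_ReuseSubgrids below):
// NEW_SUBGRID marks an entry in the current order map that has no counterpart in the previous
// update and still needs a backing allocation; REUSED_SUBGRID marks an entry in the previous
// update's order map whose allocation is carried over to the current update.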
#define NEW_SUBGRID 0xfffffffe
#define REUSED_SUBGRID 0xfffffffd

extern "C" __host__ void initSparseGridStandaloneKernels0() {}
extern "C" __global__ void sg_SparseGridCalcSubgridHashes(
	PxSparseGridParams sparseGridParams,
	PxU32* PX_RESTRICT indices,
	PxU32* PX_RESTRICT hashkeyPerParticle,
	const PxVec4* const PX_RESTRICT positions,
	const PxU32 numParticles,
	const PxU32* PX_RESTRICT phases,
	const PxU32 validPhaseMask,
	const PxU32* PX_RESTRICT activeIndices)
{
	PxU32 p = threadIdx.x + blockIdx.x * blockDim.x;
	if (p >= numParticles)
		return;

	if (activeIndices)
		p = activeIndices[p];

	const PxVec3 subgridDomainSize = getSubgridDomainSize(sparseGridParams, 0/*sparseGridParams.haloSize*/);

	const PxVec3 pos = positions[p].getXYZ();
	const int3 subgridId = calcSubgridId(pos, subgridDomainSize);

	bool isValidPhase = phases == NULL || (phases[p] & validPhaseMask);

	indices[p] = p;
	hashkeyPerParticle[p] = isValidPhase ? calcSubgridHash(subgridId) : EMPTY_SUBGRID;
}
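
// Marks the entry for 'hashkey' in 'mask'. 'uniqueSortedHashkey' holds 27 candidate hashes per
// subgrid and may contain the same hash several times, so once tryFindHashkey locates one
// occurrence, the neighboring duplicate entries are marked as well.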
__device__ void applyMask(PxU32* mask, const PxU32* PX_RESTRICT uniqueSortedHashkey, PxU32 hashkey, PxU32 maxNumSubgrids)
{
	if (hashkey == EMPTY_SUBGRID)
		return;

	PxU32 sortedIdx = 0;
	const bool hashFound = tryFindHashkey(uniqueSortedHashkey, 27 * maxNumSubgrids, hashkey, sortedIdx);
	if (hashFound)
	{
		if (mask[sortedIdx] == 1)
			return; //Was already marked by another thread

		mask[sortedIdx] = 1;

		int i = sortedIdx - 1;
		while (i >= 0 && uniqueSortedHashkey[i] == hashkey)
			mask[i--] = 1;

		i = sortedIdx + 1;
		while (i < 27 * maxNumSubgrids && uniqueSortedHashkey[i] == hashkey)
			mask[i++] = 1;
	}
}
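
// For every fluid particle, determines which of the up to 7 neighboring subgrids must exist in
// addition to the particle's own subgrid (particles close to a subgrid face, edge or corner also
// contribute to the adjacent subgrids) and marks them in 'requiredNeighborMask'.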
extern "C" __global__ void sg_SparseGridMarkRequiredNeighbors(
	PxU32* requiredNeighborMask,
	const PxU32* PX_RESTRICT uniqueSortedHashkey,
	const PxSparseGridParams sparseGridParams,
	PxU32 neighborhoodSize,
	const PxVec4* particlePositions,
	const PxU32 numParticles,
	const PxU32* PX_RESTRICT phases,
	const PxU32 validPhaseMask,
	const PxU32* PX_RESTRICT activeIndices)
{
	PxU32 i = threadIdx.x + blockIdx.x * blockDim.x;
	if (i >= numParticles)
		return;

	if (activeIndices)
		i = activeIndices[i];

	if (phases && !(phases[i] & validPhaseMask))
		return; //Avoid allocating sparse grids in regions of non-fluid particles

	const PxVec3 xp = particlePositions[i].getXYZ();

	const PxU32 haloSize = 0; // sparseGridParams.haloSize;
	const PxVec3 subgridDomainSize = getSubgridDomainSize(sparseGridParams, haloSize);
	const int3 subgridId = calcSubgridId(xp, subgridDomainSize); //subgridIdsPerParticle[i]; // flipSubgridHashToId(hashkey);
	const PxReal dx = sparseGridParams.gridSpacing;
	const PxReal invDx = 1.0f / dx;

	const PxVec3 subgridOrigin = PxVec3(
		subgridId.x * dx * (sparseGridParams.subgridSizeX - 2 * haloSize),
		subgridId.y * dx * (sparseGridParams.subgridSizeY - 2 * haloSize),
		subgridId.z * dx * (sparseGridParams.subgridSizeZ - 2 * haloSize));
	const PxVec3 localXp = xp - subgridOrigin;

	int3 gridBaseCoord;
	gridBaseCoord.x = PxClamp(int(floor(localXp.x * invDx)), 0, int(int(sparseGridParams.subgridSizeX) - 2 * haloSize - 1));
	gridBaseCoord.y = PxClamp(int(floor(localXp.y * invDx)), 0, int(int(sparseGridParams.subgridSizeY) - 2 * haloSize - 1));
	gridBaseCoord.z = PxClamp(int(floor(localXp.z * invDx)), 0, int(int(sparseGridParams.subgridSizeZ) - 2 * haloSize - 1));

	//Find the neighboring subgrids (step has values -1/0/1 for x/y/z) that need to exist
	int3 step;
	step.x = gridBaseCoord.x < neighborhoodSize ? -1 : (gridBaseCoord.x >= sparseGridParams.subgridSizeX - 2 * haloSize - neighborhoodSize ? 1 : 0);
	step.y = gridBaseCoord.y < neighborhoodSize ? -1 : (gridBaseCoord.y >= sparseGridParams.subgridSizeY - 2 * haloSize - neighborhoodSize ? 1 : 0);
	step.z = gridBaseCoord.z < neighborhoodSize ? -1 : (gridBaseCoord.z >= sparseGridParams.subgridSizeZ - 2 * haloSize - neighborhoodSize ? 1 : 0);

	//Mark the neighbor subgrids that need to exist such that particles with a radius > 0 near the subgrid boundary can transfer their density to the grid
	PxU32 buffer[8];
	int indexer = 0;

	buffer[indexer++] = calcSubgridHash(subgridId);

	if (step.x != 0 && step.y != 0 && step.z != 0) buffer[indexer++] = subgridHashOffset(subgridId, step.x, step.y, step.z);

	if (step.x != 0 && step.y != 0) buffer[indexer++] = subgridHashOffset(subgridId, step.x, step.y, 0);
	if (step.x != 0 && step.z != 0) buffer[indexer++] = subgridHashOffset(subgridId, step.x, 0, step.z);
	if (step.y != 0 && step.z != 0) buffer[indexer++] = subgridHashOffset(subgridId, 0, step.y, step.z);

	if (step.x != 0) buffer[indexer++] = subgridHashOffset(subgridId, step.x, 0, 0);
	if (step.y != 0) buffer[indexer++] = subgridHashOffset(subgridId, 0, step.y, 0);
	if (step.z != 0) buffer[indexer++] = subgridHashOffset(subgridId, 0, 0, step.z);

	for (int j = 0; j < indexer; ++j)
		applyMask(requiredNeighborMask, uniqueSortedHashkey, buffer[j], sparseGridParams.maxNumSubgrids);
}
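
// Converts a sorted array into a delta array: out[i] is 1 (or mask[i] if a mask is given) at the
// last element of each run of equal values in 'in' and 0 everywhere else, so a prefix sum over
// 'out' counts the distinct runs.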
extern "C" __global__ void sg_SparseGridSortedArrayToDelta(
	const PxU32* in,
	const PxU32* mask,
	PxU32* out,
	PxU32 n)
{
	const PxU32 i = threadIdx.x + blockIdx.x * blockDim.x;
	if (i < n)
	{
		if (i < n - 1 && in[i] != in[i + 1])
			out[i] = mask ? mask[i] : 1;
		else
			out[i] = 0;
		if (i == n - 1)
			out[i] = mask ? mask[i] : 1;
	}
}
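
// Compacts the sorted hash array: at the end of each run of equal values in 'indices'
// (presumably the running sum built from the deltas above), the corresponding hash is written to
// uniqueValues[indices[i]]. If 'subgridNeighborCollector' is provided, the 27 neighbor subgrid
// hashes of that subgrid are also written at offset 27 * indices[i].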
extern "C" __global__ void sg_SparseGridGetUniqueValues(
	const PxU32* sortedData,
	const PxU32* indices,
	PxU32* uniqueValues,
	const PxU32 n,
	PxU32* subgridNeighborCollector,
	const PxU32 uniqueValuesSize)
{
	const PxU32 i = threadIdx.x + blockIdx.x * blockDim.x;
	if (i < n)
	{
		if (i == n - 1 || indices[i] != indices[i + 1])
		{
			if (indices[i] < uniqueValuesSize)
			{
				uniqueValues[indices[i]] = sortedData[i];

				if (subgridNeighborCollector)
				{
					int4 id = subgridHashToId(sortedData[i]);
					int indexer = 27 * indices[i];
					for (int x = -1; x <= 1; ++x) for (int y = -1; y <= 1; ++y) for (int z = -1; z <= 1; ++z)
						subgridNeighborCollector[indexer++] = calcSubgridHash(make_int3(id.x + x, id.y + y, id.z + z));
				}
			}
		}
	}
}
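
// Resets the density values of all currently active subgrids to 'clearValue'. One thread clears
// one value; 'subgridSize' is the number of values stored per subgrid.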
extern "C" __global__ void sg_SparseGridClearDensity(
	PxReal* PX_RESTRICT density,
	const PxReal clearValue,
	const PxU32* numActiveSubgrids,
	const PxU32 subgridSize
)
{
	const PxU32 idx = blockIdx.x * blockDim.x + threadIdx.x;
	if (idx >= (numActiveSubgrids[0]) * subgridSize)
		return;

	density[idx] = clearValue;
}
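
// Builds the 27-entry neighbor table for each subgrid slot: the entry at
// subgridNeighbors[27 * si + subgridNeighborIndex(x, y, z)] holds the sorted index of the
// neighboring subgrid at offset (x, y, z), or EMPTY_SUBGRID if that neighbor is not allocated or
// lies outside the valid coordinate range.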
extern "C" __global__ void sg_SparseGridBuildSubgridNeighbors(
	const PxU32* PX_RESTRICT uniqueSortedHashkey,
	const PxU32* PX_RESTRICT numActiveSubgrids,
	const PxU32 maxNumSubgrids,
	PxU32* PX_RESTRICT subgridNeighbors
)
{
	const PxU32 si = blockIdx.x * blockDim.x + threadIdx.x;

	if (si >= maxNumSubgrids)
		return;

	const PxU32 hash = uniqueSortedHashkey[si];

	int4 sID = subgridHashToId(hash);

	subgridNeighbors[27 * si + SUBGRID_CENTER_IDX] = si;

	for (int z = -1; z <= 1; ++z) for (int y = -1; y <= 1; ++y) for (int x = -1; x <= 1; ++x)
	{
		const int3 nID = make_int3(sID.x + x, sID.y + y, sID.z + z);
		const PxU32 nHash = calcSubgridHash(nID);

		PxU32 n = EMPTY_SUBGRID;
		if (isSubgridInsideRange(nID))
		{
			PxU32 nSortedIdx = 0;
			if (tryFindHashkey(uniqueSortedHashkey, numActiveSubgrids[0]/* + 1*/, nHash, nSortedIdx))
				n = nSortedIdx;
		}
		subgridNeighbors[27 * si + subgridNeighborIndex(x, y, z)] = n;
	}
}
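
// Writes, for each subgrid, the end index (one past the last particle) of its particle range in
// the sorted particle array into 'subgridEndIndices'.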
extern "C" __global__ void sg_MarkSubgridEndIndices(const PxU32* sortedParticleToSubgrid, PxU32 numParticles, PxU32* subgridEndIndices)
{
	PxI32 threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
	if (threadIndex >= numParticles)
		return;

	if (threadIndex < numParticles - 1)
	{
		if (sortedParticleToSubgrid[threadIndex] != sortedParticleToSubgrid[threadIndex + 1])
			subgridEndIndices[sortedParticleToSubgrid[threadIndex]] = threadIndex + 1;
	}
	else
		subgridEndIndices[sortedParticleToSubgrid[threadIndex]] = numParticles;
}
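
// Matches the subgrids of the current update against those of the previous update. Subgrids that
// already existed keep their previous backing allocation (the previous map entry is flagged
// REUSED_SUBGRID); subgrids without a match are flagged NEW_SUBGRID and receive an allocation
// later in sg_AllocateNewSubgrids.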
extern "C" __global__ void sg_ReuseSubgrids(
	const PxSparseGridParams sparseGridParams,
	const PxU32* uniqueHashkeysPerSubgridPreviousUpdate,
	const PxU32* numActiveSubgridsPreviousUpdate,
	PxU32* subgridOrderMapPreviousUpdate,

	const PxU32* uniqueHashkeysPerSubgrid,
	const PxU32* numActiveSubgrids,
	PxU32* subgridOrderMap)
{
	PxI32 threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
	if (threadIndex >= sparseGridParams.maxNumSubgrids)
		return;

	if (threadIndex >= numActiveSubgrids[0])
	{
		subgridOrderMap[threadIndex] = EMPTY_SUBGRID;
		return;
	}

	const PxU32 hashkey = uniqueHashkeysPerSubgrid[threadIndex];
	PxU32 sortedIdx = 0;
	const bool hashFound = tryFindHashkey(uniqueHashkeysPerSubgridPreviousUpdate, numActiveSubgridsPreviousUpdate[0], hashkey, sortedIdx);
	if (!hashFound)
	{
		subgridOrderMap[threadIndex] = NEW_SUBGRID;
		return;
	}

	subgridOrderMap[threadIndex] = subgridOrderMapPreviousUpdate[sortedIdx];
	subgridOrderMapPreviousUpdate[sortedIdx] = REUSED_SUBGRID;
}
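
// Helpers for the free-list of subgrid allocations: releasing a subgrid pushes its id onto the
// unused-subgrid stack, allocating pops the topmost id. Both use atomics on the stack size, so
// the resulting order is not deterministic across runs.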
PX_FORCE_INLINE __device__ void addIdToUnusedSubgridStack(PxU32 idToAddToStack, PxU32* unusedSubgridStackSize, PxU32* unusedSubgridStack)
{
	const PxU32 id = atomicAdd(unusedSubgridStackSize, 1);
	unusedSubgridStack[id] = idToAddToStack;
}

PX_FORCE_INLINE __device__ PxU32 getSubgridIdFromUnusedStack(PxU32* unusedSubgridStackSize, PxU32* unusedSubgridStack)
{
	const PxU32 id = PxU32(atomicAdd(reinterpret_cast<PxI32*>(unusedSubgridStackSize), -1));
	return unusedSubgridStack[id - 1];
}
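
// Returns the backing allocations of all subgrids that were active in the previous update but
// were not reused in the current one to the unused-subgrid stack.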
//TODO: This method uses atomics. For better debugging, it might be worth offering a slower variant that generates 100% reproducible results
extern "C" __global__ void sg_AddReleasedSubgridsToUnusedStack(
	const PxU32* numActiveSubgridsPreviousUpdate,
	const PxU32* subgridOrderMapPreviousUpdate,

	PxU32* unusedSubgridStackSize,
	PxU32* unusedSubgridStack)
{
	PxI32 threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
	if (threadIndex >= numActiveSubgridsPreviousUpdate[0])
		return;

	if (subgridOrderMapPreviousUpdate[threadIndex] != REUSED_SUBGRID)
		addIdToUnusedSubgridStack(subgridOrderMapPreviousUpdate[threadIndex], unusedSubgridStackSize, unusedSubgridStack);
}
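
// Assigns a backing allocation to every subgrid that was flagged NEW_SUBGRID by sg_ReuseSubgrids,
// popping ids from the unused-subgrid stack. The first update (no previously active subgrids) is
// special-cased so that the initial assignment is deterministic.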
//TODO: This method uses atomics. For better debugging, it might be worth offering a slower variant that generates 100% reproducible results
extern "C" __global__ void sg_AllocateNewSubgrids(
	const PxU32* numActiveSubgrids,
	PxU32* subgridOrderMap,

	PxU32* unusedSubgridStackSize,
	PxU32* unusedSubgridStack,

	const PxU32* numActiveSubgridsPreviousUpdate,
	const PxU32 maxNumSubgrids)
{
	PxI32 threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
	if (threadIndex >= numActiveSubgrids[0])
		return;

	if (numActiveSubgridsPreviousUpdate[0] == 0)
	{
		PxU32 numActiveSubgridsClamped = PxMin(maxNumSubgrids, numActiveSubgrids[0]);

		//Special case to simplify debugging: If no subgrids were active in the previous frame, then all subgrids present now must be new.
		//This keeps the subgrid indices of the first frame deterministic; in subsequent frames the order may change due to the use of atomics.
		//subgridOrderMap[threadIndex] = unusedSubgridStack[maxNumSubgrids - threadIndex - 1];
		subgridOrderMap[threadIndex] = unusedSubgridStack[maxNumSubgrids - numActiveSubgridsClamped + threadIndex]; //Use this line to test a non-default subgrid order and ensure that the code does not work only with the default order
		if (threadIndex == 0)
			unusedSubgridStackSize[0] -= numActiveSubgridsClamped;
		//If launched with 1024 threads per block, a per-block scan could make the subgrid allocation 100% reproducible as long as maxNumSubgrids <= 1024
	}
	else
	{
		if (subgridOrderMap[threadIndex] == NEW_SUBGRID)
		{
			subgridOrderMap[threadIndex] = getSubgridIdFromUnusedStack(unusedSubgridStackSize, unusedSubgridStack);
		}
	}
}