// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.

#include "vector_types.h"
#include "foundation/PxVec3.h"
#include "foundation/PxVec4.h"
#include "stdio.h"
#include "assert.h"
#include "cuda.h"
#include "sparseGridStandalone.cuh"

#define ENABLE_KERNEL_LAUNCH_ERROR_CHECK 0

#define NEW_SUBGRID 0xfffffffe
#define REUSED_SUBGRID 0xfffffffd

extern "C" __host__ void initSparseGridStandaloneKernels0() {}

extern "C" __global__ void sg_SparseGridCalcSubgridHashes(
	PxSparseGridParams sparseGridParams,
	PxU32* PX_RESTRICT indices,
	PxU32* PX_RESTRICT hashkeyPerParticle,
	const PxVec4* const PX_RESTRICT positions,
	const PxU32 numParticles,
	const PxU32* PX_RESTRICT phases,
	const PxU32 validPhaseMask,
	const PxU32* PX_RESTRICT activeIndices)
{
	PxU32 p = threadIdx.x + blockIdx.x * blockDim.x;

	if (p >= numParticles)
		return;

	if (activeIndices)
		p = activeIndices[p];

	const PxVec3 subgridDomainSize = getSubgridDomainSize(sparseGridParams, 0/*sparseGridParams.haloSize*/);

	const PxVec3 pos = positions[p].getXYZ();
	const int3 subgridId = calcSubgridId(pos, subgridDomainSize);

	bool isValidPhase = phases == NULL || (phases[p] & validPhaseMask);

	indices[p] = p;
	hashkeyPerParticle[p] = isValidPhase ? calcSubgridHash(subgridId) : EMPTY_SUBGRID;
}
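//Minimal host-side launch sketch for the kernel above (illustrative only; the real launch path, block size and stream live
//in the host-side sparse grid code and are assumptions here):
//
//	const PxU32 threadsPerBlock = 256;
//	const PxU32 numBlocks = (numParticles + threadsPerBlock - 1) / threadsPerBlock;
//	sg_SparseGridCalcSubgridHashes<<<numBlocks, threadsPerBlock, 0, stream>>>(
//		sparseGridParams, indices, hashkeyPerParticle, positions, numParticles, phases, validPhaseMask, activeIndices);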
__device__ void applyMask(PxU32* mask, const PxU32* PX_RESTRICT uniqueSortedHashkey, PxU32 hashkey, PxU32 maxNumSubgrids)
{
	if (hashkey == EMPTY_SUBGRID)
		return;

	PxU32 sortedIdx = 0;
	const bool hashFound = tryFindHashkey(uniqueSortedHashkey, 27 * maxNumSubgrids, hashkey, sortedIdx);
	if (hashFound)
	{
		if (mask[sortedIdx] == 1)
			return; //Was already marked by another thread

		mask[sortedIdx] = 1;
		int i = sortedIdx - 1;
		while (i >= 0 && uniqueSortedHashkey[i] == hashkey)
			mask[i--] = 1;
		i = sortedIdx + 1;
		while (i < 27 * maxNumSubgrids && uniqueSortedHashkey[i] == hashkey)
			mask[i++] = 1;
	}
}

extern "C" __global__ void sg_SparseGridMarkRequiredNeighbors(
	PxU32* requiredNeighborMask,
	const PxU32* PX_RESTRICT uniqueSortedHashkey,
	const PxSparseGridParams sparseGridParams,
	PxU32 neighborhoodSize,
	const PxVec4* particlePositions,
	const PxU32 numParticles,
	const PxU32* PX_RESTRICT phases,
	const PxU32 validPhaseMask,
	const PxU32* PX_RESTRICT activeIndices)
{
	PxU32 i = threadIdx.x + blockIdx.x * blockDim.x;

	if (i >= numParticles)
		return;

	if (activeIndices)
		i = activeIndices[i];

	if (phases && !(phases[i] & validPhaseMask))
		return; //Avoid allocating sparse grids in regions of non-fluid particles

	const PxVec3 xp = particlePositions[i].getXYZ();

	const PxU32 haloSize = 0; // sparseGridParams.haloSize;
	const PxVec3 subgridDomainSize = getSubgridDomainSize(sparseGridParams, haloSize);
	const int3 subgridId = calcSubgridId(xp, subgridDomainSize); //subgridIdsPerParticle[i]; // flipSubgridHashToId(hashkey);

	const PxReal dx = sparseGridParams.gridSpacing;
	const PxReal invDx = 1.0f / dx;

	const PxVec3 subgridOrigin = PxVec3(
		subgridId.x * dx * (sparseGridParams.subgridSizeX - 2 * haloSize),
		subgridId.y * dx * (sparseGridParams.subgridSizeY - 2 * haloSize),
		subgridId.z * dx * (sparseGridParams.subgridSizeZ - 2 * haloSize));

	const PxVec3 localXp = xp - subgridOrigin;

	int3 gridBaseCoord;
	gridBaseCoord.x = PxClamp(int(floor(localXp.x * invDx)), 0, int(int(sparseGridParams.subgridSizeX) - 2 * haloSize - 1));
	gridBaseCoord.y = PxClamp(int(floor(localXp.y * invDx)), 0, int(int(sparseGridParams.subgridSizeY) - 2 * haloSize - 1));
	gridBaseCoord.z = PxClamp(int(floor(localXp.z * invDx)), 0, int(int(sparseGridParams.subgridSizeZ) - 2 * haloSize - 1));

	//Find the neighboring subgrids (step has values -1/0/1 for x/y/z) that need to exist
	int3 step;
	step.x = gridBaseCoord.x < neighborhoodSize ? -1 : (gridBaseCoord.x >= sparseGridParams.subgridSizeX - 2 * haloSize - neighborhoodSize ? 1 : 0);
	step.y = gridBaseCoord.y < neighborhoodSize ? -1 : (gridBaseCoord.y >= sparseGridParams.subgridSizeY - 2 * haloSize - neighborhoodSize ? 1 : 0);
	step.z = gridBaseCoord.z < neighborhoodSize ? -1 : (gridBaseCoord.z >= sparseGridParams.subgridSizeZ - 2 * haloSize - neighborhoodSize ? 1 : 0);

	//Mark the neighbor subgrids that need to exist such that particles with a radius > 0 near the subgrid boundary can transfer their density to the grid
	PxU32 buffer[8];
	int indexer = 0;
	buffer[indexer++] = calcSubgridHash(subgridId);
	if (step.x != 0 && step.y != 0 && step.z != 0)
		buffer[indexer++] = subgridHashOffset(subgridId, step.x, step.y, step.z);
	if (step.x != 0 && step.y != 0)
		buffer[indexer++] = subgridHashOffset(subgridId, step.x, step.y, 0);
	if (step.x != 0 && step.z != 0)
		buffer[indexer++] = subgridHashOffset(subgridId, step.x, 0, step.z);
	if (step.y != 0 && step.z != 0)
		buffer[indexer++] = subgridHashOffset(subgridId, 0, step.y, step.z);
	if (step.x != 0)
		buffer[indexer++] = subgridHashOffset(subgridId, step.x, 0, 0);
	if (step.y != 0)
		buffer[indexer++] = subgridHashOffset(subgridId, 0, step.y, 0);
	if (step.z != 0)
		buffer[indexer++] = subgridHashOffset(subgridId, 0, 0, step.z);

	for (int j = 0; j < indexer; ++j)
		applyMask(requiredNeighborMask, uniqueSortedHashkey, buffer[j], sparseGridParams.maxNumSubgrids);
}
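//Illustrative example of the marking above: with neighborhoodSize = 1 and a particle whose base cell lies in the
//minimum-x/minimum-y corner region of its subgrid, step evaluates to (-1, -1, 0). The kernel then marks the particle's own
//subgrid plus the (-1,-1,0) edge neighbor and the (-1,0,0) and (0,-1,0) face neighbors, so a particle splatting its density
//across the shared faces/edge never touches an unallocated subgrid.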
extern "C" __global__ void sg_SparseGridSortedArrayToDelta(
	const PxU32* in,
	const PxU32* mask,
	PxU32* out,
	PxU32 n)
{
	const PxU32 i = threadIdx.x + blockIdx.x * blockDim.x;

	if (i < n)
	{
		if (i < n - 1 && in[i] != in[i + 1])
			out[i] = mask ? mask[i] : 1;
		else
			out[i] = 0;
		if (i == n - 1)
			out[i] = mask ? mask[i] : 1;
	}
}

extern "C" __global__ void sg_SparseGridGetUniqueValues(
	const PxU32* sortedData,
	const PxU32* indices,
	PxU32* uniqueValues,
	const PxU32 n,
	PxU32* subgridNeighborCollector,
	const PxU32 uniqueValuesSize)
{
	const PxU32 i = threadIdx.x + blockIdx.x * blockDim.x;

	if (i < n)
	{
		if (i == n - 1 || indices[i] != indices[i + 1])
		{
			if (indices[i] < uniqueValuesSize)
			{
				uniqueValues[indices[i]] = sortedData[i];
				if (subgridNeighborCollector)
				{
					int4 id = subgridHashToId(sortedData[i]);
					int indexer = 27 * indices[i];
					for (int i = -1; i <= 1; ++i)
						for (int j = -1; j <= 1; ++j)
							for (int k = -1; k <= 1; ++k)
								subgridNeighborCollector[indexer++] = calcSubgridHash(make_int3(id.x + i, id.y + j, id.z + k));
				}
			}
		}
	}
}
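//Worked example for the two kernels above (illustrative; the index array handed to sg_SparseGridGetUniqueValues is assumed
//to be an exclusive prefix sum of the delta array, computed elsewhere in the pipeline):
//	sorted hashes                   in      = [3, 3, 5, 7, 7, 7]
//	sg_SparseGridSortedArrayToDelta out     = [0, 1, 1, 0, 0, 1]   (1 at the last element of every run, mask == NULL)
//	exclusive scan of out           indices = [0, 0, 1, 2, 2, 2]
//	sg_SparseGridGetUniqueValues    uniqueValues = [3, 5, 7]       (one entry per distinct hash)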
extern "C" __global__ void sg_SparseGridClearDensity(
	PxReal* PX_RESTRICT density,
	const PxReal clearValue,
	const PxU32* numActiveSubgrids,
	const PxU32 subgridSize)
{
	const PxU32 idx = blockIdx.x * blockDim.x + threadIdx.x;

	if (idx >= (numActiveSubgrids[0]) * subgridSize)
		return;

	density[idx] = clearValue;
}

extern "C" __global__ void sg_SparseGridBuildSubgridNeighbors(
	const PxU32* PX_RESTRICT uniqueSortedHashkey,
	const PxU32* PX_RESTRICT numActiveSubgrids,
	const PxU32 maxNumSubgrids,
	PxU32* PX_RESTRICT subgridNeighbors)
{
	const PxU32 si = blockIdx.x * blockDim.x + threadIdx.x;

	if (si >= maxNumSubgrids)
		return;

	const PxU32 hash = uniqueSortedHashkey[si];
	int4 sID = subgridHashToId(hash);

	subgridNeighbors[27 * si + SUBGRID_CENTER_IDX] = si;

	for (int z = -1; z <= 1; ++z)
		for (int y = -1; y <= 1; ++y)
			for (int x = -1; x <= 1; ++x)
			{
				const int3 nID = make_int3(sID.x + x, sID.y + y, sID.z + z);
				const PxU32 nHash = calcSubgridHash(nID);

				PxU32 n = EMPTY_SUBGRID;
				if (isSubgridInsideRange(nID))
				{
					PxU32 nSortedIdx = 0;
					if (tryFindHashkey(uniqueSortedHashkey, numActiveSubgrids[0]/* + 1*/, nHash, nSortedIdx))
						n = nSortedIdx;
				}
				subgridNeighbors[27 * si + subgridNeighborIndex(x, y, z)] = n;
			}
}

extern "C" __global__ void sg_MarkSubgridEndIndices(const PxU32* sortedParticleToSubgrid, PxU32 numParticles, PxU32* subgridEndIndices)
{
	PxI32 threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
	if (threadIndex >= numParticles)
		return;

	if (threadIndex < numParticles - 1)
	{
		if (sortedParticleToSubgrid[threadIndex] != sortedParticleToSubgrid[threadIndex + 1])
			subgridEndIndices[sortedParticleToSubgrid[threadIndex]] = threadIndex + 1;
	}
	else
		subgridEndIndices[sortedParticleToSubgrid[threadIndex]] = numParticles;
}

extern "C" __global__ void sg_ReuseSubgrids(
	const PxSparseGridParams sparseGridParams,
	const PxU32* uniqueHashkeysPerSubgridPreviousUpdate,
	const PxU32* numActiveSubgridsPreviousUpdate,
	PxU32* subgridOrderMapPreviousUpdate,
	const PxU32* uniqueHashkeysPerSubgrid,
	const PxU32* numActiveSubgrids,
	PxU32* subgridOrderMap)
{
	PxI32 threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
	if (threadIndex >= sparseGridParams.maxNumSubgrids)
		return;

	if (threadIndex >= numActiveSubgrids[0])
	{
		subgridOrderMap[threadIndex] = EMPTY_SUBGRID;
		return;
	}

	const PxU32 hashkey = uniqueHashkeysPerSubgrid[threadIndex];

	PxU32 sortedIdx = 0;
	const bool hashFound = tryFindHashkey(uniqueHashkeysPerSubgridPreviousUpdate, numActiveSubgridsPreviousUpdate[0], hashkey, sortedIdx);
	if (!hashFound)
	{
		subgridOrderMap[threadIndex] = NEW_SUBGRID;
		return;
	}

	subgridOrderMap[threadIndex] = subgridOrderMapPreviousUpdate[sortedIdx];
	subgridOrderMapPreviousUpdate[sortedIdx] = REUSED_SUBGRID;
}

PX_FORCE_INLINE __device__ void addIdToUnusedSubgridStack(PxU32 idToAddToStack, PxU32* unusedSubgridStackSize, PxU32* unusedSubgridStack)
{
	const PxU32 id = atomicAdd(unusedSubgridStackSize, 1);
	unusedSubgridStack[id] = idToAddToStack;
}

PX_FORCE_INLINE __device__ PxU32 getSubgridIdFromUnusedStack(PxU32* unusedSubgridStackSize, PxU32* unusedSubgridStack)
{
	const PxU32 id = PxU32(atomicAdd(reinterpret_cast<PxI32*>(unusedSubgridStackSize), -1));
	return unusedSubgridStack[id - 1];
}

//TODO: This method uses atomics. For better debugging, it might be worth offering a slower variant that generates 100% reproducible results
extern "C" __global__ void sg_AddReleasedSubgridsToUnusedStack(
	const PxU32* numActiveSubgridsPreviousUpdate,
	const PxU32* subgridOrderMapPreviousUpdate,
	PxU32* unusedSubgridStackSize,
	PxU32* unusedSubgridStack)
{
	PxI32 threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
	if (threadIndex >= numActiveSubgridsPreviousUpdate[0])
		return;

	if (subgridOrderMapPreviousUpdate[threadIndex] != REUSED_SUBGRID)
		addIdToUnusedSubgridStack(subgridOrderMapPreviousUpdate[threadIndex], unusedSubgridStackSize, unusedSubgridStack);
}
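//Illustrative example of the reuse bookkeeping across one update (hash and slot values are made up): the previous update had
//subgrids with hashes {A, B, C} mapped to physical slots {5, 2, 9}; the current update's active hashes are {B, C, D}.
//sg_ReuseSubgrids maps B->2 and C->9 (marking those previous entries as REUSED_SUBGRID) and flags D as NEW_SUBGRID;
//sg_AddReleasedSubgridsToUnusedStack pushes A's old slot 5 onto the unused stack; sg_AllocateNewSubgrids below then pops
//slot 5 again to back the new subgrid D.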
//TODO: This method uses atomics. For better debugging, it might be worth offering a slower variant that generates 100% reproducible results
extern "C" __global__ void sg_AllocateNewSubgrids(
	const PxU32* numActiveSubgrids,
	PxU32* subgridOrderMap,
	PxU32* unusedSubgridStackSize,
	PxU32* unusedSubgridStack,
	const PxU32* numActiveSubgridsPreviousUpdate,
	const PxU32 maxNumSubgrids)
{
	PxI32 threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
	if (threadIndex >= numActiveSubgrids[0])
		return;

	if (numActiveSubgridsPreviousUpdate[0] == 0)
	{
		PxU32 numActiveSubgridsClamped = PxMin(maxNumSubgrids, numActiveSubgrids[0]);

		//Special case to simplify debugging: If no subgrids were active in the previous frame, then all subgrids present now must be new.
		//Make sure that the subgrid indices in the first frame are always identical. The order might still change in subsequent frames due to the use of atomics.
		//subgridOrderMap[threadIndex] = unusedSubgridStack[maxNumSubgrids - threadIndex - 1];
		subgridOrderMap[threadIndex] = unusedSubgridStack[maxNumSubgrids - numActiveSubgridsClamped + threadIndex]; //Use this line to test with a non-default subgrid order to ensure that the code does not only work with the default order

		if (threadIndex == 0)
			unusedSubgridStackSize[0] -= numActiveSubgridsClamped;

		//If launched with 1024 threads per block, a block-level scan could make subgrid allocation 100% reproducible as long as maxNumSubgrids <= 1024
	}
	else
	{
		if (subgridOrderMap[threadIndex] == NEW_SUBGRID)
		{
			subgridOrderMap[threadIndex] = getSubgridIdFromUnusedStack(unusedSubgridStackSize, unusedSubgridStack);
		}
	}
}
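//Note on the overall update (an assumption based on the kernel names and signatures in this file; the actual orchestration,
//sorting and prefix-sum primitives live in the host-side sparse grid code): a typical update presumably runs
//sg_SparseGridCalcSubgridHashes, sorts the (hash, particle) pairs, compacts them via sg_SparseGridSortedArrayToDelta, an
//exclusive scan and sg_SparseGridGetUniqueValues, optionally grows the set with sg_SparseGridMarkRequiredNeighbors, then
//builds per-subgrid data with sg_SparseGridBuildSubgridNeighbors and sg_MarkSubgridEndIndices, and finally recycles physical
//subgrid memory with sg_ReuseSubgrids, sg_AddReleasedSubgridsToUnusedStack and sg_AllocateNewSubgrids.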