// Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved. // Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. // Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
#include "vector_types.h"
#include "foundation/PxVec3.h"
#include "foundation/PxVec4.h"
#include "foundation/PxBounds3.h"
#include "PxgParticleSystemCore.h"
#include "PxgParticleSystem.h"
#include "PxgParticleSystemCoreKernelIndices.h"
#include "PxgBodySim.h"
#include "PxgCommonDefines.h"
#include "reduction.cuh"
#include "shuffle.cuh"
#include "stdio.h"
#include "PxgSolverBody.h"
#include "PxgSolverCoreDesc.h"
#include "PxParticleSystem.h"
#include "assert.h"
#include "copy.cuh"
#include "PxgSimulationCoreDesc.h"
#include "gridCal.cuh"
#include "particleSystem.cuh"
#include "atomic.cuh"
#include "utils.cuh"

using namespace physx;

// Simpler kernel for diffuse weighting: a linear falloff, 1 - h * invR.
//   h    - distance between the two samples
//   invR - reciprocal of the support radius
__device__ inline PxReal WDiffuse(const PxReal h, const PxReal invR)
{
	return (1.0f - h * invR);
}

// Empty host-side function.
// NOTE(review): presumably referenced from host code so that this translation unit gets linked - confirm.
extern "C" __host__ void initDiffuseParticlesKernels0() {}

// Gathers the per-buffer diffuse particle arrays into the particle-system-wide unsorted arrays.
//
// Index mapping (as read from the code):
//   blockIdx.z -> entry in activeParticleSystems
//   blockIdx.y -> diffuse buffer index inside that particle system
//   threadIdx.x + blockDim.x * blockIdx.x -> particle index inside that buffer
//
// The destination offset of a buffer is the sum of mNumDiffuseParticles[0] over all buffers with a
// smaller index. Thread 0 of each buffer also stores that offset in buffer.mStartIndex.
extern "C" __global__ void ps_updateUnsortedDiffuseArrayLaunch(
	const PxgParticleSystem * PX_RESTRICT particleSystems,
	const PxU32 * PX_RESTRICT activeParticleSystems)
{
	const PxU32 particleId = activeParticleSystems[blockIdx.z];
	const PxgParticleSystem& particleSystem = particleSystems[particleId];

	const PxU32 bufferIndex = blockIdx.y;
	if (bufferIndex < particleSystem.mNumDiffuseBuffers)
	{
		const PxU32 threadIndexInWarp = threadIdx.x & 31;

		float4* PX_RESTRICT unsortedPositions = reinterpret_cast<float4*>(particleSystem.mDiffusePosition_LifeTime);
		float4* PX_RESTRICT unsortedVels = reinterpret_cast<float4*>(particleSystem.mDiffuseVelocity);

		// Every warp independently accumulates the particle counts of all preceding buffers,
		// with each lane handling the buffers i = lane, lane + WARP_SIZE, ...
		PxU32 localSum = 0;
		for (PxU32 i = threadIndexInWarp; i < bufferIndex; i += WARP_SIZE)
		{
			localSum += particleSystem.mDiffuseSimBuffers[i].mNumDiffuseParticles[0];
		}

		// This call must stay ahead of the early return below so that all lanes of the warp reach it.
		// NOTE(review): warpReduction is used here as "sum across the warp, result visible to every lane";
		// any explicit template arguments may have been lost from this text - confirm against reduction.cuh.
		PxU32 bufferOffset = warpReduction(FULL_MASK, localSum);

		PxgParticleDiffuseSimBuffer& buffer = particleSystem.mDiffuseSimBuffers[bufferIndex];
		int numDiffuseParticles = buffer.mNumDiffuseParticles[0];
		const float4* particles = buffer.mDiffusePositions_LifeTime;
		const float4* vels = buffer.mDiffuseVelocities;

		const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
		if (globalThreadIndex >= numDiffuseParticles)
			return;

		// Only reached when the buffer holds at least one particle (thread 0 passed the check above).
		if (globalThreadIndex == 0)
		{
			buffer.mStartIndex = bufferOffset;
		}

		const PxU32 ind = bufferOffset + globalThreadIndex;
		unsortedPositions[ind] = particles[globalThreadIndex];
		unsortedVels[ind] = vels[globalThreadIndex];
	}
}

// Applies the accumulated one-way contact corrections to the sorted diffuse particle positions.
//
// Index mapping (as read from the code):
//   blockIdx.y -> entry in activeParticleSystems
//   threadIdx.x + blockIdx.x * blockDim.x -> sorted diffuse particle index
//
// The particle system descriptor is first copied into shared memory by the whole block.
// The parameter `count` is not read by this kernel.
extern "C" __global__ void ps_diffuseParticleOneWayCollision(
	PxgParticleSystem * PX_RESTRICT particleSystems,
	const PxU32* PX_RESTRICT activeParticleSystems,
	const PxU32 count
)
{
	__shared__ __align__(16) PxU8 particleSystemMemory[sizeof(PxgParticleSystem)];
	PxgParticleSystem& shParticleSystem = *(reinterpret_cast<PxgParticleSystem*>(particleSystemMemory));

	const PxU32 id = activeParticleSystems[blockIdx.y];
	const PxgParticleSystem& particleSystem = particleSystems[id];

	const uint2* sParticleSystem = reinterpret_cast<const uint2*>(&particleSystem);
	uint2* dParticleSystem = reinterpret_cast<uint2*>(&shParticleSystem);
	blockCopy(dParticleSystem, sParticleSystem, sizeof(PxgParticleSystem));
	__syncthreads();

	const PxU32 pi = threadIdx.x + blockIdx.x * blockDim.x;
	const PxU32 numParticles = *shParticleSystem.mNumDiffuseParticles;
	if (pi >= numParticles)
		return;

	float4* PX_RESTRICT newPos = reinterpret_cast<float4*>(shParticleSystem.mDiffuseSortedPos_LifeTime);
	const PxgParticleContactInfo* PX_RESTRICT contacts = shParticleSystem.mDiffuseOneWayContactInfos;
	const PxU32* PX_RESTRICT contactCounts = shParticleSystem.mDiffuseOneWayContactCount;

	const PxU32 contactCount = PxMin(PxgParticleContactInfo::MaxStaticContactsPerParticle, contactCounts[pi]);
	if (contactCount)
	{
		PxVec3 posCorr = PxLoad3(newPos[pi]);

		// Contact c of particle pi is stored at index pi + c * numParticles.
		for (PxU32 c = 0, offset = pi; c < contactCount; ++c, offset += numParticles)
		{
			const PxgParticleContactInfo& contact = contacts[offset];
			const PxVec3 surfaceNormal = PxLoad3(contact.mNormal_PenW);
			const PxVec3 deltaP = -surfaceNormal * contact.mNormal_PenW.w;
			posCorr += deltaP;
		}

		// Only xyz is corrected; the w component of the position is written back unchanged.
		newPos[pi] = make_float4(posCorr.x, posCorr.y, posCorr.z, newPos[pi].w);
	}
}

// Computes, for every sorted diffuse particle, a weighted average of the velocities of nearby
// particles from the sorted particle arrays, together with the number of neighbours that contributed.
//
// Index mapping (as read from the code):
//   blockIdx.y -> entry in activeParticleSystems
//   threadIdx.x + blockIdx.x * blockDim.x -> sorted diffuse particle index
//
// Output: mDiffuseSortedOriginPos_LifeTime[pi] = (avgVel.x, avgVel.y, avgVel.z, numNeighbors).
// The parameters `gravity` and `dt` are not read by this kernel.
extern "C" __global__ void ps_diffuseParticleUpdatePBF(
	PxgParticleSystem* PX_RESTRICT particleSystems,
	const PxU32* activeParticleSystems,
	const PxVec3 gravity,
	const PxReal dt)
{
	__shared__ __align__(16) PxU8 particleSystemMemory[sizeof(PxgParticleSystem)];
	PxgParticleSystem& shParticleSystem = *(reinterpret_cast<PxgParticleSystem*>(particleSystemMemory));

	// Per-axis cell offsets in visiting order: own cell first, then -1, then +1.
	// Written by thread 0 and made visible to the block by the __syncthreads() below.
	__shared__ int offset[3];

	if (threadIdx.x == 0)
	{
		offset[0] = 0;
		offset[1] = -1;
		offset[2] = 1;
	}

	const PxU32 id = activeParticleSystems[blockIdx.y];
	const PxgParticleSystem& particleSystem = particleSystems[id];

	const uint2* sParticleSystem = reinterpret_cast<const uint2*>(&particleSystem);
	uint2* dParticleSystem = reinterpret_cast<uint2*>(&shParticleSystem);
	blockCopy(dParticleSystem, sParticleSystem, sizeof(PxgParticleSystem));
	__syncthreads();

	{
		int numDiffuse = *shParticleSystem.mNumDiffuseParticles;

		const PxU32 pi = threadIdx.x + blockIdx.x * blockDim.x;
		if (pi >= numDiffuse)
			return;

		const PxU32* const PX_RESTRICT cellStarts = shParticleSystem.mCellStart;
		const PxU32* const PX_RESTRICT cellEnds = shParticleSystem.mCellEnd;

		// per-particle data
		const float4* const PX_RESTRICT sortedPose = reinterpret_cast<const float4*>(shParticleSystem.mSortedPositions_InvMass);
		const float4* const PX_RESTRICT sortedVel = reinterpret_cast<const float4*>(shParticleSystem.mSortedVelocities);

		float4* PX_RESTRICT diffusePositions = reinterpret_cast<float4*>(shParticleSystem.mDiffuseSortedPos_LifeTime);
		//Overloading this buffer to store the new velocity...
		float4* PX_RESTRICT newVel = reinterpret_cast<float4*>(shParticleSystem.mDiffuseSortedOriginPos_LifeTime);

		// get elements
		const float4 xi4 = diffusePositions[pi];
		const PxVec3 pos = PxLoad3(xi4);

		// interpolate
		PxVec3 velAvg(PxZero);
		PxU32 numNeighbors = 0;

		const PxReal cellWidth = shParticleSystem.mCommonData.mGridCellWidth;
		const PxReal contactDistanceSq = shParticleSystem.mCommonData.mParticleContactDistanceSq;
		const PxReal invContactDistance = shParticleSystem.mCommonData.mParticleContactDistanceInv;

		const int3 gridPos = calcGridPos(xi4, cellWidth);
		const uint3 gridSize = make_uint3(shParticleSystem.mCommonData.mGridSizeX, shParticleSystem.mCommonData.mGridSizeY, shParticleSystem.mCommonData.mGridSizeZ);

		// Iterate over cell
		PxReal weightSum = 0.0f;
		PxVec3 velocitySum(0.f);

		// The search stops as soon as this many neighbours have contributed.
		const PxU32 maxNeighbors = 16;

		// With eFULL_DIFFUSE_ADVECTION all three offsets are visited per axis (3x3x3 cells);
		// otherwise only offset[0] == 0 is used, i.e. the particle's own cell.
		const PxU32 end = (shParticleSystem.mData.mFlags & PxParticleFlag::eFULL_DIFFUSE_ADVECTION) ? 3 : 1;

		for (int z = 0; z < end; ++z)
			for (int y = 0; y < end; ++y)
				for (int x = 0; x < end; ++x)
				{
					const int3 neighbourPos = make_int3(gridPos.x + offset[x], gridPos.y + offset[y], gridPos.z + offset[z]);
					const PxU32 gridHash = calcGridHash(neighbourPos, gridSize);
					const PxU32 startIndex = cellStarts[gridHash];

					if (startIndex != EMPTY_CELL)
					{
						const PxU32 endIndex = cellEnds[gridHash];
						for (PxU32 q = startIndex; q < endIndex; ++q)
						{
							const PxVec3 xj = PxLoad3(sortedPose[q]);
							const PxVec3 xij = pos - xj;
							const PxReal dSq = xij.dot(xij);

							if (dSq < contactDistanceSq)
							{
								const PxVec3 vj = PxLoad3(sortedVel[q]);
								const PxReal w = WDiffuse(sqrtf(dSq), invContactDistance);

								weightSum += w;
								velocitySum += vj * w;

								++numNeighbors;
								// Leave all four nested loops at once.
								if (numNeighbors == maxNeighbors)
									goto weight_sum;
							}
						}
					}
				}

	weight_sum:
		// velAvg stays zero when no neighbour contributed any weight.
		if (weightSum > 0)
			velAvg = velocitySum / weightSum;

		newVel[pi] = make_float4(velAvg.x, velAvg.y, velAvg.z, PxReal(numNeighbors));
	}
}

// Integrates the diffuse particles of one buffer, ages them by dt and writes the survivors back into
// the buffer's own arrays in compacted form.
//
// Index mapping (as read from the code):
//   blockIdx.z -> entry in activeParticleSystems
//   blockIdx.y -> diffuse buffer index
//   threadIdx.x + blockIdx.x * blockDim.x -> particle index inside the buffer
//
// The neighbour count stored in the w component by ps_diffuseParticleUpdatePBF selects the behaviour:
//   < 4  : spray  (own velocity with air drag)
//   < 8  : foam   (takes the averaged neighbour velocity)
//   else : bubble (buoyancy against gravity plus drag towards the averaged velocity)
//
// Survivors are appended using mNumDiffuseParticles[1] as the running output counter.
extern "C" __global__ void ps_diffuseParticleCompact(
	PxgParticleSystem* PX_RESTRICT particleSystems,
	const PxU32* activeParticleSystems,
	const PxVec3 gravity,
	const PxReal dt)
{
	__shared__ __align__(16) PxU8 particleSystemMemory[sizeof(PxgParticleSystem)];
	PxgParticleSystem& shParticleSystem = *(reinterpret_cast<PxgParticleSystem*>(particleSystemMemory));

	const PxU32 id = activeParticleSystems[blockIdx.z];
	const PxgParticleSystem& particleSystem = particleSystems[id];

	const uint2* sParticleSystem = reinterpret_cast<const uint2*>(&particleSystem);
	uint2* dParticleSystem = reinterpret_cast<uint2*>(&shParticleSystem);
	blockCopy(dParticleSystem, sParticleSystem, sizeof(PxgParticleSystem));
	__syncthreads();

	const PxU32 bufferIndex = blockIdx.y;
	if (bufferIndex < shParticleSystem.mNumDiffuseBuffers)
	{
		PxgParticleDiffuseSimBuffer& buffer = shParticleSystem.mDiffuseSimBuffers[bufferIndex];
		int* numDiffuseParticles = buffer.mNumDiffuseParticles;
		int numDiffuse = numDiffuseParticles[0];

		const PxU32 pi = threadIdx.x + blockIdx.x * blockDim.x;
		const PxU32 threadIndexInWarp = threadIdx.x & 31;

		// NOTE(review): lanes that return here do not execute the FULL_MASK warp intrinsics further down.
		// This relies on exited threads being exempt from the participation mask - confirm against the
		// CUDA programming guide for the targeted architectures.
		if (pi >= numDiffuse)
			return;

		float4* PX_RESTRICT diffusePositionsNew = buffer.mDiffusePositions_LifeTime;
		float4* PX_RESTRICT diffuseVelocitiesNew = buffer.mDiffuseVelocities;

		// Holds (avgVel.xyz, neighbourCount) as written by ps_diffuseParticleUpdatePBF.
		float4* PX_RESTRICT velAvgs = reinterpret_cast<float4*>(shParticleSystem.mDiffuseSortedOriginPos_LifeTime);

		float4* PX_RESTRICT diffusePositions = reinterpret_cast<float4*>(shParticleSystem.mDiffuseSortedPos_LifeTime);
		float4* PX_RESTRICT diffusePositionsOld = reinterpret_cast<float4*>(shParticleSystem.mDiffuseOriginPos_LifeTime);

		const PxU32* reverseLookup = shParticleSystem.mDiffuseUnsortedToSortedMapping;

		// Unsorted index of this particle in the system-wide arrays, then its sorted counterpart.
		const PxU32 index = pi + buffer.mStartIndex;
		const PxU32 sortedInd = reverseLookup[index];

		// get elements
		const float4 xi4 = diffusePositions[sortedInd];
		const float4 vi4Old = diffusePositionsOld[index];	// previous position, despite the name
		const float4 xiva4 = velAvgs[sortedInd];

		const PxVec3 pos = PxLoad3(xi4);
		const PxVec3 oldPos = PxLoad3(vi4Old);
		const PxVec3 velAvg = PxLoad3(xiva4);

		const PxReal lifeDelta = dt;

		// Velocity implied by the position change over this step.
		PxVec3 vel = (pos - oldPos)*(1.f / dt);

		// integrate diffuse particle
		PxVec3 newVel;
		if (xiva4.w < 4.f)
		{
			// spray (ballistic)
			newVel = vel * (1.0f - buffer.mParams.airDrag * dt);
		}
		else if (xiva4.w < 8.f)
		{
			// foam
			newVel = velAvg;
		}
		else
		{
			// bubble
			newVel = vel - (1.f + buffer.mParams.buoyancy) * gravity * dt + buffer.mParams.bubbleDrag * (velAvg - vel);
		}

		// Clamp the speed to mMaxVelocity; skipped for a zero vector, which cannot be normalized.
		const float maxVel = shParticleSystem.mData.mMaxVelocity;
		if (newVel.magnitudeSquared() > 0)
		{
			newVel = PxMin(newVel.magnitude(), maxVel) * newVel.getNormalized();
		}

		PxVec3 newPosCorr = pos + (newVel - vel) * dt;
		PxVec3 newVelCorr = newVel;

		__syncwarp();

		const PxReal lifeTime = fmaxf(xi4.w - lifeDelta, 0.0f);

		// Warp-aggregated append: one atomicAdd per warp reserves a contiguous output range,
		// then each surviving lane takes the slot given by its rank among the survivors.
		PxU32 res = __ballot_sync(FULL_MASK, lifeTime > 0.f);

		PxU32 offset = 0;
		if (threadIndexInWarp == 0)
			offset = atomicAdd(&numDiffuseParticles[1], __popc(res));

		offset = __shfl_sync(FULL_MASK, offset, 0);

		if (lifeTime > 0.f)
		{
			PxU32 newIndex = offset + warpScanExclusive(res, threadIndexInWarp);
			diffusePositionsNew[newIndex] = make_float4(newPosCorr.x, newPosCorr.y, newPosCorr.z, lifeTime);
			diffuseVelocitiesNew[newIndex] = make_float4(newVelCorr.x, newVelCorr.y, newVelCorr.z, 0.0f);
		}
	}
}

// Spawns new diffuse particles from fluid particles whose randomised intensity exceeds the threshold
// of the diffuse buffer attached to their particle buffer.
//
// Index mapping (as read from the code):
//   blockIdx.z -> entry in activeParticleSystems
//   blockIdx.y -> particle buffer index
//   threadIdx.x + blockIdx.x * blockDim.x -> particle index inside the particle buffer
//
// intensity = pressureWeight * potentials.y - divergenceWeight * potentials.x + kineticEnergyWeight * |v|^2
//
// Output slots are reserved with atomicAdd on mNumDiffuseParticles[1]. The counter keeps growing even
// when the buffer is full; only slots below mMaxNumParticles are written, and ps_diffuseParticleCopy
// clamps the counter afterwards.
extern "C" __global__ void ps_diffuseParticleCreate(
	PxgParticleSystem * PX_RESTRICT particleSystems,
	const PxU32* const PX_RESTRICT activeParticleSystems,
	const PxReal* const PX_RESTRICT randomTable,
	const PxU32 randomTableSize,
	const PxReal dt)
{
	__shared__ __align__(16) PxU8 particleSystemMemory[sizeof(PxgParticleSystem)];
	PxgParticleSystem& shParticleSystem = *(reinterpret_cast<PxgParticleSystem*>(particleSystemMemory));

	const PxU32 id = activeParticleSystems[blockIdx.z];
	const PxgParticleSystem& particleSystem = particleSystems[id];

	const uint2* sParticleSystem = reinterpret_cast<const uint2*>(&particleSystem);
	uint2* dParticleSystem = reinterpret_cast<uint2*>(&shParticleSystem);
	blockCopy(dParticleSystem, sParticleSystem, sizeof(PxgParticleSystem));
	__syncthreads();

	const PxU32 bufferIndex = blockIdx.y;
	if (bufferIndex < shParticleSystem.mCommonData.mNumParticleBuffers)
	{
		const PxgParticleSimBuffer& buffer = shParticleSystem.mParticleSimBuffers[bufferIndex];

		// 0xffffffff marks a particle buffer without an attached diffuse buffer.
		const PxU32 diffuseParticleBufferIndex = buffer.mDiffuseParticleBufferIndex;
		if (diffuseParticleBufferIndex == 0xffffffff)
			return;

		const PxgParticleSystemData& data = shParticleSystem.mData;

		const PxU32 pi = threadIdx.x + blockIdx.x * blockDim.x;
		const PxU32 numParticles = buffer.mNumActiveParticles;
		if (pi >= numParticles)
			return;

		PxgParticleDiffuseSimBuffer& diffuseBuffer = shParticleSystem.mDiffuseSimBuffers[diffuseParticleBufferIndex];
		if (diffuseBuffer.mMaxNumParticles == 0)
			return;

		// get arrays
		const float4* const PX_RESTRICT sortedPose = reinterpret_cast<const float4*>(shParticleSystem.mSortedPositions_InvMass);
		const float4* const PX_RESTRICT sortedVel = reinterpret_cast<const float4*>(shParticleSystem.mSortedVelocities);
		const PxU32* PX_RESTRICT phases = shParticleSystem.mSortedPhaseArray;
		const float2* const PX_RESTRICT potentials = reinterpret_cast<const float2*>(shParticleSystem.mDiffusePotentials);

		float4* PX_RESTRICT diffusePositionsNew = diffuseBuffer.mDiffusePositions_LifeTime;
		float4* PX_RESTRICT diffuseVelocitiesNew = diffuseBuffer.mDiffuseVelocities;
		int* numDiffuseParticles = diffuseBuffer.mNumDiffuseParticles;

		const PxU32* reverseLookup = shParticleSystem.mUnsortedToSortedMapping;

		// Read through the shared-memory copy like every other field; it is a byte copy of the
		// global descriptor, so the value is the same.
		const PxU32 offset = shParticleSystem.mParticleBufferRunsum[bufferIndex];
		const PxU32 sortedInd = reverseLookup[pi + offset];

		// get elements
		const float2 ptnts = potentials[sortedInd];
		const PxReal threshold = diffuseBuffer.mParams.threshold;

		// Only fluid particles spawn diffuse particles.
		const PxU32 phase = phases[sortedInd];
		if (!PxGetFluid(phase))
			return;

		const float4 vi4 = sortedVel[sortedInd];

		//Kinetic energy + pressure
		const PxReal kineticEnergy = dot3(vi4, vi4) * diffuseBuffer.mParams.kineticEnergyWeight;
		const PxReal divergence = diffuseBuffer.mParams.divergenceWeight * ptnts.x;
		const PxReal pressure = diffuseBuffer.mParams.pressureWeight * ptnts.y;
		PxReal intensity = pressure - divergence + kineticEnergy;

		//if (pi == 0)
		//	printf("numParticles %i diffuseParticleBufferIndex %i numDiffuseParticles[1] %i threshold %f\n", numParticles, diffuseParticleBufferIndex, numDiffuseParticles[1], threshold);

		// The random values r0..r3 are consecutive table entries starting at sortedInd (wrapped).
		const PxReal r0 = randomTable[(sortedInd + 0) % randomTableSize];

		if(r0 * intensity > threshold)
		{
			const float4 xi4 = sortedPose[sortedInd];

			//for (int i=0; i < 5; ++i)
			{
				// try and allocate new diffuse particles
				const int newIndex = atomicAdd(&numDiffuseParticles[1], 1);

				if (newIndex < diffuseBuffer.mMaxNumParticles)
				{
					const PxVec3 xi = PxLoad3(xi4);
					const PxVec3 vi = PxLoad3(vi4);

					const PxReal r1 = randomTable[(sortedInd + 1) % randomTableSize];
					const PxReal r2 = randomTable[(sortedInd + 2) % randomTableSize];
					const PxReal r3 = randomTable[(sortedInd + 3) % randomTableSize];

					// Lifetime lies between lifeMin and mParams.lifetime, scaled by intensity and r1.
					const PxReal lifeMin = 1.0f;
					const PxReal lifeMax = diffuseBuffer.mParams.lifetime;
					const PxReal lifeScale = fminf(intensity / threshold, 1.f) * r1;
					const PxReal lifetime = lifeMin + lifeScale * (lifeMax - lifeMin);

					// Spawn position: stepped back along the velocity by r2 * dt, plus a random
					// offset of up to a quarter of the rest offset per axis.
					const PxVec3 q = xi - r2 * vi * dt + PxVec3(r1, r2, r3) * data.mRestOffset * 0.25f;

					diffusePositionsNew[newIndex] = make_float4(q.x, q.y, q.z, lifetime);
					diffuseVelocitiesNew[newIndex] = make_float4(vi.x, vi.y, vi.z, 0.0f);
				}
			}
		}
	}
}

// Publishes the per-buffer diffuse particle count and resets the running counter.
//
// Index mapping (as read from the code):
//   blockIdx.z -> entry in activeParticleSystems
//   blockIdx.y -> diffuse buffer index
//
// For each buffer: numDiffuse = min(mMaxNumParticles, mNumDiffuseParticles[1]) is written to
// *mNumActiveDiffuseParticles and mNumDiffuseParticles[0], then mNumDiffuseParticles[1] is set to 0.
// The parameter `count` is not read by this kernel.
//
// NOTE(review): there is no thread guard, so every thread launched for the same (blockIdx.y, blockIdx.z)
// performs the same read-then-reset sequence. Presumably launched with a single thread per buffer -
// confirm at the launch site.
extern "C" __global__ void ps_diffuseParticleCopy(
	PxgParticleSystem * PX_RESTRICT particleSystems,
	const PxU32* const PX_RESTRICT activeParticleSystems,
	const PxU32 count)
{
	const PxU32 id = activeParticleSystems[blockIdx.z];
	PxgParticleSystem& particleSystem = particleSystems[id];

	const PxU32 numDiffuseBuffers = particleSystem.mNumDiffuseBuffers;
	const PxU32 bufferIndex = blockIdx.y;
	if (bufferIndex < numDiffuseBuffers)
	{
		PxgParticleDiffuseSimBuffer& diffuseBuffer = particleSystem.mDiffuseSimBuffers[bufferIndex];
		int* numDiffuseParticles = diffuseBuffer.mNumDiffuseParticles;

		// The running counter may exceed the capacity (see ps_diffuseParticleCreate), hence the clamp.
		const PxU32 numDiffuse = PxMin(PxI32(diffuseBuffer.mMaxNumParticles), numDiffuseParticles[1]);

		*diffuseBuffer.mNumActiveDiffuseParticles = numDiffuse; //pinned memory
		numDiffuseParticles[0] = numDiffuse;
		numDiffuseParticles[1] = 0;
	}
}

// Sums mNumDiffuseParticles[0] over all diffuse buffers of a particle system and stores the total in
// *mNumDiffuseParticles.
//
// Index mapping (as read from the code):
//   blockIdx.x -> entry in activeParticleSystems
//   threadIdx.x -> first buffer handled by this thread; further buffers follow at a stride of WARP_SIZE
//
// The parameter `count` is not read by this kernel.
//
// NOTE(review): the WARP_SIZE stride together with the FULL_MASK reduction suggests a launch with
// exactly one warp per block - confirm at the launch site.
extern "C" __global__ void ps_diffuseParticleSum(
	PxgParticleSystem * PX_RESTRICT particleSystems,
	const PxU32* const PX_RESTRICT activeParticleSystems,
	const PxU32 count)
{
	const PxU32 id = activeParticleSystems[blockIdx.x];
	PxgParticleSystem& particleSystem = particleSystems[id];

	const PxU32 numDiffuseBuffers = particleSystem.mNumDiffuseBuffers;

	PxU32 totalDiffuse = 0;
	for (PxU32 i = threadIdx.x; i < numDiffuseBuffers; i += WARP_SIZE)
	{
		PxgParticleDiffuseSimBuffer& diffuseBuffer = particleSystem.mDiffuseSimBuffers[i];
		totalDiffuse += diffuseBuffer.mNumDiffuseParticles[0];
	}

	// NOTE(review): any explicit template arguments of warpReduction may have been lost from this
	// text - confirm against reduction.cuh.
	totalDiffuse = warpReduction(FULL_MASK, totalDiffuse);

	if(threadIdx.x == 0)
	{
		*particleSystem.mNumDiffuseParticles = totalDiffuse;
	}
}