Files
XCEngine/engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/diffuseParticles.cu

553 lines
18 KiB
Plaintext
Raw Normal View History

// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "vector_types.h"
#include "foundation/PxVec3.h"
#include "foundation/PxVec4.h"
#include "foundation/PxBounds3.h"
#include "PxgParticleSystemCore.h"
#include "PxgParticleSystem.h"
#include "PxgParticleSystemCoreKernelIndices.h"
#include "PxgBodySim.h"
#include "PxgCommonDefines.h"
#include "reduction.cuh"
#include "shuffle.cuh"
#include "stdio.h"
#include "PxgSolverBody.h"
#include "PxgSolverCoreDesc.h"
#include "PxParticleSystem.h"
#include "assert.h"
#include "copy.cuh"
#include "PxgSimulationCoreDesc.h"
#include "gridCal.cuh"
#include "particleSystem.cuh"
#include "atomic.cuh"
#include "utils.cuh"
using namespace physx;
// simpler kernel for diffuse weighting
__device__ inline PxReal WDiffuse(const PxReal h, const PxReal invR)
{
return (1.0f - h * invR);
}
extern "C" __host__ void initDiffuseParticlesKernels0() {}
extern "C" __global__ void ps_updateUnsortedDiffuseArrayLaunch(
const PxgParticleSystem * PX_RESTRICT particleSystems,
const PxU32 * PX_RESTRICT activeParticleSystems)
{
const PxU32 particleId = activeParticleSystems[blockIdx.z];
const PxgParticleSystem& particleSystem = particleSystems[particleId];
const PxU32 bufferIndex = blockIdx.y;
if (bufferIndex < particleSystem.mNumDiffuseBuffers)
{
const PxU32 threadIndexInWarp = threadIdx.x & 31;
float4* PX_RESTRICT unsortedPositions = reinterpret_cast<float4*>(particleSystem.mDiffusePosition_LifeTime);
float4* PX_RESTRICT unsortedVels = reinterpret_cast<float4*>(particleSystem.mDiffuseVelocity);
PxU32 localSum = 0;
for (PxU32 i = threadIndexInWarp; i < bufferIndex; i += WARP_SIZE)
{
localSum += particleSystem.mDiffuseSimBuffers[i].mNumDiffuseParticles[0];
}
PxU32 bufferOffset = warpReduction<AddOpPxU32, PxU32>(FULL_MASK, localSum);
PxgParticleDiffuseSimBuffer& buffer = particleSystem.mDiffuseSimBuffers[bufferIndex];
int numDiffuseParticles = buffer.mNumDiffuseParticles[0];
const float4* particles = buffer.mDiffusePositions_LifeTime;
const float4* vels = buffer.mDiffuseVelocities;
const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
if (globalThreadIndex >= numDiffuseParticles)
return;
if (globalThreadIndex == 0)
{
buffer.mStartIndex = bufferOffset;
}
const PxU32 ind = bufferOffset + globalThreadIndex;
unsortedPositions[ind] = particles[globalThreadIndex];
unsortedVels[ind] = vels[globalThreadIndex];
}
}
extern "C" __global__ void ps_diffuseParticleOneWayCollision(
PxgParticleSystem * PX_RESTRICT particleSystems,
const PxU32* PX_RESTRICT activeParticleSystems,
const PxU32 count
)
{
__shared__ __align__(16) PxU8 particleSystemMemory[sizeof(PxgParticleSystem)];
PxgParticleSystem& shParticleSystem = *(reinterpret_cast<PxgParticleSystem*>(particleSystemMemory));
const PxU32 id = activeParticleSystems[blockIdx.y];
const PxgParticleSystem& particleSystem = particleSystems[id];
const uint2* sParticleSystem = reinterpret_cast<const uint2*>(&particleSystem);
uint2* dParticleSystem = reinterpret_cast<uint2*>(&shParticleSystem);
blockCopy<uint2>(dParticleSystem, sParticleSystem, sizeof(PxgParticleSystem));
__syncthreads();
const PxU32 pi = threadIdx.x + blockIdx.x * blockDim.x;
const PxU32 numParticles = *shParticleSystem.mNumDiffuseParticles;
if (pi >= numParticles)
return;
float4* PX_RESTRICT newPos = reinterpret_cast<float4*>(shParticleSystem.mDiffuseSortedPos_LifeTime);
const PxgParticleContactInfo* PX_RESTRICT contacts = shParticleSystem.mDiffuseOneWayContactInfos;
const PxU32* PX_RESTRICT contactCounts = shParticleSystem.mDiffuseOneWayContactCount;
const PxU32 contactCount = PxMin(PxgParticleContactInfo::MaxStaticContactsPerParticle, contactCounts[pi]);
if (contactCount)
{
PxVec3 posCorr = PxLoad3(newPos[pi]);
for (PxU32 c = 0, offset = pi; c < contactCount; ++c, offset += numParticles)
{
const PxgParticleContactInfo& contact = contacts[offset];
const PxVec3 surfaceNormal = PxLoad3(contact.mNormal_PenW);
const PxVec3 deltaP = -surfaceNormal * contact.mNormal_PenW.w;
posCorr += deltaP;
}
newPos[pi] = make_float4(posCorr.x, posCorr.y, posCorr.z, newPos[pi].w);
}
}
extern "C" __global__ void ps_diffuseParticleUpdatePBF(
PxgParticleSystem* PX_RESTRICT particleSystems,
const PxU32* activeParticleSystems,
const PxVec3 gravity,
const PxReal dt)
{
__shared__ __align__(16) PxU8 particleSystemMemory[sizeof(PxgParticleSystem)];
PxgParticleSystem& shParticleSystem = *(reinterpret_cast<PxgParticleSystem*>(particleSystemMemory));
__shared__ int offset[3];
if (threadIdx.x == 0)
{
offset[0] = 0; offset[1] = -1; offset[2] = 1;
}
const PxU32 id = activeParticleSystems[blockIdx.y];
const PxgParticleSystem& particleSystem = particleSystems[id];
const uint2* sParticleSystem = reinterpret_cast<const uint2*>(&particleSystem);
uint2* dParticleSystem = reinterpret_cast<uint2*>(&shParticleSystem);
blockCopy<uint2>(dParticleSystem, sParticleSystem, sizeof(PxgParticleSystem));
__syncthreads();
{
int numDiffuse = *shParticleSystem.mNumDiffuseParticles;
const PxU32 pi = threadIdx.x + blockIdx.x * blockDim.x;
if (pi >= numDiffuse)
return;
const PxU32* const PX_RESTRICT cellStarts = shParticleSystem.mCellStart;
const PxU32* const PX_RESTRICT cellEnds = shParticleSystem.mCellEnd;
// per-particle data
const float4* const PX_RESTRICT sortedPose = reinterpret_cast<float4*>(shParticleSystem.mSortedPositions_InvMass);
const float4* const PX_RESTRICT sortedVel = reinterpret_cast<float4*>(shParticleSystem.mSortedVelocities);
float4* PX_RESTRICT diffusePositions = reinterpret_cast<float4*>(shParticleSystem.mDiffuseSortedPos_LifeTime);
//Overloading this buffer to store the new velocity...
float4* PX_RESTRICT newVel = reinterpret_cast<float4*>(shParticleSystem.mDiffuseSortedOriginPos_LifeTime);
// get elements
const float4 xi4 = diffusePositions[pi];
const PxVec3 pos = PxLoad3(xi4);
// interpolate
PxVec3 velAvg(PxZero);
PxU32 numNeighbors = 0;
const PxReal cellWidth = shParticleSystem.mCommonData.mGridCellWidth;
const PxReal contactDistanceSq = shParticleSystem.mCommonData.mParticleContactDistanceSq;
const PxReal invContactDistance = shParticleSystem.mCommonData.mParticleContactDistanceInv;
const int3 gridPos = calcGridPos(xi4, cellWidth);
const uint3 gridSize = make_uint3(shParticleSystem.mCommonData.mGridSizeX, shParticleSystem.mCommonData.mGridSizeY, shParticleSystem.mCommonData.mGridSizeZ);
// Iterate over cell
PxReal weightSum = 0.0f;
PxVec3 velocitySum(0.f);
const PxU32 maxNeighbors = 16;
const PxU32 end = (shParticleSystem.mData.mFlags & PxParticleFlag::eFULL_DIFFUSE_ADVECTION) ? 3 : 1;
for (int z = 0; z < end; ++z)
for (int y = 0; y < end; ++y)
for (int x = 0; x < end; ++x)
{
const int3 neighbourPos = make_int3(gridPos.x + offset[x], gridPos.y + offset[y], gridPos.z + offset[z]);
const PxU32 gridHash = calcGridHash(neighbourPos, gridSize);
const PxU32 startIndex = cellStarts[gridHash];
if (startIndex != EMPTY_CELL)
{
const PxU32 endIndex = cellEnds[gridHash];
for (PxU32 q = startIndex; q < endIndex; ++q)
{
const PxVec3 xj = PxLoad3(sortedPose[q]);
const PxVec3 xij = pos - xj;
const PxReal dSq = xij.dot(xij);
if (dSq < contactDistanceSq)
{
const PxVec3 vj = PxLoad3(sortedVel[q]);
const PxReal w = WDiffuse(sqrtf(dSq), invContactDistance);
weightSum += w;
velocitySum += vj * w;
++numNeighbors;
if (numNeighbors == maxNeighbors)
goto weight_sum;
}
}
}
}
weight_sum:
if (weightSum > 0)
velAvg = velocitySum / weightSum;
newVel[pi] = make_float4(velAvg.x, velAvg.y, velAvg.z, PxReal(numNeighbors));
}
}
extern "C" __global__ void ps_diffuseParticleCompact(
PxgParticleSystem* PX_RESTRICT particleSystems,
const PxU32* activeParticleSystems,
const PxVec3 gravity,
const PxReal dt)
{
__shared__ __align__(16) PxU8 particleSystemMemory[sizeof(PxgParticleSystem)];
PxgParticleSystem& shParticleSystem = *(reinterpret_cast<PxgParticleSystem*>(particleSystemMemory));
const PxU32 id = activeParticleSystems[blockIdx.z];
const PxgParticleSystem& particleSystem = particleSystems[id];
const uint2* sParticleSystem = reinterpret_cast<const uint2*>(&particleSystem);
uint2* dParticleSystem = reinterpret_cast<uint2*>(&shParticleSystem);
blockCopy<uint2>(dParticleSystem, sParticleSystem, sizeof(PxgParticleSystem));
__syncthreads();
const PxU32 bufferIndex = blockIdx.y;
if (bufferIndex < shParticleSystem.mNumDiffuseBuffers)
{
PxgParticleDiffuseSimBuffer& buffer = shParticleSystem.mDiffuseSimBuffers[bufferIndex];
int* numDiffuseParticles = buffer.mNumDiffuseParticles;
int numDiffuse = numDiffuseParticles[0];
const PxU32 pi = threadIdx.x + blockIdx.x * blockDim.x;
const PxU32 threadIndexInWarp = threadIdx.x & 31;
if (pi >= numDiffuse)
return;
float4* PX_RESTRICT diffusePositionsNew = buffer.mDiffusePositions_LifeTime;
float4* PX_RESTRICT diffuseVelocitiesNew = buffer.mDiffuseVelocities;
float4* PX_RESTRICT velAvgs = reinterpret_cast<float4*>(shParticleSystem.mDiffuseSortedOriginPos_LifeTime);
float4* PX_RESTRICT diffusePositions = reinterpret_cast<float4*>(shParticleSystem.mDiffuseSortedPos_LifeTime);
float4* PX_RESTRICT diffusePositionsOld = reinterpret_cast<float4*>(shParticleSystem.mDiffuseOriginPos_LifeTime);
const PxU32* reverseLookup = shParticleSystem.mDiffuseUnsortedToSortedMapping;
const PxU32 index = pi + buffer.mStartIndex;
const PxU32 sortedInd = reverseLookup[index];
// get elements
const float4 xi4 = diffusePositions[sortedInd];
const float4 vi4Old = diffusePositionsOld[index];
const float4 xiva4 = velAvgs[sortedInd];
const PxVec3 pos = PxLoad3(xi4);
const PxVec3 oldPos = PxLoad3(vi4Old);
const PxVec3 velAvg = PxLoad3(xiva4);
const PxReal lifeDelta = dt;
PxVec3 vel = (pos - oldPos)*(1.f / dt);
// integrate diffuse particle
PxVec3 newVel;
if (xiva4.w < 4.f)
{
// spray (ballistic)
newVel = vel * (1.0f - buffer.mParams.airDrag * dt);
}
else if (xiva4.w < 8.f)
{
// foam
newVel = velAvg;
}
else
{
// bubble
newVel = vel - (1.f + buffer.mParams.buoyancy) * gravity * dt + buffer.mParams.bubbleDrag * (velAvg - vel);
}
const float maxVel = shParticleSystem.mData.mMaxVelocity;
if (newVel.magnitudeSquared() > 0)
{
newVel = PxMin(newVel.magnitude(), maxVel) * newVel.getNormalized();
}
PxVec3 newPosCorr = pos + (newVel - vel) * dt;
PxVec3 newVelCorr = newVel;
__syncwarp();
const PxReal lifeTime = fmaxf(xi4.w - lifeDelta, 0.0f);
PxU32 res = __ballot_sync(FULL_MASK, lifeTime > 0.f);
PxU32 offset = 0;
if (threadIndexInWarp == 0)
offset = atomicAdd(&numDiffuseParticles[1], __popc(res));
offset = __shfl_sync(FULL_MASK, offset, 0);
if (lifeTime > 0.f)
{
PxU32 newIndex = offset + warpScanExclusive(res, threadIndexInWarp);
diffusePositionsNew[newIndex] = make_float4(newPosCorr.x, newPosCorr.y, newPosCorr.z, lifeTime);
diffuseVelocitiesNew[newIndex] = make_float4(newVelCorr.x, newVelCorr.y, newVelCorr.z, 0.0f);
}
}
}
extern "C" __global__ void ps_diffuseParticleCreate(
PxgParticleSystem * PX_RESTRICT particleSystems,
const PxU32* const PX_RESTRICT activeParticleSystems,
const PxReal* const PX_RESTRICT randomTable,
const PxU32 randomTableSize,
const PxReal dt)
{
__shared__ __align__(16) PxU8 particleSystemMemory[sizeof(PxgParticleSystem)];
PxgParticleSystem& shParticleSystem = *(reinterpret_cast<PxgParticleSystem*>(particleSystemMemory));
const PxU32 id = activeParticleSystems[blockIdx.z];
const PxgParticleSystem& particleSystem = particleSystems[id];
const uint2* sParticleSystem = reinterpret_cast<const uint2*>(&particleSystem);
uint2* dParticleSystem = reinterpret_cast<uint2*>(&shParticleSystem);
blockCopy<uint2>(dParticleSystem, sParticleSystem, sizeof(PxgParticleSystem));
__syncthreads();
const PxU32 bufferIndex = blockIdx.y;
if (bufferIndex < shParticleSystem.mCommonData.mNumParticleBuffers)
{
const PxgParticleSimBuffer& buffer = shParticleSystem.mParticleSimBuffers[bufferIndex];
const PxU32 diffuseParticleBufferIndex = buffer.mDiffuseParticleBufferIndex;
if (diffuseParticleBufferIndex == 0xffffffff)
return;
const PxgParticleSystemData& data = shParticleSystem.mData;
const PxU32 pi = threadIdx.x + blockIdx.x * blockDim.x;
const PxU32 numParticles = buffer.mNumActiveParticles;
if (pi >= numParticles)
return;
PxgParticleDiffuseSimBuffer& diffuseBuffer = shParticleSystem.mDiffuseSimBuffers[diffuseParticleBufferIndex];
if (diffuseBuffer.mMaxNumParticles == 0)
return;
// get arrays
const float4* const PX_RESTRICT sortedPose = reinterpret_cast<float4*>(shParticleSystem.mSortedPositions_InvMass);
const float4* const PX_RESTRICT sortedVel = reinterpret_cast<float4*>(shParticleSystem.mSortedVelocities);
const PxU32* PX_RESTRICT phases = shParticleSystem.mSortedPhaseArray;
const float2* const PX_RESTRICT potentials = reinterpret_cast<float2*>(shParticleSystem.mDiffusePotentials);
float4* PX_RESTRICT diffusePositionsNew = diffuseBuffer.mDiffusePositions_LifeTime;
float4* PX_RESTRICT diffuseVelocitiesNew = diffuseBuffer.mDiffuseVelocities;
int* numDiffuseParticles = diffuseBuffer.mNumDiffuseParticles;
const PxU32* reverseLookup = shParticleSystem.mUnsortedToSortedMapping;
const PxU32 offset = particleSystem.mParticleBufferRunsum[bufferIndex];
const PxU32 sortedInd = reverseLookup[pi + offset];
// get elements
const float2 ptnts = potentials[sortedInd];
const PxReal threshold = diffuseBuffer.mParams.threshold;
const PxU32 phase = phases[sortedInd];
if (!PxGetFluid(phase))
return;
const float4 vi4 = sortedVel[sortedInd];
//Kinetic energy + pressure
const PxReal kineticEnergy = dot3(vi4, vi4) * diffuseBuffer.mParams.kineticEnergyWeight;
const PxReal divergence = diffuseBuffer.mParams.divergenceWeight * ptnts.x;
const PxReal pressure = diffuseBuffer.mParams.pressureWeight * ptnts.y;
PxReal intensity = pressure - divergence + kineticEnergy;
//if (pi == 0)
// printf("numParticles %i diffuseParticleBufferIndex %i numDiffuseParticles[1] %i threshold %f\n", numParticles, diffuseParticleBufferIndex, numDiffuseParticles[1], threshold);
const PxReal r0 = randomTable[(sortedInd + 0) % randomTableSize];
if(r0 * intensity > threshold)
{
const float4 xi4 = sortedPose[sortedInd];
//for (int i=0; i < 5; ++i)
{
// try and allocate new diffuse particles
const int newIndex = atomicAdd(&numDiffuseParticles[1], 1);
if (newIndex < diffuseBuffer.mMaxNumParticles)
{
const PxVec3 xi = PxLoad3(xi4);
const PxVec3 vi = PxLoad3(vi4);
const PxReal r1 = randomTable[(sortedInd + 1) % randomTableSize];
const PxReal r2 = randomTable[(sortedInd + 2) % randomTableSize];
const PxReal r3 = randomTable[(sortedInd + 3) % randomTableSize];
const PxReal lifeMin = 1.0f;
const PxReal lifeMax = diffuseBuffer.mParams.lifetime;
const PxReal lifeScale = fminf(intensity / threshold, 1.f) * r1;
const PxReal lifetime = lifeMin + lifeScale * (lifeMax - lifeMin);
const PxVec3 q = xi - r2 * vi * dt + PxVec3(r1, r2, r3) * data.mRestOffset * 0.25f;
diffusePositionsNew[newIndex] = make_float4(q.x, q.y, q.z, lifetime);
diffuseVelocitiesNew[newIndex] = make_float4(vi.x, vi.y, vi.z, 0.0f);
}
}
}
}
}
extern "C" __global__ void ps_diffuseParticleCopy(
PxgParticleSystem * PX_RESTRICT particleSystems,
const PxU32* const PX_RESTRICT activeParticleSystems,
const PxU32 count)
{
const PxU32 id = activeParticleSystems[blockIdx.z];
PxgParticleSystem& particleSystem = particleSystems[id];
const PxU32 numDiffuseBuffers = particleSystem.mNumDiffuseBuffers;
const PxU32 bufferIndex = blockIdx.y;
if (bufferIndex < numDiffuseBuffers)
{
PxgParticleDiffuseSimBuffer& diffuseBuffer = particleSystem.mDiffuseSimBuffers[bufferIndex];
int* numDiffuseParticles = diffuseBuffer.mNumDiffuseParticles;
const PxU32 numDiffuse = PxMin(PxI32(diffuseBuffer.mMaxNumParticles), numDiffuseParticles[1]);
*diffuseBuffer.mNumActiveDiffuseParticles = numDiffuse; //pinned memory
numDiffuseParticles[0] = numDiffuse;
numDiffuseParticles[1] = 0;
}
}
extern "C" __global__ void ps_diffuseParticleSum(
PxgParticleSystem * PX_RESTRICT particleSystems,
const PxU32* const PX_RESTRICT activeParticleSystems,
const PxU32 count)
{
const PxU32 id = activeParticleSystems[blockIdx.x];
PxgParticleSystem& particleSystem = particleSystems[id];
const PxU32 numDiffuseBuffers = particleSystem.mNumDiffuseBuffers;
PxU32 totalDiffuse = 0;
for (PxU32 i = threadIdx.x; i < numDiffuseBuffers; i += WARP_SIZE)
{
PxgParticleDiffuseSimBuffer& diffuseBuffer = particleSystem.mDiffuseSimBuffers[i];
totalDiffuse += diffuseBuffer.mNumDiffuseParticles[0];
}
totalDiffuse = warpReduction<AddOpPxU32, PxU32>(FULL_MASK, totalDiffuse);
if(threadIdx.x == 0)
{
*particleSystem.mNumDiffuseParticles = totalDiffuse;
}
}