// engine/third_party/physx/source/gpusimulationcontroller/src/CUDA/diffuseParticles.cu

// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.  


#include "vector_types.h"
#include "foundation/PxVec3.h"
#include "foundation/PxVec4.h"
#include "foundation/PxBounds3.h"
#include "PxgParticleSystemCore.h"
#include "PxgParticleSystem.h"
#include "PxgParticleSystemCoreKernelIndices.h"
#include "PxgBodySim.h"
#include "PxgCommonDefines.h"
#include "reduction.cuh"
#include "shuffle.cuh"
#include "stdio.h"
#include "PxgSolverBody.h"
#include "PxgSolverCoreDesc.h"
#include "PxParticleSystem.h"
#include "assert.h"
#include "copy.cuh"
#include "PxgSimulationCoreDesc.h"
#include "gridCal.cuh"
#include "particleSystem.cuh"
#include "atomic.cuh"
#include "utils.cuh"

using namespace physx;

// Simple linear weighting kernel for diffuse particles.
//   h    : value to weight; the caller in this file passes the distance between two particles
//   invR : reciprocal scale; the caller in this file passes the inverse particle contact distance
// Returns 1 - h * invR.
__device__ inline PxReal WDiffuse(const PxReal h, const PxReal invR)
{
	const PxReal scaledDistance = h * invR;
	return 1.0f - scaledDistance;
}

// Empty host-side function with C linkage; it performs no work.
// NOTE(review): presumably exists so host code can reference a symbol from this translation
// unit (e.g. to keep the kernels linked in) - confirm against the callers.
extern "C" __host__ void initDiffuseParticlesKernels0() {}

// Gathers the per-buffer diffuse particle arrays into the particle system's contiguous
// unsorted position/velocity arrays.
//
// Indexing used by this kernel:
//   blockIdx.z                            -> entry in activeParticleSystems
//   blockIdx.y                            -> diffuse buffer index
//   threadIdx.x + blockDim.x * blockIdx.x -> particle index inside that buffer
//
// Side effect: the thread handling particle 0 of a buffer stores the buffer's start offset in
// buffer.mStartIndex (nothing is written for a buffer that holds zero diffuse particles).
extern "C" __global__ void ps_updateUnsortedDiffuseArrayLaunch(
	const PxgParticleSystem * PX_RESTRICT particleSystems,
	const PxU32 * PX_RESTRICT activeParticleSystems)
{
	const PxgParticleSystem& particleSystem = particleSystems[activeParticleSystems[blockIdx.z]];

	// blockIdx.y is uniform across the block, so the whole block leaves together here.
	const PxU32 bufferIndex = blockIdx.y;
	if (bufferIndex >= particleSystem.mNumDiffuseBuffers)
		return;

	// Every lane accumulates the particle counts of a strided subset of the buffers that
	// precede this one.
	const PxU32 lane = threadIdx.x & 31;
	PxU32 partialCount = 0;
	for (PxU32 prev = lane; prev < bufferIndex; prev += WARP_SIZE)
		partialCount += particleSystem.mDiffuseSimBuffers[prev].mNumDiffuseParticles[0];

	// Issued with FULL_MASK, so it is executed before any per-thread early exit.
	// NOTE(review): every lane uses the returned value as its write offset, so warpReduction is
	// assumed to hand the complete sum back to all lanes - confirm in reduction.cuh.
	const PxU32 bufferOffset = warpReduction<AddOpPxU32, PxU32>(FULL_MASK, partialCount);

	PxgParticleDiffuseSimBuffer& buffer = particleSystem.mDiffuseSimBuffers[bufferIndex];
	const int numDiffuseParticles = buffer.mNumDiffuseParticles[0];

	const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
	if (globalThreadIndex >= numDiffuseParticles)
		return;

	// Remember where this buffer starts inside the gathered arrays.
	if (globalThreadIndex == 0)
		buffer.mStartIndex = bufferOffset;

	const float4* srcPositions = buffer.mDiffusePositions_LifeTime;
	const float4* srcVelocities = buffer.mDiffuseVelocities;

	float4* PX_RESTRICT dstPositions = reinterpret_cast<float4*>(particleSystem.mDiffusePosition_LifeTime);
	float4* PX_RESTRICT dstVelocities = reinterpret_cast<float4*>(particleSystem.mDiffuseVelocity);

	const PxU32 dstIndex = bufferOffset + globalThreadIndex;
	dstPositions[dstIndex] = srcPositions[globalThreadIndex];
	dstVelocities[dstIndex] = srcVelocities[globalThreadIndex];
}

// Applies the stored one-way contact corrections to the sorted diffuse particle positions.
//
// Indexing used by this kernel:
//   blockIdx.y                            -> entry in activeParticleSystems
//   threadIdx.x + blockIdx.x * blockDim.x -> diffuse particle index
//
// For each particle, up to PxgParticleContactInfo::MaxStaticContactsPerParticle contacts are
// read; contact c of particle pi lives at contacts[pi + c * numParticles]. Each contact moves the
// position by -normal * w, where normal/w are the xyz/w parts of mNormal_PenW. The w component
// of the position is carried over unchanged. `count` is not used.
extern "C" __global__ void ps_diffuseParticleOneWayCollision(
	PxgParticleSystem * PX_RESTRICT	particleSystems,
	const PxU32* PX_RESTRICT			activeParticleSystems,
	const PxU32 count
)
{
	// Block-local copy of the particle system descriptor.
	__shared__ __align__(16) PxU8 particleSystemMemory[sizeof(PxgParticleSystem)];
	PxgParticleSystem& shParticleSystem = *(reinterpret_cast<PxgParticleSystem*>(particleSystemMemory));

	const PxgParticleSystem& particleSystem = particleSystems[activeParticleSystems[blockIdx.y]];

	const uint2* src = reinterpret_cast<const uint2*>(&particleSystem);
	uint2* dst = reinterpret_cast<uint2*>(&shParticleSystem);
	blockCopy<uint2>(dst, src, sizeof(PxgParticleSystem));
	__syncthreads();

	const PxU32 numParticles = *shParticleSystem.mNumDiffuseParticles;
	const PxU32 pi = threadIdx.x + blockIdx.x * blockDim.x;
	if (pi >= numParticles)
		return;

	const PxU32* PX_RESTRICT contactCounts = shParticleSystem.mDiffuseOneWayContactCount;
	const PxU32 contactCount = PxMin(PxgParticleContactInfo::MaxStaticContactsPerParticle, contactCounts[pi]);
	if (contactCount == 0)
		return;

	const PxgParticleContactInfo* PX_RESTRICT contacts = shParticleSystem.mDiffuseOneWayContactInfos;
	float4* PX_RESTRICT newPos = reinterpret_cast<float4*>(shParticleSystem.mDiffuseSortedPos_LifeTime);

	const float4 current = newPos[pi];
	PxVec3 corrected = PxLoad3(current);

	// Contacts of one particle are numParticles entries apart.
	PxU32 contactIndex = pi;
	for (PxU32 c = 0; c < contactCount; ++c)
	{
		const PxgParticleContactInfo& contact = contacts[contactIndex];

		const PxVec3 surfaceNormal = PxLoad3(contact.mNormal_PenW);
		const PxVec3 deltaP = -surfaceNormal * contact.mNormal_PenW.w;
		corrected += deltaP;

		contactIndex += numParticles;
	}

	newPos[pi] = make_float4(corrected.x, corrected.y, corrected.z, current.w);
}


// Computes, for every sorted diffuse particle, a weighted average of the velocities of nearby
// particles from the sorted particle arrays, together with the number of neighbours used.
//
// Indexing used by this kernel:
//   blockIdx.y                            -> entry in activeParticleSystems
//   threadIdx.x + blockIdx.x * blockDim.x -> diffuse particle index
//
// Output: mDiffuseSortedOriginPos_LifeTime[pi] is overwritten with (avgVel.xyz, neighbourCount).
// Only the particle's own grid cell is searched unless PxParticleFlag::eFULL_DIFFUSE_ADVECTION is
// set, in which case the 3x3x3 block of cells around it is searched. The search stops once 16
// neighbours have been accumulated. `gravity` and `dt` are not used.
extern "C" __global__ void ps_diffuseParticleUpdatePBF(
	PxgParticleSystem* PX_RESTRICT				particleSystems,
	const PxU32*								activeParticleSystems,
	const PxVec3								gravity,
	const PxReal								dt)
{
	// Block-local copy of the particle system descriptor.
	__shared__ __align__(16) PxU8 particleSystemMemory[sizeof(PxgParticleSystem)];
	PxgParticleSystem& shParticleSystem = *(reinterpret_cast<PxgParticleSystem*>(particleSystemMemory));

	// Per-axis cell offsets, ordered so that index 0 is the particle's own cell.
	__shared__ int cellOffsets[3];
	if (threadIdx.x == 0)
	{
		cellOffsets[0] = 0;
		cellOffsets[1] = -1;
		cellOffsets[2] = 1;
	}

	const PxgParticleSystem& particleSystem = particleSystems[activeParticleSystems[blockIdx.y]];

	const uint2* src = reinterpret_cast<const uint2*>(&particleSystem);
	uint2* dst = reinterpret_cast<uint2*>(&shParticleSystem);
	blockCopy<uint2>(dst, src, sizeof(PxgParticleSystem));

	// Publishes both the descriptor copy and cellOffsets to the whole block.
	__syncthreads();

	const int numDiffuse = *shParticleSystem.mNumDiffuseParticles;
	const PxU32 pi = threadIdx.x + blockIdx.x * blockDim.x;
	if (pi >= numDiffuse)
		return;

	const PxU32* const PX_RESTRICT cellStarts = shParticleSystem.mCellStart;
	const PxU32* const PX_RESTRICT cellEnds = shParticleSystem.mCellEnd;

	// Sorted particle data that is sampled.
	const float4* const PX_RESTRICT sortedPose = reinterpret_cast<float4*>(shParticleSystem.mSortedPositions_InvMass);
	const float4* const PX_RESTRICT sortedVel = reinterpret_cast<float4*>(shParticleSystem.mSortedVelocities);

	float4* PX_RESTRICT diffusePositions = reinterpret_cast<float4*>(shParticleSystem.mDiffuseSortedPos_LifeTime);

	// This buffer is overloaded to receive the averaged velocity.
	float4* PX_RESTRICT averagedVelocities = reinterpret_cast<float4*>(shParticleSystem.mDiffuseSortedOriginPos_LifeTime);

	const float4 xi4 = diffusePositions[pi];
	const PxVec3 pos = PxLoad3(xi4);

	const PxReal cellWidth = shParticleSystem.mCommonData.mGridCellWidth;
	const PxReal contactDistanceSq = shParticleSystem.mCommonData.mParticleContactDistanceSq;
	const PxReal invContactDistance = shParticleSystem.mCommonData.mParticleContactDistanceInv;
	const int3 gridPos = calcGridPos(xi4, cellWidth);
	const uint3 gridSize = make_uint3(shParticleSystem.mCommonData.mGridSizeX, shParticleSystem.mCommonData.mGridSizeY, shParticleSystem.mCommonData.mGridSizeZ);

	PxReal weightSum = 0.0f;
	PxVec3 velocitySum(0.f);
	PxU32 numNeighbors = 0;

	const PxU32 maxNeighbors = 16;

	// Number of offsets visited per axis: 1 = own cell only, 3 = own cell plus both neighbours.
	const PxU32 span = (shParticleSystem.mData.mFlags & PxParticleFlag::eFULL_DIFFUSE_ADVECTION) ? 3 : 1;

	// Every loop level also checks the neighbour budget, so the whole nest unwinds as soon as
	// maxNeighbors contributions have been gathered.
	for (int z = 0; z < span && numNeighbors < maxNeighbors; ++z)
	{
		for (int y = 0; y < span && numNeighbors < maxNeighbors; ++y)
		{
			for (int x = 0; x < span && numNeighbors < maxNeighbors; ++x)
			{
				const int3 neighbourPos = make_int3(gridPos.x + cellOffsets[x], gridPos.y + cellOffsets[y], gridPos.z + cellOffsets[z]);
				const PxU32 gridHash = calcGridHash(neighbourPos, gridSize);
				const PxU32 startIndex = cellStarts[gridHash];

				if (startIndex == EMPTY_CELL)
					continue;

				const PxU32 endIndex = cellEnds[gridHash];
				for (PxU32 q = startIndex; q < endIndex && numNeighbors < maxNeighbors; ++q)
				{
					const PxVec3 xj = PxLoad3(sortedPose[q]);
					const PxVec3 xij = pos - xj;
					const PxReal dSq = xij.dot(xij);

					if (dSq < contactDistanceSq)
					{
						const PxVec3 vj = PxLoad3(sortedVel[q]);
						const PxReal w = WDiffuse(sqrtf(dSq), invContactDistance);

						weightSum += w;
						velocitySum += vj * w;
						++numNeighbors;
					}
				}
			}
		}
	}

	// Without any weight the average stays at zero.
	PxVec3 velAvg(PxZero);
	if (weightSum > 0)
		velAvg = velocitySum / weightSum;

	averagedVelocities[pi] = make_float4(velAvg.x, velAvg.y, velAvg.z, PxReal(numNeighbors));
}

// Integrates the diffuse particles of one buffer and writes the survivors, densely packed, back
// into that buffer's position/velocity arrays.
//
// Indexing used by this kernel:
//   blockIdx.z                            -> entry in activeParticleSystems
//   blockIdx.y                            -> diffuse buffer index
//   threadIdx.x + blockIdx.x * blockDim.x -> particle index inside the buffer
//
// Inputs per particle: the sorted position (xyz + remaining lifetime in w), the unsorted origin
// position, and the averaged neighbour velocity whose w holds the neighbour count (the value
// ps_diffuseParticleUpdatePBF stores in mDiffuseSortedOriginPos_LifeTime).
// A particle survives when its lifetime minus dt is still positive. The output slot is reserved
// by one atomicAdd per warp on mNumDiffuseParticles[1] plus the lane's rank among survivors.
extern "C" __global__ void ps_diffuseParticleCompact(
	PxgParticleSystem* PX_RESTRICT				particleSystems,
	const PxU32*								activeParticleSystems,
	const PxVec3								gravity,
	const PxReal								dt)
{
	// Block-local copy of the particle system descriptor.
	__shared__ __align__(16) PxU8 particleSystemMemory[sizeof(PxgParticleSystem)];
	PxgParticleSystem& shParticleSystem = *(reinterpret_cast<PxgParticleSystem*>(particleSystemMemory));

	const PxgParticleSystem& particleSystem = particleSystems[activeParticleSystems[blockIdx.z]];

	const uint2* src = reinterpret_cast<const uint2*>(&particleSystem);
	uint2* dst = reinterpret_cast<uint2*>(&shParticleSystem);
	blockCopy<uint2>(dst, src, sizeof(PxgParticleSystem));
	__syncthreads();

	// blockIdx.y is uniform across the block, so the whole block leaves together here.
	const PxU32 bufferIndex = blockIdx.y;
	if (bufferIndex >= shParticleSystem.mNumDiffuseBuffers)
		return;

	PxgParticleDiffuseSimBuffer& buffer = shParticleSystem.mDiffuseSimBuffers[bufferIndex];

	// [0] is the number of particles to process, [1] is the running count of written survivors.
	int* numDiffuseParticles = buffer.mNumDiffuseParticles;
	const int numDiffuse = numDiffuseParticles[0];

	const PxU32 pi = threadIdx.x + blockIdx.x * blockDim.x;
	if (pi >= numDiffuse)
		return;

	const PxU32 lane = threadIdx.x & 31;

	float4* PX_RESTRICT outPositions = buffer.mDiffusePositions_LifeTime;
	float4* PX_RESTRICT outVelocities = buffer.mDiffuseVelocities;

	float4* PX_RESTRICT velAvgs = reinterpret_cast<float4*>(shParticleSystem.mDiffuseSortedOriginPos_LifeTime);
	float4* PX_RESTRICT sortedPositions = reinterpret_cast<float4*>(shParticleSystem.mDiffuseSortedPos_LifeTime);
	float4* PX_RESTRICT originPositions = reinterpret_cast<float4*>(shParticleSystem.mDiffuseOriginPos_LifeTime);

	const PxU32* unsortedToSorted = shParticleSystem.mDiffuseUnsortedToSortedMapping;

	// Unsorted index of this particle and the sorted slot it maps to.
	const PxU32 unsortedInd = pi + buffer.mStartIndex;
	const PxU32 sortedInd = unsortedToSorted[unsortedInd];

	const float4 xi4 = sortedPositions[sortedInd];
	const float4 origin4 = originPositions[unsortedInd];
	const float4 xiva4 = velAvgs[sortedInd];

	const PxVec3 pos = PxLoad3(xi4);
	const PxVec3 oldPos = PxLoad3(origin4);
	const PxVec3 velAvg = PxLoad3(xiva4);

	// Velocity implied by the position change over this step.
	const PxVec3 vel = (pos - oldPos)*(1.f / dt);

	// Pick the integration rule from the neighbour count stored in xiva4.w.
	PxVec3 newVel;
	if (xiva4.w < 4.f)
	{
		// spray (ballistic): only air drag acts
		newVel = vel * (1.0f - buffer.mParams.airDrag * dt);
	}
	else if (xiva4.w < 8.f)
	{
		// foam: follows the averaged neighbour velocity
		newVel = velAvg;
	}
	else
	{
		// bubble: buoyancy against gravity plus drag towards the averaged neighbour velocity
		newVel = vel - (1.f + buffer.mParams.buoyancy) * gravity * dt + buffer.mParams.bubbleDrag * (velAvg - vel);
	}

	// Limit the speed to the particle system's maximum velocity.
	const float maxVel = shParticleSystem.mData.mMaxVelocity;
	if (newVel.magnitudeSquared() > 0)
	{
		newVel = PxMin(newVel.magnitude(), maxVel) * newVel.getNormalized();
	}

	const PxVec3 newPos = pos + (newVel - vel) * dt;

	__syncwarp();

	// Age the particle; lifetime is clamped at zero.
	const PxReal lifeTime = fmaxf(xi4.w - dt, 0.0f);
	const bool alive = lifeTime > 0.f;

	// Bit mask of the lanes whose particle survives.
	const PxU32 aliveMask = __ballot_sync(FULL_MASK, alive);

	// Lane 0 reserves a contiguous output range for the warp's survivors and shares its start.
	PxU32 warpBase = 0;
	if (lane == 0)
		warpBase = atomicAdd(&numDiffuseParticles[1], __popc(aliveMask));

	warpBase = __shfl_sync(FULL_MASK, warpBase, 0);

	if (alive)
	{
		const PxU32 newIndex = warpBase + warpScanExclusive(aliveMask, lane);

		outPositions[newIndex] = make_float4(newPos.x, newPos.y, newPos.z, lifeTime);
		outVelocities[newIndex] = make_float4(newVel.x, newVel.y, newVel.z, 0.0f);
	}
}

// Spawns new diffuse particles from fluid particles whose randomised "intensity" exceeds the
// diffuse buffer's threshold.
//
// Indexing used by this kernel:
//   blockIdx.z                            -> entry in activeParticleSystems
//   blockIdx.y                            -> particle buffer index
//   threadIdx.x + blockIdx.x * blockDim.x -> particle index inside the buffer
//
// Parameters:
//   randomTable / randomTableSize : table of values indexed with (sortedInd + k) % randomTableSize
//   dt                            : time step used to offset the spawn position along the velocity
//
// A slot is reserved with atomicAdd on mNumDiffuseParticles[1]; the particle is only written when
// the reserved slot is below mMaxNumParticles (the counter itself may grow past that limit).
extern "C" __global__ void ps_diffuseParticleCreate(
	PxgParticleSystem * PX_RESTRICT			particleSystems,
	const PxU32* const PX_RESTRICT				activeParticleSystems,
	const PxReal* const PX_RESTRICT			randomTable,
	const PxU32									randomTableSize,
	const PxReal								dt)
{
	// Block-local copy of the particle system descriptor.
	__shared__ __align__(16) PxU8 particleSystemMemory[sizeof(PxgParticleSystem)];
	PxgParticleSystem& shParticleSystem = *(reinterpret_cast<PxgParticleSystem*>(particleSystemMemory));

	const PxU32 id = activeParticleSystems[blockIdx.z];
	const PxgParticleSystem& particleSystem = particleSystems[id];

	const uint2* sParticleSystem = reinterpret_cast<const uint2*>(&particleSystem);
	uint2* dParticleSystem = reinterpret_cast<uint2*>(&shParticleSystem);

	blockCopy<uint2>(dParticleSystem, sParticleSystem, sizeof(PxgParticleSystem));
	__syncthreads();

	// blockIdx.y is uniform across the block, so the whole block leaves together here.
	const PxU32 bufferIndex = blockIdx.y;
	if (bufferIndex >= shParticleSystem.mCommonData.mNumParticleBuffers)
		return;

	const PxgParticleSimBuffer& buffer = shParticleSystem.mParticleSimBuffers[bufferIndex];

	// 0xffffffff marks a particle buffer without an associated diffuse buffer.
	const PxU32 diffuseParticleBufferIndex = buffer.mDiffuseParticleBufferIndex;
	if (diffuseParticleBufferIndex == 0xffffffff)
		return;

	const PxgParticleSystemData& data = shParticleSystem.mData;

	const PxU32 pi = threadIdx.x + blockIdx.x * blockDim.x;
	const PxU32 numParticles = buffer.mNumActiveParticles;
	if (pi >= numParticles)
		return;

	PxgParticleDiffuseSimBuffer& diffuseBuffer = shParticleSystem.mDiffuseSimBuffers[diffuseParticleBufferIndex];
	if (diffuseBuffer.mMaxNumParticles == 0)
		return;

	// get arrays
	const float4* const PX_RESTRICT sortedPose = reinterpret_cast<float4*>(shParticleSystem.mSortedPositions_InvMass);
	const float4* const PX_RESTRICT sortedVel = reinterpret_cast<float4*>(shParticleSystem.mSortedVelocities);
	const PxU32* PX_RESTRICT phases = shParticleSystem.mSortedPhaseArray;
	const float2* const PX_RESTRICT potentials = reinterpret_cast<float2*>(shParticleSystem.mDiffusePotentials);

	float4* PX_RESTRICT diffusePositionsNew = diffuseBuffer.mDiffusePositions_LifeTime;
	float4* PX_RESTRICT diffuseVelocitiesNew = diffuseBuffer.mDiffuseVelocities;

	int* numDiffuseParticles = diffuseBuffer.mNumDiffuseParticles;

	const PxU32* reverseLookup = shParticleSystem.mUnsortedToSortedMapping;

	// Read through the shared-memory copy, like every other descriptor access in this kernel
	// (the copy is bitwise identical to the global struct).
	const PxU32 offset = shParticleSystem.mParticleBufferRunsum[bufferIndex];

	const PxU32 sortedInd = reverseLookup[pi + offset];

	// get elements
	const float2 ptnts = potentials[sortedInd];
	const PxReal threshold = diffuseBuffer.mParams.threshold;
	const PxU32 phase = phases[sortedInd];

	// Only fluid particles emit diffuse particles.
	if (!PxGetFluid(phase))
		return;

	const float4 vi4 = sortedVel[sortedInd];

	// Kinetic energy + pressure - divergence, each scaled by its buffer weight.
	const PxReal kineticEnergy = dot3(vi4, vi4) * diffuseBuffer.mParams.kineticEnergyWeight;
	const PxReal divergence = diffuseBuffer.mParams.divergenceWeight * ptnts.x;
	const PxReal pressure = diffuseBuffer.mParams.pressureWeight * ptnts.y;
	const PxReal intensity = pressure - divergence + kineticEnergy;

	const PxReal r0 = randomTable[(sortedInd + 0) % randomTableSize];

	if (r0 * intensity > threshold)
	{
		const float4 xi4 = sortedPose[sortedInd];

		// try and allocate a new diffuse particle
		const int newIndex = atomicAdd(&numDiffuseParticles[1], 1);

		if (newIndex < diffuseBuffer.mMaxNumParticles)
		{
			const PxVec3 xi = PxLoad3(xi4);
			const PxVec3 vi = PxLoad3(vi4);

			const PxReal r1 = randomTable[(sortedInd + 1) % randomTableSize];
			const PxReal r2 = randomTable[(sortedInd + 2) % randomTableSize];
			const PxReal r3 = randomTable[(sortedInd + 3) % randomTableSize];

			// Lifetime lies between lifeMin and the buffer's configured lifetime.
			const PxReal lifeMin = 1.0f;
			const PxReal lifeMax = diffuseBuffer.mParams.lifetime;
			const PxReal lifeScale = fminf(intensity / threshold, 1.f) * r1;
			const PxReal lifetime = lifeMin + lifeScale * (lifeMax - lifeMin);

			// Spawn position: stepped back along the velocity and jittered by a fraction of the rest offset.
			const PxVec3 q = xi - r2 * vi * dt + PxVec3(r1, r2, r3) * data.mRestOffset * 0.25f;

			diffusePositionsNew[newIndex] = make_float4(q.x, q.y, q.z, lifetime);
			diffuseVelocitiesNew[newIndex] = make_float4(vi.x, vi.y, vi.z, 0.0f);
		}
	}
}


// Publishes the diffuse particle count of one buffer and resets its running counter.
//
// Indexing used by this kernel:
//   blockIdx.z -> entry in activeParticleSystems
//   blockIdx.y -> diffuse buffer index
//
// mNumDiffuseParticles[1] (the running count) is clamped to mMaxNumParticles, stored in
// mNumDiffuseParticles[0] and in *mNumActiveDiffuseParticles, and then reset to 0.
// `count` is not used.
extern "C" __global__ void ps_diffuseParticleCopy(
	PxgParticleSystem * PX_RESTRICT	particleSystems,
	const PxU32* const PX_RESTRICT	activeParticleSystems,
	const PxU32 count)
{
	// The update below is a read-then-reset of a counter that carries no per-thread data, so it
	// must run exactly once per (blockIdx.y, blockIdx.z). Without this guard a second thread could
	// read the already reset counter and publish a count of 0.
	if ((threadIdx.x | threadIdx.y | threadIdx.z | blockIdx.x) != 0)
		return;

	const PxU32 id = activeParticleSystems[blockIdx.z];
	PxgParticleSystem& particleSystem = particleSystems[id];

	const PxU32 numDiffuseBuffers = particleSystem.mNumDiffuseBuffers;

	const PxU32 bufferIndex = blockIdx.y;
	if (bufferIndex < numDiffuseBuffers)
	{
		PxgParticleDiffuseSimBuffer& diffuseBuffer = particleSystem.mDiffuseSimBuffers[bufferIndex];

		int* numDiffuseParticles = diffuseBuffer.mNumDiffuseParticles;

		// The running counter may have been incremented past the buffer capacity; clamp it.
		const PxU32 numDiffuse = PxMin(PxI32(diffuseBuffer.mMaxNumParticles), numDiffuseParticles[1]);
		*diffuseBuffer.mNumActiveDiffuseParticles = numDiffuse; //pinned memory
		numDiffuseParticles[0] = numDiffuse;
		numDiffuseParticles[1] = 0;
	}
}


// Sums the diffuse particle counts (mNumDiffuseParticles[0]) of all diffuse buffers of a particle
// system and stores the total in *mNumDiffuseParticles.
//
// Indexing used by this kernel:
//   blockIdx.x  -> entry in activeParticleSystems
//   threadIdx.x -> first buffer handled by the thread; it then advances in steps of WARP_SIZE
//
// NOTE(review): the WARP_SIZE stride and the FULL_MASK reduction suggest a launch with a single
// warp per block - confirm against the host-side launch. `count` is not used.
extern "C" __global__ void ps_diffuseParticleSum(
	PxgParticleSystem * PX_RESTRICT	particleSystems,
	const PxU32* const PX_RESTRICT	activeParticleSystems,
	const PxU32 count)
{
	PxgParticleSystem& particleSystem = particleSystems[activeParticleSystems[blockIdx.x]];

	const PxU32 numDiffuseBuffers = particleSystem.mNumDiffuseBuffers;

	// Per-thread partial sum over a strided subset of the buffers.
	PxU32 partialTotal = 0;
	for (PxU32 b = threadIdx.x; b < numDiffuseBuffers; b += WARP_SIZE)
		partialTotal += particleSystem.mDiffuseSimBuffers[b].mNumDiffuseParticles[0];

	const PxU32 totalDiffuse = warpReduction<AddOpPxU32, PxU32>(FULL_MASK, partialTotal);

	// Only thread 0 publishes the result.
	if (threadIdx.x != 0)
		return;

	*particleSystem.mNumDiffuseParticles = totalDiffuse;
}
feat(physics): wire physx sdk into build 2026-04-15 12:22:15 +08:00			`// Redistribution and use in source and binary forms, with or without`
			`// modification, are permitted provided that the following conditions`
			`// are met:`
			`// * Redistributions of source code must retain the above copyright`
			`// notice, this list of conditions and the following disclaimer.`
			`// * Redistributions in binary form must reproduce the above copyright`
			`// notice, this list of conditions and the following disclaimer in the`
			`// documentation and/or other materials provided with the distribution.`
			`// * Neither the name of NVIDIA CORPORATION nor the names of its`
			`// contributors may be used to endorse or promote products derived`
			`// from this software without specific prior written permission.`
			`//`
			`// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY`
			`// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR`
			`// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR`
			`// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,`
			`// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,`
			`// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR`
			`// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY`
			`// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT`
			`// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE`
			`// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`//`
			`// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.`
			`// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.`
			`// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.`


			`#include "vector_types.h"`
			`#include "foundation/PxVec3.h"`
			`#include "foundation/PxVec4.h"`
			`#include "foundation/PxBounds3.h"`
			`#include "PxgParticleSystemCore.h"`
			`#include "PxgParticleSystem.h"`
			`#include "PxgParticleSystemCoreKernelIndices.h"`
			`#include "PxgBodySim.h"`
			`#include "PxgCommonDefines.h"`
			`#include "reduction.cuh"`
			`#include "shuffle.cuh"`
			`#include "stdio.h"`
			`#include "PxgSolverBody.h"`
			`#include "PxgSolverCoreDesc.h"`
			`#include "PxParticleSystem.h"`
			`#include "assert.h"`
			`#include "copy.cuh"`
			`#include "PxgSimulationCoreDesc.h"`
			`#include "gridCal.cuh"`
			`#include "particleSystem.cuh"`
			`#include "atomic.cuh"`
			`#include "utils.cuh"`

			`using namespace physx;`

			`// simpler kernel for diffuse weighting`
			`__device__ inline PxReal WDiffuse(const PxReal h, const PxReal invR)`
			`{`
			`return (1.0f - h * invR);`
			`}`

			`extern "C" __host__ void initDiffuseParticlesKernels0() {}`

			`extern "C" __global__ void ps_updateUnsortedDiffuseArrayLaunch(`
			`const PxgParticleSystem * PX_RESTRICT particleSystems,`
			`const PxU32 * PX_RESTRICT activeParticleSystems)`
			`{`
			`const PxU32 particleId = activeParticleSystems[blockIdx.z];`

			`const PxgParticleSystem& particleSystem = particleSystems[particleId];`

			`const PxU32 bufferIndex = blockIdx.y;`

			`if (bufferIndex < particleSystem.mNumDiffuseBuffers)`
			`{`
			`const PxU32 threadIndexInWarp = threadIdx.x & 31;`

			`float4* PX_RESTRICT unsortedPositions = reinterpret_cast<float4*>(particleSystem.mDiffusePosition_LifeTime);`
			`float4* PX_RESTRICT unsortedVels = reinterpret_cast<float4*>(particleSystem.mDiffuseVelocity);`

			`PxU32 localSum = 0;`

			`for (PxU32 i = threadIndexInWarp; i < bufferIndex; i += WARP_SIZE)`
			`{`
			`localSum += particleSystem.mDiffuseSimBuffers[i].mNumDiffuseParticles[0];`
			`}`

			`PxU32 bufferOffset = warpReduction<AddOpPxU32, PxU32>(FULL_MASK, localSum);`

			`PxgParticleDiffuseSimBuffer& buffer = particleSystem.mDiffuseSimBuffers[bufferIndex];`

			`int numDiffuseParticles = buffer.mNumDiffuseParticles[0];`

			`const float4* particles = buffer.mDiffusePositions_LifeTime;`
			`const float4* vels = buffer.mDiffuseVelocities;`

			`const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;`
			`if (globalThreadIndex >= numDiffuseParticles)`
			`return;`

			`if (globalThreadIndex == 0)`
			`{`
			`buffer.mStartIndex = bufferOffset;`
			`}`

			`const PxU32 ind = bufferOffset + globalThreadIndex;`
			`unsortedPositions[ind] = particles[globalThreadIndex];`
			`unsortedVels[ind] = vels[globalThreadIndex];`
			`}`
			`}`

			`extern "C" __global__ void ps_diffuseParticleOneWayCollision(`
			`PxgParticleSystem * PX_RESTRICT particleSystems,`
			`const PxU32* PX_RESTRICT activeParticleSystems,`
			`const PxU32 count`
			`)`
			`{`
			`__shared__ __align__(16) PxU8 particleSystemMemory[sizeof(PxgParticleSystem)];`
			`PxgParticleSystem& shParticleSystem = (reinterpret_cast<PxgParticleSystem>(particleSystemMemory));`

			`const PxU32 id = activeParticleSystems[blockIdx.y];`

			`const PxgParticleSystem& particleSystem = particleSystems[id];`

			`const uint2* sParticleSystem = reinterpret_cast<const uint2*>(&particleSystem);`
			`uint2* dParticleSystem = reinterpret_cast<uint2*>(&shParticleSystem);`

			`blockCopy<uint2>(dParticleSystem, sParticleSystem, sizeof(PxgParticleSystem));`
			`__syncthreads();`

			`const PxU32 pi = threadIdx.x + blockIdx.x * blockDim.x;`

			`const PxU32 numParticles = *shParticleSystem.mNumDiffuseParticles;`


			`if (pi >= numParticles)`
			`return;`

			`float4* PX_RESTRICT newPos = reinterpret_cast<float4*>(shParticleSystem.mDiffuseSortedPos_LifeTime);`

			`const PxgParticleContactInfo* PX_RESTRICT contacts = shParticleSystem.mDiffuseOneWayContactInfos;`
			`const PxU32* PX_RESTRICT contactCounts = shParticleSystem.mDiffuseOneWayContactCount;`

			`const PxU32 contactCount = PxMin(PxgParticleContactInfo::MaxStaticContactsPerParticle, contactCounts[pi]);`


			`if (contactCount)`
			`{`
			`PxVec3 posCorr = PxLoad3(newPos[pi]);`
			`for (PxU32 c = 0, offset = pi; c < contactCount; ++c, offset += numParticles)`
			`{`
			`const PxgParticleContactInfo& contact = contacts[offset];`

			`const PxVec3 surfaceNormal = PxLoad3(contact.mNormal_PenW);`

			`const PxVec3 deltaP = -surfaceNormal * contact.mNormal_PenW.w;`
			`posCorr += deltaP;`
			`}`

			`newPos[pi] = make_float4(posCorr.x, posCorr.y, posCorr.z, newPos[pi].w);`
			`}`
			`}`


			`extern "C" __global__ void ps_diffuseParticleUpdatePBF(`
			`PxgParticleSystem* PX_RESTRICT particleSystems,`
			`const PxU32* activeParticleSystems,`
			`const PxVec3 gravity,`
			`const PxReal dt)`
			`{`
			`__shared__ __align__(16) PxU8 particleSystemMemory[sizeof(PxgParticleSystem)];`
			`PxgParticleSystem& shParticleSystem = (reinterpret_cast<PxgParticleSystem>(particleSystemMemory));`

			`__shared__ int offset[3];`

			`if (threadIdx.x == 0)`
			`{`
			`offset[0] = 0; offset[1] = -1; offset[2] = 1;`
			`}`

			`const PxU32 id = activeParticleSystems[blockIdx.y];`

			`const PxgParticleSystem& particleSystem = particleSystems[id];`

			`const uint2* sParticleSystem = reinterpret_cast<const uint2*>(&particleSystem);`
			`uint2* dParticleSystem = reinterpret_cast<uint2*>(&shParticleSystem);`

			`blockCopy<uint2>(dParticleSystem, sParticleSystem, sizeof(PxgParticleSystem));`
			`__syncthreads();`

			`{`
			`int numDiffuse = *shParticleSystem.mNumDiffuseParticles;`

			`const PxU32 pi = threadIdx.x + blockIdx.x * blockDim.x;`

			`if (pi >= numDiffuse)`
			`return;`

			`const PxU32* const PX_RESTRICT cellStarts = shParticleSystem.mCellStart;`
			`const PxU32* const PX_RESTRICT cellEnds = shParticleSystem.mCellEnd;`

			`// per-particle data`
			`const float4* const PX_RESTRICT sortedPose = reinterpret_cast<float4*>(shParticleSystem.mSortedPositions_InvMass);`
			`const float4* const PX_RESTRICT sortedVel = reinterpret_cast<float4*>(shParticleSystem.mSortedVelocities);`

			`float4* PX_RESTRICT diffusePositions = reinterpret_cast<float4*>(shParticleSystem.mDiffuseSortedPos_LifeTime);`

			`//Overloading this buffer to store the new velocity...`
			`float4* PX_RESTRICT newVel = reinterpret_cast<float4*>(shParticleSystem.mDiffuseSortedOriginPos_LifeTime);`

			`// get elements`
			`const float4 xi4 = diffusePositions[pi];`
			`const PxVec3 pos = PxLoad3(xi4);`

			`// interpolate`
			`PxVec3 velAvg(PxZero);`
			`PxU32 numNeighbors = 0;`

			`const PxReal cellWidth = shParticleSystem.mCommonData.mGridCellWidth;`
			`const PxReal contactDistanceSq = shParticleSystem.mCommonData.mParticleContactDistanceSq;`
			`const PxReal invContactDistance = shParticleSystem.mCommonData.mParticleContactDistanceInv;`
			`const int3 gridPos = calcGridPos(xi4, cellWidth);`
			`const uint3 gridSize = make_uint3(shParticleSystem.mCommonData.mGridSizeX, shParticleSystem.mCommonData.mGridSizeY, shParticleSystem.mCommonData.mGridSizeZ);`

			`// Iterate over cell`
			`PxReal weightSum = 0.0f;`
			`PxVec3 velocitySum(0.f);`

			`const PxU32 maxNeighbors = 16;`

			`const PxU32 end = (shParticleSystem.mData.mFlags & PxParticleFlag::eFULL_DIFFUSE_ADVECTION) ? 3 : 1;`


			`for (int z = 0; z < end; ++z)`
			`for (int y = 0; y < end; ++y)`
			`for (int x = 0; x < end; ++x)`
			`{`
			`const int3 neighbourPos = make_int3(gridPos.x + offset[x], gridPos.y + offset[y], gridPos.z + offset[z]);`
			`const PxU32 gridHash = calcGridHash(neighbourPos, gridSize);`
			`const PxU32 startIndex = cellStarts[gridHash];`

			`if (startIndex != EMPTY_CELL)`
			`{`
			`const PxU32 endIndex = cellEnds[gridHash];`
			`for (PxU32 q = startIndex; q < endIndex; ++q)`
			`{`
			`const PxVec3 xj = PxLoad3(sortedPose[q]);`
			`const PxVec3 xij = pos - xj;`

			`const PxReal dSq = xij.dot(xij);`

			`if (dSq < contactDistanceSq)`
			`{`
			`const PxVec3 vj = PxLoad3(sortedVel[q]);`
			`const PxReal w = WDiffuse(sqrtf(dSq), invContactDistance);`

			`weightSum += w;`
			`velocitySum += vj * w;`

			`++numNeighbors;`
			`if (numNeighbors == maxNeighbors)`
			`goto weight_sum;`
			`}`
			`}`
			`}`
			`}`

			`weight_sum:`
			`if (weightSum > 0)`
			`velAvg = velocitySum / weightSum;`

			`newVel[pi] = make_float4(velAvg.x, velAvg.y, velAvg.z, PxReal(numNeighbors));`
			`}`
			`}`

			`extern "C" __global__ void ps_diffuseParticleCompact(`
			`PxgParticleSystem* PX_RESTRICT particleSystems,`
			`const PxU32* activeParticleSystems,`
			`const PxVec3 gravity,`
			`const PxReal dt)`
			`{`
			`__shared__ __align__(16) PxU8 particleSystemMemory[sizeof(PxgParticleSystem)];`
			`PxgParticleSystem& shParticleSystem = (reinterpret_cast<PxgParticleSystem>(particleSystemMemory));`

			`const PxU32 id = activeParticleSystems[blockIdx.z];`

			`const PxgParticleSystem& particleSystem = particleSystems[id];`

			`const uint2* sParticleSystem = reinterpret_cast<const uint2*>(&particleSystem);`
			`uint2* dParticleSystem = reinterpret_cast<uint2*>(&shParticleSystem);`

			`blockCopy<uint2>(dParticleSystem, sParticleSystem, sizeof(PxgParticleSystem));`
			`__syncthreads();`

			`const PxU32 bufferIndex = blockIdx.y;`
			`if (bufferIndex < shParticleSystem.mNumDiffuseBuffers)`
			`{`

			`PxgParticleDiffuseSimBuffer& buffer = shParticleSystem.mDiffuseSimBuffers[bufferIndex];`

			`int* numDiffuseParticles = buffer.mNumDiffuseParticles;`
			`int numDiffuse = numDiffuseParticles[0];`

			`const PxU32 pi = threadIdx.x + blockIdx.x * blockDim.x;`
			`const PxU32 threadIndexInWarp = threadIdx.x & 31;`

			`if (pi >= numDiffuse)`
			`return;`

			`float4* PX_RESTRICT diffusePositionsNew = buffer.mDiffusePositions_LifeTime;`
			`float4* PX_RESTRICT diffuseVelocitiesNew = buffer.mDiffuseVelocities;`

			`float4* PX_RESTRICT velAvgs = reinterpret_cast<float4*>(shParticleSystem.mDiffuseSortedOriginPos_LifeTime);`

			`float4* PX_RESTRICT diffusePositions = reinterpret_cast<float4*>(shParticleSystem.mDiffuseSortedPos_LifeTime);`
			`float4* PX_RESTRICT diffusePositionsOld = reinterpret_cast<float4*>(shParticleSystem.mDiffuseOriginPos_LifeTime);`

			`const PxU32* reverseLookup = shParticleSystem.mDiffuseUnsortedToSortedMapping;`

			`const PxU32 index = pi + buffer.mStartIndex;`
			`const PxU32 sortedInd = reverseLookup[index];`

			`// get elements`
			`const float4 xi4 = diffusePositions[sortedInd];`
			`const float4 vi4Old = diffusePositionsOld[index];`
			`const float4 xiva4 = velAvgs[sortedInd];`
			`const PxVec3 pos = PxLoad3(xi4);`
			`const PxVec3 oldPos = PxLoad3(vi4Old);`
			`const PxVec3 velAvg = PxLoad3(xiva4);`

			`const PxReal lifeDelta = dt;`

			`PxVec3 vel = (pos - oldPos)*(1.f / dt);`

			`// integrate diffuse particle`
			`PxVec3 newVel;`
			`if (xiva4.w < 4.f)`
			`{`
			`// spray (ballistic)`
			`newVel = vel * (1.0f - buffer.mParams.airDrag * dt);`
			`}`
			`else if (xiva4.w < 8.f)`
			`{`
			`// foam`
			`newVel = velAvg;`
			`}`
			`else`
			`{`
			`// bubble`
			`newVel = vel - (1.f + buffer.mParams.buoyancy) * gravity * dt + buffer.mParams.bubbleDrag * (velAvg - vel);`
			`}`

			`const float maxVel = shParticleSystem.mData.mMaxVelocity;`
			`if (newVel.magnitudeSquared() > 0)`
			`{`
			`newVel = PxMin(newVel.magnitude(), maxVel) * newVel.getNormalized();`
			`}`

			`PxVec3 newPosCorr = pos + (newVel - vel) * dt;`
			`PxVec3 newVelCorr = newVel;`

			`__syncwarp();`

			`const PxReal lifeTime = fmaxf(xi4.w - lifeDelta, 0.0f);`

			`PxU32 res = __ballot_sync(FULL_MASK, lifeTime > 0.f);`

			`PxU32 offset = 0;`

			`if (threadIndexInWarp == 0)`
			`offset = atomicAdd(&numDiffuseParticles[1], __popc(res));`

			`offset = __shfl_sync(FULL_MASK, offset, 0);`



			`if (lifeTime > 0.f)`
			`{`
			`PxU32 newIndex = offset + warpScanExclusive(res, threadIndexInWarp);`

			`diffusePositionsNew[newIndex] = make_float4(newPosCorr.x, newPosCorr.y, newPosCorr.z, lifeTime);`
			`diffuseVelocitiesNew[newIndex] = make_float4(newVelCorr.x, newVelCorr.y, newVelCorr.z, 0.0f);`
			`}`
			`}`
			`}`

			// Spawns new diffuse particles from the fluid particles of one particle buffer.
			//
			// Indexing used by this kernel:
			//   blockIdx.z                              -> entry in activeParticleSystems
			//   blockIdx.y                              -> particle buffer inside that system
			//   threadIdx.x + blockIdx.x * blockDim.x   -> particle inside that buffer
			//
			// The whole block first copies its PxgParticleSystem into shared memory. Each thread
			// then evaluates an intensity from the particle's velocity and its two diffuse
			// potentials; when (random value * intensity) exceeds the buffer's threshold, it
			// reserves a slot through an atomic counter and writes a new diffuse particle.
			//
			// particleSystems        particle systems, indexed by the ids stored in activeParticleSystems
			// activeParticleSystems  ids of the particle systems to process
			// randomTable            lookup table of values used as random numbers
			//                        NOTE(review): presumably in [0, 1] - confirm against the host-side fill
			// randomTableSize        modulus applied to every randomTable index (must be non-zero)
			// dt                     scales the velocity-based offset applied to the spawn position
			extern "C" __global__ void ps_diffuseParticleCreate(
				PxgParticleSystem * PX_RESTRICT particleSystems,
				const PxU32* const PX_RESTRICT activeParticleSystems,
				const PxReal* const PX_RESTRICT randomTable,
				const PxU32 randomTableSize,
				const PxReal dt)
			{
				__shared__ __align__(16) PxU8 particleSystemMemory[sizeof(PxgParticleSystem)];
				// View the raw shared bytes as a PxgParticleSystem: cast the storage to a pointer
				// and dereference it so the reference aliases the shared-memory copy.
				PxgParticleSystem& shParticleSystem = *(reinterpret_cast<PxgParticleSystem*>(particleSystemMemory));

				const PxU32 id = activeParticleSystems[blockIdx.z];
				const PxgParticleSystem& particleSystem = particleSystems[id];

				const uint2* sParticleSystem = reinterpret_cast<const uint2*>(&particleSystem);
				uint2* dParticleSystem = reinterpret_cast<uint2*>(&shParticleSystem);

				// Block-wide copy into shared memory; the barrier makes the copy visible to every
				// thread before any field is read. No barrier follows, so the early returns below
				// cannot leave other threads waiting.
				blockCopy<uint2>(dParticleSystem, sParticleSystem, sizeof(PxgParticleSystem));
				__syncthreads();

				const PxU32 bufferIndex = blockIdx.y;
				if (bufferIndex >= shParticleSystem.mCommonData.mNumParticleBuffers)
					return;

				const PxgParticleSimBuffer& buffer = shParticleSystem.mParticleSimBuffers[bufferIndex];

				// 0xffffffff marks a particle buffer without an associated diffuse buffer
				const PxU32 diffuseParticleBufferIndex = buffer.mDiffuseParticleBufferIndex;
				if (diffuseParticleBufferIndex == 0xffffffff)
					return;

				const PxgParticleSystemData& data = shParticleSystem.mData;

				const PxU32 pi = threadIdx.x + blockIdx.x * blockDim.x;
				const PxU32 numParticles = buffer.mNumActiveParticles;
				if (pi >= numParticles)
					return;

				PxgParticleDiffuseSimBuffer& diffuseBuffer = shParticleSystem.mDiffuseSimBuffers[diffuseParticleBufferIndex];

				// a diffuse buffer with zero capacity can never receive a particle
				if (diffuseBuffer.mMaxNumParticles == 0)
					return;

				// get arrays
				const float4* const PX_RESTRICT sortedPose = reinterpret_cast<float4*>(shParticleSystem.mSortedPositions_InvMass);
				const float4* const PX_RESTRICT sortedVel = reinterpret_cast<float4*>(shParticleSystem.mSortedVelocities);
				const PxU32* PX_RESTRICT phases = shParticleSystem.mSortedPhaseArray;
				const float2* const PX_RESTRICT potentials = reinterpret_cast<float2*>(shParticleSystem.mDiffusePotentials);

				float4* PX_RESTRICT diffusePositionsNew = diffuseBuffer.mDiffusePositions_LifeTime;
				float4* PX_RESTRICT diffuseVelocitiesNew = diffuseBuffer.mDiffuseVelocities;

				int* numDiffuseParticles = diffuseBuffer.mNumDiffuseParticles;

				// translate the buffer-local particle index into its index in the sorted arrays
				const PxU32* reverseLookup = shParticleSystem.mUnsortedToSortedMapping;
				const PxU32 offset = particleSystem.mParticleBufferRunsum[bufferIndex];
				const PxU32 sortedInd = reverseLookup[pi + offset];

				// get elements
				const float2 ptnts = potentials[sortedInd];
				const PxReal threshold = diffuseBuffer.mParams.threshold;
				const PxU32 phase = phases[sortedInd];

				// only particles whose phase passes PxGetFluid spawn diffuse particles
				if (!PxGetFluid(phase))
					return;

				const float4 vi4 = sortedVel[sortedInd];

				//Kinetic energy + pressure
				const PxReal kineticEnergy = dot3(vi4, vi4) * diffuseBuffer.mParams.kineticEnergyWeight;
				const PxReal divergence = diffuseBuffer.mParams.divergenceWeight * ptnts.x;
				const PxReal pressure = diffuseBuffer.mParams.pressureWeight * ptnts.y;
				const PxReal intensity = pressure - divergence + kineticEnergy;

				const PxReal r0 = randomTable[(sortedInd + 0) % randomTableSize];

				if (r0 * intensity <= threshold)
					return;

				const float4 xi4 = sortedPose[sortedInd];

				// try and allocate new diffuse particles
				// The counter is incremented unconditionally, so it can grow past the capacity;
				// slots at or beyond mMaxNumParticles are simply never written.
				const int newIndex = atomicAdd(&numDiffuseParticles[1], 1);

				if (newIndex < diffuseBuffer.mMaxNumParticles)
				{
					const PxVec3 xi = PxLoad3(xi4);
					const PxVec3 vi = PxLoad3(vi4);

					const PxReal r1 = randomTable[(sortedInd + 1) % randomTableSize];
					const PxReal r2 = randomTable[(sortedInd + 2) % randomTableSize];
					const PxReal r3 = randomTable[(sortedInd + 3) % randomTableSize];

					// lifetime = lifeMin + min(intensity / threshold, 1) * r1 * (lifeMax - lifeMin)
					const PxReal lifeMin = 1.0f;
					const PxReal lifeMax = diffuseBuffer.mParams.lifetime;
					const PxReal lifeScale = fminf(intensity / threshold, 1.f) * r1;
					const PxReal lifetime = lifeMin + lifeScale * (lifeMax - lifeMin);

					// spawn position: moved back along the velocity by r2 * dt, plus a random
					// offset scaled by a quarter of the rest offset
					const PxVec3 q = xi - r2 * vi * dt + PxVec3(r1, r2, r3) * data.mRestOffset * 0.25f;

					// the new particle takes the source particle's velocity; lifetime is stored in w
					diffusePositionsNew[newIndex] = make_float4(q.x, q.y, q.z, lifetime);
					diffuseVelocitiesNew[newIndex] = make_float4(vi.x, vi.y, vi.z, 0.0f);
				}
			}


			// Finalizes the particle count of one diffuse buffer.
			//
			// blockIdx.z selects the entry in activeParticleSystems and blockIdx.y selects the
			// diffuse buffer. The count accumulated in mNumDiffuseParticles[1] is clamped to the
			// buffer capacity, published to mNumActiveDiffuseParticles and to
			// mNumDiffuseParticles[0], and the accumulator is reset to zero.
			//
			// particleSystems        particle systems, indexed by the ids stored in activeParticleSystems
			// activeParticleSystems  ids of the particle systems to process
			// count                  not read by this kernel
			//
			// NOTE(review): every thread that reaches the body performs the same read/clamp/reset;
			// presumably this is launched with a single thread per (y, z) block - confirm at the launch site.
			extern "C" __global__ void ps_diffuseParticleCopy(
				PxgParticleSystem * PX_RESTRICT particleSystems,
				const PxU32* const PX_RESTRICT activeParticleSystems,
				const PxU32 count)
			{
				const PxU32 systemId = activeParticleSystems[blockIdx.z];
				PxgParticleSystem& system = particleSystems[systemId];

				const PxU32 diffuseBufferCount = system.mNumDiffuseBuffers;
				const PxU32 slot = blockIdx.y;

				// blocks whose y index does not map to a diffuse buffer have nothing to do
				if (slot >= diffuseBufferCount)
					return;

				PxgParticleDiffuseSimBuffer& simBuffer = system.mDiffuseSimBuffers[slot];
				int* counters = simBuffer.mNumDiffuseParticles;

				// counters[1] may exceed the capacity, so clamp before publishing
				const PxU32 clampedCount = PxMin(PxI32(simBuffer.mMaxNumParticles), counters[1]);

				*simBuffer.mNumActiveDiffuseParticles = clampedCount; //pinned memory
				counters[0] = clampedCount;
				counters[1] = 0;
			}


			// Sums the diffuse particle counts of all diffuse buffers of one particle system.
			//
			// blockIdx.x selects the entry in activeParticleSystems. Each thread accumulates
			// mNumDiffuseParticles[0] of the buffers threadIdx.x, threadIdx.x + WARP_SIZE, ...;
			// the partial sums are combined with a warp reduction and thread 0 stores the total
			// in mNumDiffuseParticles of the particle system.
			//
			// particleSystems        particle systems, indexed by the ids stored in activeParticleSystems
			// activeParticleSystems  ids of the particle systems to process
			// count                  not read by this kernel
			//
			// NOTE(review): the WARP_SIZE stride and the FULL_MASK reduction suggest one full warp
			// per block - confirm against the launch configuration.
			extern "C" __global__ void ps_diffuseParticleSum(
				PxgParticleSystem * PX_RESTRICT particleSystems,
				const PxU32* const PX_RESTRICT activeParticleSystems,
				const PxU32 count)
			{
				const PxU32 systemId = activeParticleSystems[blockIdx.x];
				PxgParticleSystem& system = particleSystems[systemId];

				const PxU32 bufferCount = system.mNumDiffuseBuffers;

				// per-thread partial sum over the buffers assigned to this thread
				PxU32 laneSum = 0;
				PxU32 bufferIdx = threadIdx.x;
				while (bufferIdx < bufferCount)
				{
					PxgParticleDiffuseSimBuffer& simBuffer = system.mDiffuseSimBuffers[bufferIdx];
					laneSum += simBuffer.mNumDiffuseParticles[0];
					bufferIdx += WARP_SIZE;
				}

				// every thread takes part in the reduction, including those that summed nothing
				const PxU32 warpTotal = warpReduction<AddOpPxU32, PxU32>(FULL_MASK, laneSum);

				// only thread 0 publishes the result
				if (threadIdx.x != 0)
					return;

				*system.mNumDiffuseParticles = warpTotal;
			}