// Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved. // Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. // Copyright (c) 2001-2004 NovodeX AG. All rights reserved. #include "vector_types.h" #include "foundation/PxVec3.h" #include "foundation/PxVec4.h" #include "matrixDecomposition.cuh" #include "PxParticleGpu.h" #include "PxgAnisotropyData.h" #include "sparseGridStandalone.cuh" #define ENABLE_KERNEL_LAUNCH_ERROR_CHECK 0 extern "C" __host__ void initAnisotropyKernels0() {} __device__ inline PxVec3 PxLoad3(const float4& v) { float4 tmp = v; return PxVec3(tmp.x, tmp.y, tmp.z); } __device__ inline PxVec4 PxLoad4(const float4& v) { float4 tmp = v; return PxVec4(tmp.x, tmp.y, tmp.z, tmp.w); } __device__ inline PxReal cube(PxReal x) { return x * x * x; } __device__ inline PxReal Wa(PxReal x, PxReal invr) { return 1.f - cube(x*invr); } template __device__ inline V Lerp(const V& start, const V& end, const T& t) { return start + (end - start) * t; } template __device__ inline V Clamp(const V& a, const T s, const T t) { return V(PxMin(t, PxMax(s, a[0])), PxMin(t, PxMax(s, a[1])), PxMin(t, PxMax(s, a[2]))); } // //extern "C" __global__ void smoothPositionsLaunch(PxU32* sortedOrder, PxU32* cellEnds, PxU float4* pose, PxU32* phase, float particleContactDistance) //{ // const float4* const PX_RESTRICT sortedPose = reinterpret_cast(particleSystem.mSortedPositions_InvMass); // const PxU32* const PX_RESTRICT sortedPhases = particleSystem.mSortedPhaseArray; // // const PxU32* const PX_RESTRICT cellStart = particleSystem.mCellStart; // const PxU32* const PX_RESTRICT cellEnd = particleSystem.mCellEnd; // // const PxReal particleContactDistanceSq = particleContactDistance * particleContactDistance; // const PxReal particleContactDistanceInv = 1.0f / particleContactDistance; // // // calculated the sum of weights and weighted avg position for particle neighborhood // PxVec3 xs(0.0f); //sum of positions // PxReal ws = 0.0f; //sum of weights // for (int z = -1; z <= 1; z++) // { // for (int y = -1; y <= 1; y++) // { // for (int x = -1; x <= 1; x++) // { // int3 neighbourPos = make_int3(gridPos.x + x, gridPos.y + y, gridPos.z + z); // PxU32 gridHash = calcGridHash(neighbourPos, gridSize); // PxU32 startIndex = cellStart[gridHash]; // PxU32 endIndex = cellEnd[gridHash]; // // if (startIndex != EMPTY_CELL) // { // PxU32 nextPhase = sortedPhases[startIndex]; // float4 nextPos = fetch(&sortedPose[startIndex]); // // for (PxU32 particleIndex1 = startIndex; particleIndex1 < endIndex /*&& numCollidedParticles < maxNeighborhood*/; particleIndex1++) // { // const PxU32 phase2 = nextPhase; // const float4 pos2 = nextPos; // // if ((particleIndex1 + 1) < endIndex) // { // nextPhase = sortedPhases[particleIndex1 + 1]; // nextPos = fetch(&sortedPose[particleIndex1 + 1]); // } // // if (phase2 & validPhaseMask) // { // const PxVec3 xj = PxLoad3(pos2); // const PxVec3 xij = xi - xj; // // const PxReal dsq = xij.magnitudeSquared(); // // if (0.0f < dsq && dsq < particleContactDistanceSq) // { // const PxReal w = Wa(sqrtf(dsq), particleContactDistanceInv); // ws += w; // xs += xj * w; // } // } // } // } // } // } // } // // if (ws > 0.f) // { // PxReal f = 4.0f*Wa(particleContactDistance*0.5f, particleContactDistanceInv); // PxReal smooth = PxMin(1.0f, ws / f)*smoothing; // xs /= ws; // xi = Lerp(xi, xs, smooth); // } // // //write smoothed positions back in API order // smoothedPositions[origIdx] = make_float4(xi.x, xi.y, xi.z, xi4.w); //} // Smooths particle positions in a fluid by moving them closer to the weighted // average position of their local neighborhood extern "C" __global__ void smoothPositionsLaunch(PxGpuParticleSystem* particleSystems, const PxU32 id, PxSmoothedPositionData* smoothingDataPerParticleSystem) { PxGpuParticleSystem& particleSystem = particleSystems[id]; const PxU32 numParticles = particleSystem.mCommonData.mNumParticles; PxSmoothedPositionData& data = smoothingDataPerParticleSystem[id]; const PxReal smoothing = data.mSmoothing; //pointers to global memory buffers -- inputs const float4* PX_RESTRICT sortedNewPositions = particleSystem.mSortedPositions_InvMass; const PxU32* PX_RESTRICT phases = particleSystem.mSortedPhaseArray; const PxU32* PX_RESTRICT collisionIndex = particleSystem.mCollisionIndex; const PxU32* PX_RESTRICT gridParticleIndices = particleSystem.mSortedToUnsortedMapping; //pointers to smoothed position buffers -- outputs float4* PX_RESTRICT smoothPosOrig = reinterpret_cast(data.mPositions); // particleSystem.mSmoothedOriginPos_InvMass; const PxU32 globalThreadIdx = blockIdx.x * blockDim.x + threadIdx.x; if (globalThreadIdx >= numParticles) return; //skip thread if it's past the end of particle list const PxU32 p = globalThreadIdx; //index of particle in sorted order const PxU32 origIdx = gridParticleIndices[globalThreadIdx]; const PxVec4 xi4 = PxLoad4(sortedNewPositions[p]); //ignore smoothing for non-fluid particles & write original position if (!PxGetFluid(phases[p])) { PxVec4 x = PxLoad4(sortedNewPositions[p]); smoothPosOrig[origIdx] = make_float4(x.x, x.y, x.z, x.w); return; } PxVec3 xi = PxVec3(xi4.x, xi4.y, xi4.z); PxU32 contactCount = particleSystem.mParticleSelfCollisionCount[p]; // calculated the sum of weights and weighted avg position for particle neighborhood PxVec3 xs(0.0f); //sum of positions PxReal ws = 0.0f; //sum of weights for (PxU32 i = 0, offset = p; i < contactCount; ++i, offset += numParticles) { const PxU32 q = collisionIndex[offset]; if (PxGetFluid(phases[q])) //ignore non-fluid particles { const PxVec3 xj = PxLoad3(sortedNewPositions[q]); const PxVec3 xij = xi - xj; const PxReal dsq = xij.magnitudeSquared(); if (0.0f < dsq && dsq < particleSystem.mCommonData.mParticleContactDistanceSq) { const PxReal w = Wa(sqrtf(dsq), particleSystem.mCommonData.mParticleContactDistanceInv); ws += w; xs += xj * w; } } } if (ws > 0.f) { PxReal f = 4.0f*Wa(particleSystem.mCommonData.mParticleContactDistance*0.5f, particleSystem.mCommonData.mParticleContactDistanceInv); PxReal smooth = PxMin(1.0f, ws / f)*smoothing; xs /= ws; xi = Lerp(xi, xs, smooth); } //write smoothed positions back in API order smoothPosOrig[origIdx] = make_float4(xi.x, xi.y, xi.z, xi4.w); } // Calculates Eigen-decomposition of the particle covariance matrix according // to "Reconstructing Surfaces of Particle-Based Fluids Using Anisotropic Kernels" extern "C" __global__ void calculateAnisotropyLaunch(PxGpuParticleSystem* particleSystems, const PxU32 id, PxAnisotropyData* anisotropyDataPerParticleSystem) { PxGpuParticleSystem& particleSystem = particleSystems[id]; const PxU32 numParticles = particleSystem.mCommonData.mNumParticles; //pointers to global memory buffers -- inputs const float4* PX_RESTRICT sortedNewPositions = particleSystem.mSortedPositions_InvMass; const PxU32* PX_RESTRICT phases = particleSystem.mSortedPhaseArray; const PxU32* PX_RESTRICT collisionIndex = particleSystem.mCollisionIndex; const PxU32* PX_RESTRICT gridParticleIndices = particleSystem.mSortedToUnsortedMapping; const PxAnisotropyData& anisotropyData = anisotropyDataPerParticleSystem[id]; float4* PX_RESTRICT q1 = reinterpret_cast(anisotropyData.mAnisotropy_q1); float4* PX_RESTRICT q2 = reinterpret_cast(anisotropyData.mAnisotropy_q2); float4* PX_RESTRICT q3 = reinterpret_cast(anisotropyData.mAnisotropy_q3); const PxU32 globalThreadIdx = blockIdx.x * blockDim.x + threadIdx.x; if (globalThreadIdx >= numParticles) return; //skip thread if it's past the end of particle list const PxU32 p = globalThreadIdx; //index of particle in sorted order const PxU32 origIdx = gridParticleIndices[globalThreadIdx]; //ignore anisotropy for non-fluid particles if (!PxGetFluid(phases[p])) { float r = anisotropyData.mAnisotropyMin * particleSystem.mCommonData.mParticleContactDistance; q1[origIdx] = make_float4(1.0f, 0.0f, 0.0f, r); q2[origIdx] = make_float4(0.0f, 1.0f, 0.0f, r); q3[origIdx] = make_float4(0.0f, 0.0f, 1.0f, r); /*if (globalThreadIdx == 0) printf("PxGetFluid\n");*/ return; } const PxVec3 xi = PxLoad3(sortedNewPositions[p]); PxU32 contactCount = particleSystem.mParticleSelfCollisionCount[p]; // calculated the sum of weights and weighted avg position for particle neighborhood PxVec3 xs(0.f); //sum of positions float ws = 0.f; //sum of weights PxU32 nextQ; PxU32 nextNextQ; PxVec4 xj4Next; PxU32 nextPhase; PxU32 offset = p; if (contactCount > 0) { nextQ = collisionIndex[offset]; xj4Next = PxLoad4(sortedNewPositions[nextQ]); nextPhase = phases[nextQ]; offset += numParticles; } if (contactCount > 1) { nextNextQ = collisionIndex[offset]; offset += numParticles; } for (PxU32 i = 0; i < contactCount; ++i, offset += numParticles) { const PxVec4 xj4 = xj4Next; const PxU32 phase2 = nextPhase; if ((i + 1) < contactCount) { xj4Next = PxLoad4(sortedNewPositions[nextNextQ]); nextPhase = phases[nextNextQ]; nextQ = nextNextQ; if ((i + 2) < contactCount) nextNextQ = collisionIndex[offset]; } if (PxGetFluid(phase2)) //ignore non-fluid particles { const PxVec3 xj(xj4.x, xj4.y, xj4.z); const PxVec3 xij = xi - xj; const PxReal dsq = xij.magnitudeSquared(); if (0.f < dsq && dsq < particleSystem.mCommonData.mParticleContactDistanceSq) { const PxReal w = Wa(sqrtf(dsq), particleSystem.mCommonData.mParticleContactDistanceInv); ws += w; xs += xj * w; } } } // set to radial and exit early in case of isolated particles if (ws == 0.0f) { float r = anisotropyData.mAnisotropyMin * particleSystem.mCommonData.mParticleContactDistance; q1[origIdx] = make_float4(1.0f, 0.0f, 0.0f, r); q2[origIdx] = make_float4(0.0f, 1.0f, 0.0f, r); q3[origIdx] = make_float4(0.0f, 0.0f, 1.0f, r); //if(globalThreadIdx==0) //printf("%i, ws\n", contactCount); return; } //compute inverse sum weight and weight the average position float invWs = 1.f / ws; xs *= invWs; PxMat33 covariance(PxVec3(0.0f), PxVec3(0.0f), PxVec3(0.0f)); offset = p; if (contactCount > 0) { nextQ = collisionIndex[offset]; xj4Next = PxLoad4(sortedNewPositions[nextQ]); nextPhase = phases[nextQ]; offset += numParticles; } if (contactCount > 1) { nextNextQ = collisionIndex[offset]; offset += numParticles; } // use weighted average position to calculate the covariance matrix for (PxU32 i = 0; i < contactCount; ++i, offset += numParticles) { const PxVec4 xj4 = xj4Next; const PxU32 phase2 = nextPhase; if ((i + 1) < contactCount) { xj4Next = PxLoad4(sortedNewPositions[nextNextQ]); nextPhase = phases[nextNextQ]; nextQ = nextNextQ; if ((i + 2) < contactCount) nextNextQ = collisionIndex[offset]; } if (PxGetFluid(phase2)) //ignore non-fluid particles { const PxVec3 xj(xj4.x, xj4.y, xj4.z); const PxVec3 xij = xi - xj; const PxReal dsq = xij.magnitudeSquared(); if (0.f < dsq && dsq < particleSystem.mCommonData.mParticleContactDistanceSq) { const PxReal w = Wa(sqrtf(dsq), particleSystem.mCommonData.mParticleContactDistanceInv); const PxVec3 xjs = xj - xs; covariance += PxMat33::outer(w*xjs, xjs); } } } covariance *= invWs; //calculate the eigen decomposition PxMat33 r; eigenDecomposition(covariance, r); //sanitize the eigen values (diagonal of covariance matrix) covariance[0][0] = max(covariance[0][0], 0.f); covariance[1][1] = max(covariance[1][1], 0.f); covariance[2][2] = max(covariance[2][2], 0.f); PxVec3 lambda(sqrtf(covariance[0][0]), sqrtf(covariance[1][1]), sqrtf(covariance[2][2])); //PxVec3 lambda(covariance[0][0], covariance[1][1], covariance[2][2]); const float ks = anisotropyData.mAnisotropy; const float kmin = anisotropyData.mAnisotropyMin * particleSystem.mCommonData.mParticleContactDistance; const float kmax = anisotropyData.mAnisotropyMax * particleSystem.mCommonData.mParticleContactDistance; lambda *= ks; lambda = Clamp(lambda, kmin, kmax); //write out the anisotropy vectors q1[origIdx] = make_float4(r.column0.x, r.column0.y, r.column0.z, lambda.x); q2[origIdx] = make_float4(r.column1.x, r.column1.y, r.column1.z, lambda.y); q3[origIdx] = make_float4(r.column2.x, r.column2.y, r.column2.z, lambda.z); } extern "C" __global__ __launch_bounds__(256, 1) void anisotropyKernel(const float4* const PX_RESTRICT deviceParticlePos, const PxU32* const PX_RESTRICT sortedToOriginalParticleIndex, const PxU32* const PX_RESTRICT sortedParticleToSubgrid, PxU32 maxNumSubgrids, const PxU32* const PX_RESTRICT subgridNeighbors, const PxU32* const PX_RESTRICT subgridEndIndices, int numParticles, PxU32* phases, PxU32 validPhaseMask, float4* q1, float4* q2, float4* q3, PxReal anisotropy, PxReal anisotropyMin, PxReal anisotropyMax, PxReal particleContactDistance) { PxI32 threadIndex = blockIdx.x * blockDim.x + threadIdx.x; if (threadIndex >= numParticles) return; PxI32 pNr = sortedToOriginalParticleIndex[threadIndex]; PxU32 subgridIndex = sortedParticleToSubgrid[threadIndex]; if (subgridIndex >= maxNumSubgrids || (phases && !(phases[pNr] & validPhaseMask))) { float r = anisotropyMin * particleContactDistance; q1[pNr] = make_float4(1.0f, 0.0f, 0.0f, r); q2[pNr] = make_float4(0.0f, 1.0f, 0.0f, r); q3[pNr] = make_float4(0.0f, 0.0f, 1.0f, r); return; } PxVec3 xi = PxLoad3(deviceParticlePos[pNr]); const PxReal particleContactDistanceSq = particleContactDistance * particleContactDistance; const PxReal particleContactDistanceInv = 1.0f / particleContactDistance; // calculated the sum of weights and weighted avg position for particle neighborhood PxVec3 xs(0.f); //sum of positions float ws = 0.f; //sum of weights for (int z = -1; z <= 1; z++) { for (int y = -1; y <= 1; y++) { for (int x = -1; x <= 1; x++) { PxU32 n = subgridNeighborOffset(subgridNeighbors, subgridIndex, x, y, z); if (n == EMPTY_SUBGRID) continue; int start = n == 0 ? 0 : subgridEndIndices[n - 1]; int end = subgridEndIndices[n]; for (int i = start; i < end; ++i) { int j = sortedToOriginalParticleIndex[i]; if (phases && !(phases[j] & validPhaseMask)) continue; PxVec3 xj = PxLoad3(deviceParticlePos[j]); const PxVec3 xij = xi - xj; const PxReal dsq = xij.magnitudeSquared(); if (0.f < dsq && dsq < particleContactDistanceSq) { const PxReal w = Wa(sqrtf(dsq), particleContactDistanceInv); ws += w; xs += xj * w; } } } } } // set to radial and exit early in case of isolated particles if (ws == 0.0f) { float r = anisotropyMin * particleContactDistance; q1[pNr] = make_float4(1.0f, 0.0f, 0.0f, r); q2[pNr] = make_float4(0.0f, 1.0f, 0.0f, r); q3[pNr] = make_float4(0.0f, 0.0f, 1.0f, r); //if(globalThreadIdx==0) //printf("%i, ws\n", contactCount); return; } //compute inverse sum weight and weight the average position float invWs = 1.f / ws; xs *= invWs; PxMat33 covariance(PxVec3(0.0f), PxVec3(0.0f), PxVec3(0.0f)); for (int z = -1; z <= 1; z++) { for (int y = -1; y <= 1; y++) { for (int x = -1; x <= 1; x++) { PxU32 n = subgridNeighborOffset(subgridNeighbors, subgridIndex, x, y, z); if (n == EMPTY_SUBGRID) continue; int start = n == 0 ? 0 : subgridEndIndices[n - 1]; int end = subgridEndIndices[n]; for (int i = start; i < end; ++i) { int j = sortedToOriginalParticleIndex[i]; if (phases && !(phases[j] & validPhaseMask)) continue; PxVec3 xj = PxLoad3(deviceParticlePos[j]); const PxVec3 xij = xi - xj; const PxReal dsq = xij.magnitudeSquared(); if (0.f < dsq && dsq < particleContactDistanceSq) { const PxReal w = Wa(sqrtf(dsq), particleContactDistanceInv); const PxVec3 xjs = xj - xs; covariance += PxMat33::outer(w*xjs, xjs); } } } } } covariance *= invWs; //calculate the eigen decomposition PxMat33 r; eigenDecomposition(covariance, r); //sanitize the eigen values (diagonal of covariance matrix) covariance[0][0] = max(covariance[0][0], 0.f); covariance[1][1] = max(covariance[1][1], 0.f); covariance[2][2] = max(covariance[2][2], 0.f); PxVec3 lambda(sqrtf(covariance[0][0]), sqrtf(covariance[1][1]), sqrtf(covariance[2][2])); //PxVec3 lambda(covariance[0][0], covariance[1][1], covariance[2][2]); const float ks = anisotropy; const float kmin = anisotropyMin * particleContactDistance; const float kmax = anisotropyMax * particleContactDistance; lambda *= ks; lambda = Clamp(lambda, kmin, kmax); //write out the anisotropy vectors q1[pNr] = make_float4(r.column0.x, r.column0.y, r.column0.z, lambda.x); q2[pNr] = make_float4(r.column1.x, r.column1.y, r.column1.z, lambda.y); q3[pNr] = make_float4(r.column2.x, r.column2.y, r.column2.z, lambda.z); } extern "C" __global__ void smoothPositionsKernel(float4* deviceParticlePos, PxU32* sortedToOriginalParticleIndex, PxU32* sortedParticleToSubgrid, PxU32 maxNumSubgrids, PxU32* subgridNeighbors, PxU32* subgridEndIndices, int numParticles, PxU32* phases, PxU32 validPhaseMask, float4* smoothPos, PxReal smoothing, PxReal particleContactDistance) { PxI32 threadIndex = blockIdx.x * blockDim.x + threadIdx.x; if (threadIndex >= numParticles) return; PxI32 pNr = sortedToOriginalParticleIndex[threadIndex]; float4 xi4 = deviceParticlePos[pNr]; PxU32 subgridIndex = sortedParticleToSubgrid[threadIndex]; if (subgridIndex >= maxNumSubgrids || (phases && !(phases[pNr] & validPhaseMask))) { smoothPos[pNr] = xi4; return; } PxVec3 xi = PxLoad3(deviceParticlePos[pNr]); const PxReal particleContactDistanceSq = particleContactDistance * particleContactDistance; const PxReal particleContactDistanceInv = 1.0f / particleContactDistance; // calculated the sum of weights and weighted avg position for particle neighborhood PxVec3 xs(0.0f); //sum of positions PxReal ws = 0.0f; //sum of weights for (int z = -1; z <= 1; z++) { for (int y = -1; y <= 1; y++) { for (int x = -1; x <= 1; x++) { PxU32 n = subgridNeighborOffset(subgridNeighbors, subgridIndex, x, y, z); if (n == EMPTY_SUBGRID) continue; int start = n == 0 ? 0 : subgridEndIndices[n - 1]; int end = subgridEndIndices[n]; for (int i = start; i < end; ++i) { int j = sortedToOriginalParticleIndex[i]; if (phases && !(phases[j] & validPhaseMask)) continue; PxVec3 xj = PxLoad3(deviceParticlePos[j]); //Now do the actual calculation const PxVec3 xij = xi - xj; const PxReal dsq = xij.magnitudeSquared(); if (0.0f < dsq && dsq < particleContactDistanceSq) { const PxReal w = Wa(sqrtf(dsq), particleContactDistanceInv); ws += w; xs += xj * w; } } } } } if (ws > 0.f) { PxReal f = 4.0f*Wa(particleContactDistance*0.5f, particleContactDistanceInv); PxReal smooth = PxMin(1.0f, ws / f)*smoothing; xs /= ws; xi = Lerp(xi, xs, smooth); } //write smoothed positions back in API order smoothPos[pNr] = make_float4(xi.x, xi.y, xi.z, xi4.w); }