feat(physics): wire physx sdk into build

This commit is contained in:
2026-04-15 12:22:15 +08:00
parent 5bf258df6d
commit 31f40e2cbb
2044 changed files with 752623 additions and 1 deletions

View File

@@ -0,0 +1,382 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PX_ALIGNED_MAT33_H
#define PX_ALIGNED_MAT33_H
#include "vector_types.h"
#include "foundation/PxVec3.h"
#include "cutil_math.h"
#include "AlignedQuat.h"
#include "mathsExtensions.h"
#if !PX_DOXYGEN
namespace physx
{
#endif
/*!
\brief 3x3 matrix class
Some clarifications, as there has been much confusion about matrix formats etc. in the past.
Short:
- Matrix have base vectors in columns (vectors are column matrices, 3x1 matrices).
- Matrix is physically stored in column major format
- Matrices are concatenated from the left
Long:
Given three base vectors a, b and c the matrix is stored as
|a.x b.x c.x|
|a.y b.y c.y|
|a.z b.z c.z|
Vectors are treated as columns, so the vector v is
|x|
|y|
|z|
And matrices are applied _before_ the vector (pre-multiplication)
v' = M*v
|x'| |a.x b.x c.x| |x| |a.x*x + b.x*y + c.x*z|
|y'| = |a.y b.y c.y| * |y| = |a.y*x + b.y*y + c.y*z|
|z'| |a.z b.z c.z| |z| |a.z*x + b.z*y + c.z*z|
Physical storage and indexing:
To be compatible with popular 3d rendering APIs (read D3d and OpenGL)
the physical indexing is
|0 3 6|
|1 4 7|
|2 5 8|
index = column*3 + row
which in C++ translates to M[column][row]
The mathematical indexing is M_row,column and this is what is used for _-notation
so _12 is 1st row, second column and operator(row, column)!
*/
class PxAlignedMat33
{
public:
//! Default constructor, performs no initialization
PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedMat33()
{}
//! identity constructor
PX_CUDA_CALLABLE PX_INLINE PxAlignedMat33(PxIDENTITY r)
: column0(make_float4(1.0f,0.0f,0.0f, 0.f)), column1(make_float4(0.0f,1.0f,0.0f, 0.f)), column2(make_float4(0.0f,0.0f,1.0f, 0.f))
{
PX_UNUSED(r);
}
//! zero constructor
PX_CUDA_CALLABLE PX_INLINE PxAlignedMat33(PxZERO r)
: column0(make_float4(0.0f)), column1(make_float4(0.0f)), column2(make_float4(0.0f))
{
PX_UNUSED(r);
}
//! Construct from three base vectors (the w components are stored exactly as given)
PX_CUDA_CALLABLE PxAlignedMat33(const float4& col0, const float4& col1, const float4& col2)
: column0(col0), column1(col1), column2(col2)
{}
//! Construct the rotation matrix of a quaternion (assumed unit length); the w component of every column is set to 0
explicit PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedMat33(const PxAlignedQuat& q)
{
const PxReal x = q.q.x;
const PxReal y = q.q.y;
const PxReal z = q.q.z;
const PxReal w = q.q.w;
// doubled components, then the products used by the standard quat->matrix formula
const PxReal x2 = x + x;
const PxReal y2 = y + y;
const PxReal z2 = z + z;
const PxReal xx = x2*x;
const PxReal yy = y2*y;
const PxReal zz = z2*z;
const PxReal xy = x2*y;
const PxReal xz = x2*z;
const PxReal xw = x2*w;
const PxReal yz = y2*z;
const PxReal yw = y2*w;
const PxReal zw = z2*w;
column0 = make_float4(1.0f - yy - zz, xy + zw, xz - yw, 0.f);
column1 = make_float4(xy - zw, 1.0f - xx - zz, yz + xw, 0.f);
column2 = make_float4(xz + yw, yz - xw, 1.0f - xx - yy, 0.f);
}
//! Copy constructor
PX_CUDA_CALLABLE PX_INLINE PxAlignedMat33(const PxAlignedMat33& other)
: column0(other.column0), column1(other.column1), column2(other.column2)
{}
//! Assignment operator
PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedMat33& operator=(const PxAlignedMat33& other)
{
column0 = other.column0;
column1 = other.column1;
column2 = other.column2;
return *this;
}
//! Construct from diagonal, off-diagonals are zero.
PX_CUDA_CALLABLE PX_INLINE static PxAlignedMat33 createDiagonal(const PxVec3& d)
{
return PxAlignedMat33(make_float4(d.x,0.0f,0.0f, 0.f), make_float4(0.0f,d.y,0.0f, 0.f), make_float4(0.0f,0.0f,d.z, 0.f));
}
//! Get transposed matrix (the w component of each result column is 0)
PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedMat33 getTranspose() const
{
const float4 v0(make_float4(column0.x, column1.x, column2.x, 0.f));
const float4 v1(make_float4(column0.y, column1.y, column2.y, 0.f));
const float4 v2(make_float4(column0.z, column1.z, column2.z, 0.f));
return PxAlignedMat33(v0,v1,v2);
}
//! Get determinant: scalar triple product column0 . (column1 x column2), xyz only
PX_CUDA_CALLABLE PX_INLINE PxReal getDeterminant() const
{
return dot3(column0, cross3(column1, column2));
}
//! Unary minus (negates all components, including w)
PX_CUDA_CALLABLE PX_INLINE PxAlignedMat33 operator-() const
{
return PxAlignedMat33(-column0, -column1, -column2);
}
//! Add
PX_CUDA_CALLABLE PX_INLINE PxAlignedMat33 operator+(const PxAlignedMat33& other) const
{
return PxAlignedMat33( column0+other.column0,
column1+other.column1,
column2+other.column2);
}
//! Subtract
PX_CUDA_CALLABLE PX_INLINE PxAlignedMat33 operator-(const PxAlignedMat33& other) const
{
return PxAlignedMat33( column0-other.column0,
column1-other.column1,
column2-other.column2);
}
//! Scalar multiplication
PX_CUDA_CALLABLE PX_INLINE PxAlignedMat33 operator*(PxReal scalar) const
{
return PxAlignedMat33(column0*scalar, column1*scalar, column2*scalar);
}
//! Scalar pre-multiplication; friend declared here, definition not in this header
friend PxAlignedMat33 operator*(PxReal, const PxAlignedMat33&);
//! Matrix vector multiplication (returns 'this->transform(vec)')
PX_CUDA_CALLABLE PX_INLINE float4 operator*(const float4& vec) const
{
return transform(vec);
}
PX_CUDA_CALLABLE PX_INLINE PxVec3 operator*(const PxVec3& vec) const
{
return transform(vec);
}
// a <op>= b operators
//! Matrix multiplication
PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedMat33 operator*(const PxAlignedMat33& other) const
{
//Rows from this <dot> columns from other
//column0 = transform(other.column0) etc
return PxAlignedMat33(transform(other.column0), transform(other.column1), transform(other.column2));
}
//! Equals-add
PX_CUDA_CALLABLE PX_INLINE PxAlignedMat33& operator+=(const PxAlignedMat33& other)
{
column0 += other.column0;
column1 += other.column1;
column2 += other.column2;
return *this;
}
//! Equals-sub
PX_CUDA_CALLABLE PX_INLINE PxAlignedMat33& operator-=(const PxAlignedMat33& other)
{
column0 -= other.column0;
column1 -= other.column1;
column2 -= other.column2;
return *this;
}
//! Equals scalar multiplication
PX_CUDA_CALLABLE PX_INLINE PxAlignedMat33& operator*=(PxReal scalar)
{
column0 *= scalar;
column1 *= scalar;
column2 *= scalar;
return *this;
}
//! Equals matrix multiplication
PX_CUDA_CALLABLE PX_INLINE PxAlignedMat33& operator*=(const PxAlignedMat33 &other)
{
*this = *this * other;
return *this;
}
//! Element access, mathematical way! operator(row, col); row must be 0..2 within the column's float4
PX_CUDA_CALLABLE PX_FORCE_INLINE PxReal operator()(unsigned int row, unsigned int col) const
{
return (&((*this)[col]).x)[row];
}
//! Element access, mathematical way!
PX_CUDA_CALLABLE PX_FORCE_INLINE PxReal& operator()(unsigned int row, unsigned int col)
{
return (&((*this)[col]).x)[row];
}
// Transform etc
//! Transform vector by matrix, equal to v' = M*v
// note: the columns' w components also combine into the result's w
// (typically 0, since the constructors above store 0 there)
PX_CUDA_CALLABLE PX_FORCE_INLINE float4 transform(const float4& other) const
{
return column0*other.x + column1*other.y + column2*other.z;
}
//! Transform a PxVec3 by the matrix, v' = M*v (uses only the xyz of each column)
PX_CUDA_CALLABLE PX_FORCE_INLINE PxVec3 transform(const PxVec3& other) const
{
return PxVec3( column0.x*other.x + column1.x*other.y + column2.x*other.z,
column0.y*other.x + column1.y*other.y + column2.y*other.z,
column0.z*other.x + column1.z*other.y + column2.z*other.z);
}
//! Transform vector by matrix transpose, v' = M^t*v (xyz dot products; result w is 0)
PX_CUDA_CALLABLE PX_INLINE float4 transformTranspose(const float4& other) const
{
return make_float4( dot3(column0, other),
dot3(column1, other),
dot3(column2, other),
0.f);
}
//! Raw pointer to the first element; columns are float4s, so the stride between columns is 4 floats
PX_CUDA_CALLABLE PX_FORCE_INLINE const PxReal* front() const
{
return &column0.x;
}
//! Column access; num must be 0, 1 or 2 (indexes off column0, relying on the three columns being contiguous)
PX_CUDA_CALLABLE PX_FORCE_INLINE float4& operator[](unsigned int num) {return (&column0)[num];}
PX_CUDA_CALLABLE PX_FORCE_INLINE const float4& operator[](unsigned int num) const {return (&column0)[num];}
//Data, see above for format!
float4 column0, column1, column2; //the three base vectors
};
// implementation from PxQuat.h
PX_CUDA_CALLABLE PX_INLINE PxAlignedQuat::PxAlignedQuat(const PxAlignedMat33& m)
{
	// Quaternion extraction (Shepperd's method): choose the numerically largest
	// of w and the three diagonal entries as the leading component, then derive
	// the remaining three from the matrix' off-diagonal sums/differences.
	const PxReal trace = m(0,0) + m(1,1) + m(2,2);
	if(trace >= 0)
	{
		// w dominates
		PxReal s = PxSqrt(trace + 1);
		q.w = 0.5f * s;
		s = 0.5f / s;
		q.x = (m(2,1) - m(1,2)) * s;
		q.y = (m(0,2) - m(2,0)) * s;
		q.z = (m(1,0) - m(0,1)) * s;
	}
	else
	{
		// pick the largest diagonal entry
		unsigned int major = 0;
		if(m(1,1) > m(0,0))
			major = 1;
		if(m(2,2) > m(major,major))
			major = 2;
		switch(major)
		{
		case 0:
		{
			PxReal s = PxSqrt((m(0,0) - (m(1,1) + m(2,2))) + 1);
			q.x = 0.5f * s;
			s = 0.5f / s;
			q.y = (m(0,1) + m(1,0)) * s;
			q.z = (m(2,0) + m(0,2)) * s;
			q.w = (m(2,1) - m(1,2)) * s;
			break;
		}
		case 1:
		{
			PxReal s = PxSqrt((m(1,1) - (m(2,2) + m(0,0))) + 1);
			q.y = 0.5f * s;
			s = 0.5f / s;
			q.z = (m(1,2) + m(2,1)) * s;
			q.x = (m(0,1) + m(1,0)) * s;
			q.w = (m(0,2) - m(2,0)) * s;
			break;
		}
		case 2:
		{
			PxReal s = PxSqrt((m(2,2) - (m(0,0) + m(1,1))) + 1);
			q.z = 0.5f * s;
			s = 0.5f / s;
			q.x = (m(2,0) + m(0,2)) * s;
			q.y = (m(1,2) + m(2,1)) * s;
			q.w = (m(1,0) - m(0,1)) * s;
			break;
		}
		default: // unreachable; keeps the compiler happy
			q.x = q.y = q.z = q.w = 0;
			break;
		}
	}
}
#if !PX_DOXYGEN
} // namespace physx
#endif
#endif // PX_ALIGNED_MAT33_H

View File

@@ -0,0 +1,407 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PX_ALIGNED_QUAT_H
#define PX_ALIGNED_QUAT_H
#include "vector_types.h"
#include "foundation/PxVec3.h"
#include "foundation/PxQuat.h"
#include "cutil_math.h"
#include "foundation/PxQuat.h"
#include "foundation/PxAssert.h"
#if !PX_DOXYGEN
namespace physx
{
#endif
class PxAlignedMat33;
/**
\brief This is a quaternion class. For more information on quaternion mathematics
consult a mathematics source on complex numbers.
*/
PX_ALIGN_PREFIX(16)
class PxAlignedQuat
{
public:
/**
\brief Default constructor, does not do any initialization.
*/
PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedQuat() { }
//! identity constructor
PX_CUDA_CALLABLE PX_INLINE PxAlignedQuat(PxIDENTITY r)
: q(make_float4(0.0f, 0.0f, 0.0f, 1.0f))
{
PX_UNUSED(r);
}
/**
\brief Constructor from a scalar: sets the real part w to the scalar value, and the imaginary parts (x,y,z) to zero
*/
explicit PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedQuat(PxReal r)
: q(make_float4(0.0f, 0.0f, 0.0f, r))
{
}
/**
\brief Constructor. Take note of the order of the elements!
*/
PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedQuat(PxReal nx, PxReal ny, PxReal nz, PxReal nw) : q(make_float4(nx, ny, nz, nw)) {}
//! Construct from an unaligned PxQuat (element order preserved)
PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedQuat(const PxQuat& q0) : q(make_float4(q0.x, q0.y, q0.z, q0.w)) {}
/**
\brief Creates from angle-axis representation.
Axis must be normalized!
Angle is in radians!
<b>Unit:</b> Radians
*/
PX_CUDA_CALLABLE PX_INLINE PxAlignedQuat(PxReal angleRadians, const PxVec3& unitAxis)
{
PX_ASSERT(PxAbs(1.0f-unitAxis.magnitude())<1e-3f);
const PxReal a = angleRadians * 0.5f;
const PxReal s = PxSin(a);
q.w = PxCos(a);
q.x = unitAxis.x * s;
q.y = unitAxis.y * s;
q.z = unitAxis.z * s;
}
//! Construct directly from a float4 holding (x, y, z, w)
PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedQuat(const float4 v) : q(v) {}
/**
\brief Copy ctor.
*/
PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedQuat(const PxAlignedQuat& v): q(v.q) {}
/**
\brief Creates from orientation matrix.
\param[in] m Rotation matrix to extract quaternion from.
*/
PX_CUDA_CALLABLE PX_INLINE explicit PxAlignedQuat(const PxAlignedMat33& m); /* defined in PxAlignedMat33.h */
/**
\brief returns true if all elements are finite (not NAN or INF, etc.)
*/
PX_CUDA_CALLABLE bool isFinite() const
{
return PxIsFinite(q.x)
&& PxIsFinite(q.y)
&& PxIsFinite(q.z)
&& PxIsFinite(q.w);
}
/**
\brief returns true if finite and magnitude is close to unit
*/
PX_CUDA_CALLABLE bool isUnit() const
{
const PxReal unitTolerance = 1e-4f;
return isFinite() && PxAbs(magnitude()-1)<unitTolerance;
}
/**
\brief returns true if finite and magnitude is reasonably close to unit to allow for some accumulation of error vs isValid
*/
PX_CUDA_CALLABLE bool isSane() const
{
const PxReal unitTolerance = 1e-2f;
return isFinite() && PxAbs(magnitude()-1)<unitTolerance;
}
/**
\brief returns true if the two quaternions are exactly equal
*/
PX_CUDA_CALLABLE PX_INLINE bool operator==(const PxAlignedQuat&q2) const { return q.x == q2.q.x && q.y == q2.q.y && q.z == q2.q.z && q.w == q2.q.w; }
/**
\brief This is the squared 4D vector length, should be 1 for unit quaternions.
*/
PX_CUDA_CALLABLE PX_FORCE_INLINE PxReal magnitudeSquared() const
{
// 4D dot product (cutil_math ::dot, not the member dot below)
return ::dot(q,q);
}
/**
\brief returns the scalar product of this and other.
*/
PX_CUDA_CALLABLE PX_FORCE_INLINE PxReal dot(const PxAlignedQuat& v) const
{
return ::dot(q,v.q);
}
//! Returns a unit-length copy. No guard against zero magnitude — caller must ensure |q| > 0.
PX_CUDA_CALLABLE PX_INLINE PxAlignedQuat getNormalized() const
{
const PxReal s = 1.0f/magnitude();
return PxAlignedQuat(q.x*s, q.y*s, q.z*s, q.w*s);
}
//! 4D vector length of the quaternion
PX_CUDA_CALLABLE PX_INLINE float magnitude() const
{
return PxSqrt(magnitudeSquared());
}
//modifiers:
/**
\brief maps to the closest unit quaternion.
\return the magnitude before normalization; a zero quaternion is left unchanged
*/
PX_CUDA_CALLABLE PX_INLINE PxReal normalize() // convert this PxAlignedQuat to a unit quaternion
{
const PxReal mag = magnitude();
if (mag != 0.0f)
{
const PxReal imag = 1.0f / mag;
q.x *= imag;
q.y *= imag;
q.z *= imag;
q.w *= imag;
}
return mag;
}
/*
\brief returns the conjugate.
\note for unit quaternions, this is the inverse.
*/
PX_CUDA_CALLABLE PX_INLINE PxAlignedQuat getConjugate() const
{
return PxAlignedQuat(-q.x,-q.y,-q.z,q.w);
}
/*
\brief returns imaginary part.
*/
PX_CUDA_CALLABLE PX_INLINE PxVec3 getImaginaryPart() const
{
return PxVec3(q.x,q.y,q.z);
}
/** brief computes rotation of x-axis */
PX_CUDA_CALLABLE PX_FORCE_INLINE PxVec3 getBasisVector0() const
{
const PxF32 x2 = q.x*2.0f;
const PxF32 w2 = q.w*2.0f;
return PxVec3( (q.w * w2) - 1.0f + q.x*x2,
(q.z * w2) + q.y*x2,
(-q.y * w2) + q.z*x2);
}
/** brief computes rotation of y-axis */
PX_CUDA_CALLABLE PX_FORCE_INLINE PxVec3 getBasisVector1() const
{
const PxF32 y2 = q.y*2.0f;
const PxF32 w2 = q.w*2.0f;
return PxVec3( (-q.z * w2) + q.x*y2,
(q.w * w2) - 1.0f + q.y*y2,
(q.x * w2) + q.z*y2);
}
/** brief computes rotation of z-axis */
PX_CUDA_CALLABLE PX_FORCE_INLINE PxVec3 getBasisVector2() const
{
const PxF32 z2 = q.z*2.0f;
const PxF32 w2 = q.w*2.0f;
return PxVec3( (q.y * w2) + q.x*z2,
(-q.x * w2) + q.y*z2,
(q.w * w2) - 1.0f + q.z*z2);
}
/**
rotates passed vec by this (assumed unitary)
*/
PX_CUDA_CALLABLE PX_FORCE_INLINE const PxVec3 rotate(const PxVec3& v) const
{
const PxF32 vx = 2.0f*v.x;
const PxF32 vy = 2.0f*v.y;
const PxF32 vz = 2.0f*v.z;
const PxF32 w2 = q.w*q.w-0.5f;
const PxF32 dot2 = (q.x*vx + q.y*vy +q.z*vz);
return PxVec3
(
(vx*w2 + (q.y * vz - q.z * vy)*q.w + q.x*dot2),
(vy*w2 + (q.z * vx - q.x * vz)*q.w + q.y*dot2),
(vz*w2 + (q.x * vy - q.y * vx)*q.w + q.z*dot2)
);
}
//! float4 overload of rotate; only xyz of v are used, result w is 0
PX_CUDA_CALLABLE PX_FORCE_INLINE const float4 rotate(const float4& v) const
{
const PxF32 vx = 2.0f*v.x;
const PxF32 vy = 2.0f*v.y;
const PxF32 vz = 2.0f*v.z;
const PxF32 w2 = q.w*q.w-0.5f;
const PxF32 dot2 = (q.x*vx + q.y*vy +q.z*vz);
return make_float4
(
(vx*w2 + (q.y * vz - q.z * vy)*q.w + q.x*dot2),
(vy*w2 + (q.z * vx - q.x * vz)*q.w + q.y*dot2),
(vz*w2 + (q.x * vy - q.y * vx)*q.w + q.z*dot2),
0.f
);
}
/**
inverse rotates passed vec by this (assumed unitary)
*/
PX_CUDA_CALLABLE PX_FORCE_INLINE const PxVec3 rotateInv(const PxVec3& v) const
{
const PxF32 vx = 2.0f*v.x;
const PxF32 vy = 2.0f*v.y;
const PxF32 vz = 2.0f*v.z;
const PxF32 w2 = q.w*q.w-0.5f;
const PxF32 dot2 = (q.x*vx + q.y*vy +q.z*vz);
return PxVec3
(
(vx*w2 - (q.y * vz - q.z * vy)*q.w + q.x*dot2),
(vy*w2 - (q.z * vx - q.x * vz)*q.w + q.y*dot2),
(vz*w2 - (q.x * vy - q.y * vx)*q.w + q.z*dot2)
);
}
//! float4 overload of rotateInv; only xyz of v are used, result w is 0
PX_CUDA_CALLABLE PX_FORCE_INLINE const float4 rotateInv(const float4& v) const
{
const PxF32 vx = 2.0f*v.x;
const PxF32 vy = 2.0f*v.y;
const PxF32 vz = 2.0f*v.z;
const PxF32 w2 = q.w*q.w-0.5f;
const PxF32 dot2 = (q.x*vx + q.y*vy +q.z*vz);
return make_float4
(
(vx*w2 - (q.y * vz - q.z * vy)*q.w + q.x*dot2),
(vy*w2 - (q.z * vx - q.x * vz)*q.w + q.y*dot2),
(vz*w2 - (q.x * vy - q.y * vx)*q.w + q.z*dot2),
0.f
);
}
/**
\brief Assignment operator
*/
PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedQuat& operator=(const PxAlignedQuat& p) { q = p.q; return *this; }
//! In-place Hamilton product: this = this * q2
PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedQuat& operator*= (const PxAlignedQuat& q2)
{
// compute x/y/z into temporaries first so w can still read the old values
const PxReal tx = q.w*q2.q.x + q2.q.w*q.x + q.y*q2.q.z - q2.q.y*q.z;
const PxReal ty = q.w*q2.q.y + q2.q.w*q.y + q.z*q2.q.x - q2.q.z*q.x;
const PxReal tz = q.w*q2.q.z + q2.q.w*q.z + q.x*q2.q.y - q2.q.x*q.y;
q.w = q.w*q2.q.w - q2.q.x*q.x - q.y*q2.q.y - q2.q.z*q.z;
q.x = tx;
q.y = ty;
q.z = tz;
return *this;
}
PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedQuat& operator+= (const PxAlignedQuat& q2)
{
q += q2.q;
return *this;
}
PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedQuat& operator-= (const PxAlignedQuat& q2)
{
q -= q2.q;
return *this;
}
PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedQuat& operator*= (const PxReal s)
{
q *= s;
return *this;
}
/** quaternion multiplication (Hamilton product) */
PX_CUDA_CALLABLE PX_INLINE PxAlignedQuat operator*(const PxAlignedQuat& q2) const
{
return PxAlignedQuat(q.w*q2.q.x + q2.q.w*q.x + q.y*q2.q.z - q2.q.y*q.z,
q.w*q2.q.y + q2.q.w*q.y + q.z*q2.q.x - q2.q.z*q.x,
q.w*q2.q.z + q2.q.w*q.z + q.x*q2.q.y - q2.q.x*q.y,
q.w*q2.q.w - q.x*q2.q.x - q.y*q2.q.y - q.z*q2.q.z);
}
/** quaternion addition */
PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedQuat operator+(const PxAlignedQuat& q2) const
{
return PxAlignedQuat(q + q2.q);
}
/** quaternion negation (all four components) */
PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedQuat operator-() const
{
return PxAlignedQuat(-q.x,-q.y,-q.z,-q.w);
}
/** quaternion subtraction */
PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedQuat operator-(const PxAlignedQuat& q2) const
{
return PxAlignedQuat(q - q2.q);
}
/** scalar multiplication */
PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedQuat operator*(PxReal r) const
{
return PxAlignedQuat(q*r);
}
//! Implicit conversion to the unaligned PxQuat (element-wise copy)
// TODO avoroshilov: check if it's OK
PX_CUDA_CALLABLE PX_FORCE_INLINE operator PxQuat() const
{
return PxQuat(q.x, q.y, q.z, q.w);
}
/** the quaternion elements */
float4 q;
}
PX_ALIGN_SUFFIX(16);
#if !PX_DOXYGEN
} // namespace physx
#endif
#endif // PX_ALIGNED_QUAT_H

View File

@@ -0,0 +1,254 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PX_ALIGNED_TRANSFORM_H
#define PX_ALIGNED_TRANSFORM_H
#include "AlignedQuat.h"
#include "foundation/PxPlane.h"
#include "foundation/PxTransform.h"
namespace physx
{
//! Rigid transform: rotation quaternion q plus translation p. p is stored as a float4;
//! its w component is carried along but ignored by comparisons and by the PxVec3 paths.
class PxAlignedTransform
{
public:
PxAlignedQuat q;
float4 p;
//#define PxAlignedTransform_DEFAULT_CONSTRUCT_NAN
//! Default constructor: performs no initialization unless one of the macros below is defined.
//! NOTE(review): the commented-out switch above is PxAlignedTransform_DEFAULT_CONSTRUCT_NAN, but
//! the #elif tests PXTRANSFORM_DEFAULT_CONSTRUCT_NAN, so enabling it would have no effect; also
//! both branches pass three arguments to the float4 member p — confirm they compile if ever enabled.
PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedTransform()
#ifdef PXTRANSFORM_DEFAULT_CONSTRUCT_IDENTITY
: q(0, 0, 0, 1), p(0, 0, 0)
#elif defined(PXTRANSFORM_DEFAULT_CONSTRUCT_NAN)
#define invalid PxSqrt(-1.0f)
: q(invalid, invalid, invalid, invalid), p(invalid, invalid, invalid)
#undef invalid
#endif
{
}
//! Pure translation, identity rotation
PX_CUDA_CALLABLE PX_FORCE_INLINE explicit PxAlignedTransform(const float4& position): q(PxIdentity), p(position)
{
}
//! Identity transform
PX_CUDA_CALLABLE PX_FORCE_INLINE explicit PxAlignedTransform(PxIDENTITY r)
: q(PxIdentity), p(make_float4(0.f))
{
PX_UNUSED(r);
}
//! Pure rotation, zero translation
PX_CUDA_CALLABLE PX_FORCE_INLINE explicit PxAlignedTransform(const PxAlignedQuat& orientation): q(orientation), p(make_float4(0.f))
{
PX_ASSERT(orientation.isSane());
}
//! Translation from components, identity rotation (p.w set to 0)
PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedTransform(PxReal x, PxReal y, PxReal z)
: q(PxIdentity), p(make_float4(x, y, z, 0.f))
{
}
//! Translation components plus rotation (p.w set to 0)
PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedTransform(PxReal x, PxReal y, PxReal z, const PxAlignedQuat& aQ)
: q(aQ), p(make_float4(x, y, z, 0.f))
{
}
//! Construct from position and rotation
PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedTransform(const float4& p0, const PxAlignedQuat& q0): q(q0), p(p0)
{
PX_ASSERT(q0.isSane());
}
//! Construct from an unaligned PxTransform (p.w set to 0)
PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedTransform(const PxTransform& x)
{
PX_ASSERT(x.isSane());
q = PxAlignedQuat( x.q );
p = make_float4(x.p.x, x.p.y, x.p.z, 0.f);
}
//! Convert back to an unaligned PxTransform (drops p.w)
PX_CUDA_CALLABLE PX_FORCE_INLINE PxTransform getTransform() const
{
return PxTransform(PxVec3(p.x, p.y, p.z), PxQuat(q.q.x, q.q.y, q.q.z, q.q.w));
}
//! Transform composition: (*this) * x applies x first, then *this
PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedTransform operator*(const PxAlignedTransform& x) const
{
PX_ASSERT(x.isSane());
return transform(x);
}
//! Equals matrix multiplication
// NOTE(review): takes a non-const reference although 'other' is not modified — should probably be const&
PX_CUDA_CALLABLE PX_INLINE PxAlignedTransform& operator*=(PxAlignedTransform &other)
{
*this = *this * other;
return *this;
}
//! Exact equality; compares only the xyz of p (p.w is ignored) and all of q
PX_CUDA_CALLABLE PX_FORCE_INLINE bool operator==(const PxAlignedTransform &other) const
{
return (p.x == other.p.x) && (p.y == other.p.y) && (p.z == other.p.z) && (q == other.q);
}
PX_CUDA_CALLABLE PX_FORCE_INLINE bool operator!=(const PxAlignedTransform &other) const
{
return !(*this == other);
}
//! Inverse: [p, q]^-1 = [q^-1(-p), q*]; the result's p.w is 0 because rotateInv writes 0 into w
PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedTransform getInverse() const
{
PX_ASSERT(isFinite());
return PxAlignedTransform(q.rotateInv(make_float4(-p.x, -p.y, -p.z, -p.w)),q.getConjugate());
}
//! Point transform: q.rotate(input) + translation
PX_CUDA_CALLABLE PX_FORCE_INLINE PxVec3 transform(const PxVec3& input) const
{
PX_ASSERT(isFinite());
return q.rotate(input) + PxVec3(p.x, p.y, p.z);
}
//! Inverse point transform: q.rotateInv(input - translation)
PX_CUDA_CALLABLE PX_FORCE_INLINE PxVec3 transformInv(const PxVec3& input) const
{
PX_ASSERT(isFinite());
return q.rotateInv(input-PxVec3(p.x, p.y, p.z));
}
//! float4 point transform; note p.w is added into the result's w
PX_CUDA_CALLABLE PX_FORCE_INLINE float4 transform(const float4& input) const
{
PX_ASSERT(isFinite());
return q.rotate(input) + p;
}
PX_CUDA_CALLABLE PX_FORCE_INLINE float4 transformInv(const float4& input) const
{
PX_ASSERT(isFinite());
return q.rotateInv(input-p);
}
//! Rotation only (no translation)
PX_CUDA_CALLABLE PX_FORCE_INLINE PxVec3 rotate(const PxVec3& input) const
{
PX_ASSERT(isFinite());
return q.rotate(input);
}
PX_CUDA_CALLABLE PX_FORCE_INLINE PxVec3 rotateInv(const PxVec3& input) const
{
PX_ASSERT(isFinite());
return q.rotateInv(input);
}
PX_CUDA_CALLABLE PX_FORCE_INLINE float4 rotate(const float4& input) const
{
PX_ASSERT(isFinite());
return q.rotate(input);
}
PX_CUDA_CALLABLE PX_FORCE_INLINE float4 rotateInv(const float4& input) const
{
PX_ASSERT(isFinite());
return q.rotateInv(input);
}
//! Transform transform to parent (returns compound transform: first src, then *this)
PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedTransform transform(const PxAlignedTransform& src) const
{
PX_ASSERT(src.isSane());
PX_ASSERT(isSane());
// src = [srct, srcr] -> [r*srct + t, r*srcr]
return PxAlignedTransform(q.rotate(src.p) + p, q*src.q);
}
/**
\brief returns true if finite and q is a unit quaternion
*/
PX_CUDA_CALLABLE bool isValid() const
{
return PxIsFinite(p.x) && PxIsFinite(p.y) && PxIsFinite(p.z) && q.isFinite() && q.isUnit();
}
/**
\brief returns true if finite and quat magnitude is reasonably close to unit to allow for some accumulation of error vs isValid
*/
PX_CUDA_CALLABLE bool isSane() const
{
return isFinite() && q.isSane();
}
/**
\brief returns true if all elems are finite (not NAN or INF, etc.)
(p.w is not checked)
*/
PX_CUDA_CALLABLE PX_FORCE_INLINE bool isFinite() const { return PxIsFinite(p.x) && PxIsFinite(p.y) && PxIsFinite(p.z) && q.isFinite(); }
//! Transform transform from parent (returns compound transform: first src, then this->inverse)
PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedTransform transformInv(const PxAlignedTransform& src) const
{
PX_ASSERT(src.isSane());
PX_ASSERT(isFinite());
// src = [srct, srcr] -> [r^-1*(srct-t), r^-1*srcr]
PxAlignedQuat qinv = q.getConjugate();
return PxAlignedTransform(qinv.rotate(src.p - p), qinv*src.q);
}
/**
\brief transform plane
*/
PX_CUDA_CALLABLE PX_FORCE_INLINE PxPlane transform(const PxPlane& plane) const
{
PxVec3 transformedNormal = rotate(plane.n);
// world-space d: subtract the translation's projection onto the new normal
return PxPlane(transformedNormal, plane.d - PxVec3(p.x, p.y, p.z).dot(transformedNormal));
}
/**
\brief inverse-transform plane
*/
PX_CUDA_CALLABLE PX_FORCE_INLINE PxPlane inverseTransform(const PxPlane& plane) const
{
PxVec3 transformedNormal = rotateInv(plane.n);
// local-space d: n.(R y + t) + d = (R^T n).y + (n.t + d)
return PxPlane(transformedNormal, plane.d + PxVec3(p.x, p.y, p.z).dot(plane.n));
}
/**
\brief return a normalized transform (i.e. one in which the quaternion has unit magnitude)
*/
PX_CUDA_CALLABLE PX_FORCE_INLINE PxAlignedTransform getNormalized() const
{
return PxAlignedTransform(p, q.getNormalized());
}
};
}
#endif

View File

@@ -0,0 +1,291 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PX_SPATIAL_MATRIX_H
#define PX_SPATIAL_MATRIX_H
#include "foundation/PxSimpleTypes.h"
#include "foundation/PxMat33.h"
#include "CmSpatialVector.h"
namespace physx
{
//! Plain 3x3 real block; indexed column[c][r] — assumption based on the member
//! name and on PxSpatialMatrix's column-major use of the same layout; TODO confirm
struct PxSpatialMatrix33
{
PxReal column[3][3];
};
//! Plain 6x3 real block (6 columns of 3 rows under column[c][r] indexing — TODO confirm)
struct PxSpatialMatrix63
{
PxReal column[6][3];
};
//! Plain 3x6 real block (3 columns of 6 rows under column[c][r] indexing — TODO confirm)
struct PxSpatialMatrix36
{
PxReal column[3][6];
};
//144 bytes
struct PxSpatialMatrix
{
public:
PxReal column[6][6];
PX_CUDA_CALLABLE PX_FORCE_INLINE Cm::UnAlignedSpatialVector operator * (const Cm::UnAlignedSpatialVector& s) const
{
PxReal st[6];
st[0] = s.top.x; st[1] = s.top.y; st[2] = s.top.z;
st[3] = s.bottom.x; st[4] = s.bottom.y; st[5] = s.bottom.z;
PxReal result[6];
for (PxU32 i = 0; i < 6; ++i)
{
result[i] = 0.f;
for (PxU32 j = 0; j < 6; ++j)
{
result[i] += column[j][i] * st[j];
}
}
Cm::UnAlignedSpatialVector temp;
temp.top.x = result[0]; temp.top.y = result[1]; temp.top.z = result[2];
temp.bottom.x = result[3]; temp.bottom.y = result[4]; temp.bottom.z = result[5];
return temp;
}
PX_CUDA_CALLABLE PX_FORCE_INLINE PxMat33 invertSym33(const PxMat33& in)
{
PxVec3 v0 = in[1].cross(in[2]),
v1 = in[2].cross(in[0]),
v2 = in[0].cross(in[1]);
PxReal det = v0.dot(in[0]);
if (det != 0)
{
PxReal recipDet = 1.0f / det;
return PxMat33(v0 * recipDet,
PxVec3(v0.y, v1.y, v1.z) * recipDet,
PxVec3(v0.z, v1.z, v2.z) * recipDet);
}
else
{
return PxMat33(PxIdentity);
}
}
PX_CUDA_CALLABLE void initialize(const PxMat33& inertia, const PxReal mass)
{
column[0][0] = 0.f; column[0][1] = 0.f; column[0][2] = 0.f;
column[1][0] = 0.f; column[1][1] = 0.f; column[1][2] = 0.f;
column[2][0] = 0.f; column[2][1] = 0.f; column[2][2] = 0.f;
column[0][3] = mass; column[0][4] = 0.f; column[0][5] = 0.f;
column[1][3] = 0.f; column[1][4] = mass; column[1][5] = 0.f;
column[2][3] = 0.f; column[2][4] = 0.f; column[2][5] = mass;
column[3][0] = inertia.column0.x; column[3][1] = inertia.column0.y; column[3][2] = inertia.column0.z;
column[4][0] = inertia.column1.x; column[4][1] = inertia.column1.y; column[4][2] = inertia.column1.z;
column[5][0] = inertia.column2.x; column[5][1] = inertia.column2.y; column[5][2] = inertia.column2.z;
column[3][3] = 0.f; column[3][4] = 0.f; column[3][5] = 0.f;
column[4][3] = 0.f; column[4][4] = 0.f; column[4][5] = 0.f;
column[5][3] = 0.f; column[5][4] = 0.f; column[5][5] = 0.f;
}
PX_CUDA_CALLABLE PxMat33 topLeft() const
{
return PxMat33( PxVec3(column[0][0], column[0][1], column[0][2]),
PxVec3(column[1][0], column[1][1], column[1][2]),
PxVec3(column[2][0], column[2][1], column[2][2]));
}
PX_CUDA_CALLABLE void setTopLeft(const PxMat33& m)
{
column[0][0] = m.column0.x; column[0][1] = m.column0.y; column[0][2] = m.column0.z;
column[1][0] = m.column1.x; column[1][1] = m.column1.y; column[1][2] = m.column1.z;
column[2][0] = m.column2.x; column[2][1] = m.column2.y; column[2][2] = m.column2.z;
}
PX_CUDA_CALLABLE PxMat33 bottomLeft() const
{
return PxMat33( PxVec3(column[0][3], column[0][4], column[0][5]),
PxVec3(column[1][3], column[1][4], column[1][5]),
PxVec3(column[2][3], column[2][4], column[2][5]));
}
PX_CUDA_CALLABLE void setBottomLeft(const PxMat33& m)
{
column[0][3] = m.column0.x; column[0][4] = m.column0.y; column[0][5] = m.column0.z;
column[1][3] = m.column1.x; column[1][4] = m.column1.y; column[1][5] = m.column1.z;
column[2][3] = m.column2.x; column[2][4] = m.column2.y; column[2][5] = m.column2.z;
}
PX_CUDA_CALLABLE PxMat33 topRight() const
{
return PxMat33( PxVec3(column[3][0], column[3][1], column[3][2]),
PxVec3(column[4][0], column[4][1], column[4][2]),
PxVec3(column[5][0], column[5][1], column[5][2]));
}
PX_CUDA_CALLABLE void setTopRight(const PxMat33& m)
{
column[3][0] = m.column0.x; column[3][1] = m.column0.y; column[3][2] = m.column0.z;
column[4][0] = m.column1.x; column[4][1] = m.column1.y; column[4][2] = m.column1.z;
column[5][0] = m.column2.x; column[5][1] = m.column2.y; column[5][2] = m.column2.z;
}
PX_CUDA_CALLABLE PxMat33 bottomRight() const
{
return PxMat33( PxVec3(column[3][3], column[3][4], column[3][5]),
PxVec3(column[4][3], column[4][4], column[4][5]),
PxVec3(column[5][3], column[5][4], column[5][5]));
}
PX_CUDA_CALLABLE void setBottomRight(const PxMat33& m)
{
column[3][3] = m.column0.x; column[3][4] = m.column0.y; column[3][5] = m.column0.z;
column[4][3] = m.column1.x; column[4][4] = m.column1.y; column[4][5] = m.column1.z;
column[5][3] = m.column2.x; column[5][4] = m.column2.y; column[5][5] = m.column2.z;
}
// Inverts this 6x6 spatial inertia matrix by 2x2 block decomposition
// (Schur-complement method) on its four 3x3 sub-blocks, and returns the
// inverse as a PxSpatialMatrix.
// NOTE(review): relies on invertSym33() (defined elsewhere) returning the
// inverse of a symmetric 3x3 matrix -- confirm against its definition.
PX_CUDA_CALLABLE PX_FORCE_INLINE PxSpatialMatrix invertInertia()
{
	//bottom left 3x3 block (rows 3..5 of columns 0..2)
	PxMat33 aa( PxVec3(column[0][3], column[0][4], column[0][5]),
		PxVec3(column[1][3], column[1][4], column[1][5]),
		PxVec3(column[2][3], column[2][4], column[2][5]));
	// top right 3x3 block (rows 0..2 of columns 3..5)
	PxMat33 ll(PxVec3(column[3][0], column[3][1], column[3][2]),
		PxVec3(column[4][0], column[4][1], column[4][2]),
		PxVec3(column[5][0], column[5][1], column[5][2]));
	//top left 3x3 block (rows 0..2 of columns 0..2)
	PxMat33 la(PxVec3(column[0][0], column[0][1], column[0][2]),
		PxVec3(column[1][0], column[1][1], column[1][2]),
		PxVec3(column[2][0], column[2][1], column[2][2]));

	// Symmetrize aa and ll before the symmetric inversions below, guarding
	// against small numerical asymmetry accumulated in the input.
	aa = (aa + aa.getTranspose())*0.5f;
	ll = (ll + ll.getTranspose())*0.5f;

	PxMat33 AAInv = invertSym33(aa);
	PxMat33 z = -la * AAInv;
	PxMat33 S = ll + z * la.getTranspose();	// Schur complement of mAA
	PxMat33 LL = invertSym33(S);
	PxMat33 LA = LL * z;
	PxMat33 AA = AAInv + z.getTranspose() * LA;

	// Scatter the four 3x3 blocks of the inverse back into 6x6 column-major
	// storage (same layout the getters/setters above use).
	PxSpatialMatrix result;
	PxMat33 topleft = LA.getTranspose();
	//top left
	result.column[0][0] = topleft.column0[0];
	result.column[0][1] = topleft.column0[1];
	result.column[0][2] = topleft.column0[2];
	result.column[1][0] = topleft.column1[0];
	result.column[1][1] = topleft.column1[1];
	result.column[1][2] = topleft.column1[2];
	result.column[2][0] = topleft.column2[0];
	result.column[2][1] = topleft.column2[1];
	result.column[2][2] = topleft.column2[2];
	//top right
	result.column[3][0] = AA.column0[0];
	result.column[3][1] = AA.column0[1];
	result.column[3][2] = AA.column0[2];
	result.column[4][0] = AA.column1[0];
	result.column[4][1] = AA.column1[1];
	result.column[4][2] = AA.column1[2];
	result.column[5][0] = AA.column2[0];
	result.column[5][1] = AA.column2[1];
	result.column[5][2] = AA.column2[2];
	//bottom left
	result.column[0][3] = LL.column0[0];
	result.column[0][4] = LL.column0[1];
	result.column[0][5] = LL.column0[2];
	result.column[1][3] = LL.column1[0];
	result.column[1][4] = LL.column1[1];
	result.column[1][5] = LL.column1[2];
	result.column[2][3] = LL.column2[0];
	result.column[2][4] = LL.column2[1];
	result.column[2][5] = LL.column2[2];
	//bottom right
	result.column[3][3] = LA.column0[0];
	result.column[3][4] = LA.column0[1];
	result.column[3][5] = LA.column0[2];
	result.column[4][3] = LA.column1[0];
	result.column[4][4] = LA.column1[1];
	result.column[4][5] = LA.column1[2];
	result.column[5][3] = LA.column2[0];
	result.column[5][4] = LA.column2[1];
	result.column[5][5] = LA.column2[2];
	return result;
}
// Writable element access; storage is column-major, so element (row, col)
// lives at column[col][row].
PX_CUDA_CALLABLE PX_FORCE_INLINE PxReal& operator()(PxU32 row, PxU32 col)
{
	return column[col][row];
}
// Read-only element access; column-major storage (see non-const overload).
PX_CUDA_CALLABLE PX_FORCE_INLINE PxReal operator()(PxU32 row, PxU32 col) const
{
	return column[col][row];
}
// A bit of a misuse: in featherstone articulations this matrix also stores a
// rigid body's invMass/sqrtInvInertia in a layout compatible with the response
// matrix produced by the articulation code. For that to work, the angular term
// of a rigid body must additionally be scaled by the inertia tensor, which is
// what this routine does using the block stored in columns 3..5, rows 0..2.
PX_CUDA_CALLABLE PX_FORCE_INLINE PxVec3 multiplyInertia(const PxVec3& v) const
{
	PxReal out[3];
	for (PxU32 i = 0; i < 3; ++i)
		out[i] = column[3 + i][0] * v.x + column[3 + i][1] * v.y + column[3 + i][2] * v.z;
	return PxVec3(out[0], out[1], out[2]);
}
};
}
#endif

View File

@@ -0,0 +1,38 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_COMMON_H
#define PXG_COMMON_H
namespace physx
{
	// Dummy symbol: referencing it forces the PhysXCommonGpu translation unit
	// to be pulled in when linking as a static library.
	void createPxgCommon();
}
#endif

View File

@@ -0,0 +1,42 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_COMMON_DEFINES_H
#define PXG_COMMON_DEFINES_H
// !! No includes here, only preprocessor definitions!
// A place for shared defines for the GPU libs
// Maximum number of contact points kept per contact patch on the GPU.
#define PXG_MAX_NUM_POINTS_PER_CONTACT_PATCH 6 // corresponding CPU define is CONTACT_REDUCTION_MAX_CONTACTS
// log2 of the CUDA warp size; WARP_SIZE below evaluates to 32.
#define LOG2_WARP_SIZE 5
#define WARP_SIZE (1U << LOG2_WARP_SIZE)
#define FULL_MASK 0xffffffff //full mask for 32 threads in a warp
#endif

View File

@@ -0,0 +1,35 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_CONTACTS_DEBUG_H
#define PXG_CONTACTS_DEBUG_H
// If this flag is turned on, the contact data is synced in
// fetchNarrowPhaseResults so that it can be examined in the
// validateContactPairs function on the CPU.
#define PXG_CONTACT_VALIDATION 0
#endif

View File

@@ -0,0 +1,92 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_COPY_MANAGER_H
#define PXG_COPY_MANAGER_H
#include "foundation/PxPinnedArray.h"
#include "cudamanager/PxCudaTypes.h"
#define COPY_KERNEL_WARPS_PER_BLOCK 4
namespace physx
{
class PxCudaContextManager;
class PxCudaContext;
class KernelWrangler;
class PxgHeapMemoryAllocatorManager;
// Queues host-to-device copy descriptors and dispatches them in a batch with
// a single kernel launch (see dispatchCopy), instead of issuing one memcpy
// per copy.
class PxgCopyManager
{
	PX_NOCOPY(PxgCopyManager)
public:
	// Descriptor for one deferred host-to-device copy. 16-byte aligned so it
	// can be loaded efficiently on the device.
	PX_ALIGN_PREFIX(16)
	struct CopyDesc
	{
		size_t dest;   // destination address
		size_t source; // source address
		size_t bytes;  // number of bytes to copy
		size_t pad;    // padding only; intentionally not copied below
		// Volatile assignment so descriptors can be written into memory shared
		// with the GPU (the queue below is pinned). 'pad' carries no data and
		// is deliberately skipped.
		PX_CUDA_CALLABLE void operator= (const CopyDesc& ref) volatile
		{
			dest = ref.dest;
			source = ref.source;
			bytes = ref.bytes;
		}
	} PX_ALIGN_SUFFIX(16);
public:
	PxgCopyManager(PxgHeapMemoryAllocatorManager* heapMemoryManager);
	~PxgCopyManager(){}
	// Blocks until the previously dispatched batch finished, then clears the queue.
	void waitAndReset(PxCudaContext* cudaContext);
	// Appends a copy descriptor to the pending queue.
	void pushDeferredHtoD(const CopyDesc& desc);
	// Launches the batched copy of all queued descriptors on 'stream'.
	void dispatchCopy(CUstream stream, PxCudaContextManager* cudaContextManager, KernelWrangler* kernelWrangler);
	void createFinishedEvent(PxCudaContext* cudaContext);
	void destroyFinishedEvent(PxCudaContext* cudaContext);
protected:
	// Drops all queued descriptors without any synchronization.
	void resetUnsafe() { mNumDescriptors = 0; }
	bool hasFinishedCopying(PxCudaContext* cudaContext) const;
	PxInt8ArrayPinned mDescriptorsQueue; // pinned storage holding the queued CopyDescs
	PxU32 mNumDescriptors;               // number of descriptors currently queued
	CUevent mFinishedEvent;              // signaled when a dispatched batch completes
	bool mEventRecorded;                 // whether mFinishedEvent has been recorded
	PxgHeapMemoryAllocatorManager* mHeapMemoryManager;
};
}
#endif

View File

@@ -0,0 +1,157 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_CUDA_BUFFER_H
#define PXG_CUDA_BUFFER_H
#include "foundation/PxPreprocessor.h"
#if PX_LINUX && PX_CLANG
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdocumentation"
#pragma clang diagnostic ignored "-Wdisabled-macro-expansion"
#endif
#include "cuda.h"
#if PX_LINUX && PX_CLANG
#pragma clang diagnostic pop
#endif
#include "PxgHeapMemAllocator.h"
#include "PxgDevicePointer.h"
namespace physx
{
class PxCudaContext;
// Owns a single device-memory allocation obtained from the heap allocator
// identified by a PxsHeapStats group. Starts out empty (null pointer, zero
// size); allocation/deallocation are defined out of line.
class PxgCudaBuffer
{
	PX_NOCOPY(PxgCudaBuffer)

public:
	PxgCudaBuffer(PxgHeapMemoryAllocatorManager* heapMemoryManager, PxsHeapStats::Enum statGroup)
		: mPtr(0)
		, mHeapMemoryAllocator(heapMemoryManager->mDeviceMemoryAllocators)
		, mSize(0)
		, mStatGroup(statGroup)
	{
	}

	~PxgCudaBuffer();

	void allocate(const PxU64 size, const char* filename, PxI32 line);
	// Allocates a new buffer of 'size' bytes and schedules an async
	// device-to-device copy of the old contents on 'stream'.
	void allocateCopyOldDataAsync(const PxU64 size, PxCudaContext* cudaContext, CUstream stream, const char* filename, PxI32 line);
	void deallocate();

	/* defer deallocation until the beginning of the next simulation step */
	void deallocateDeferred();

	// Device pointer rounded up to a 128-byte boundary; may differ from the
	// raw stored pointer when the allocation is not already 128-byte aligned.
	PX_FORCE_INLINE CUdeviceptr getDevicePtr() const { return (mPtr + 127) & (~127); }
	PX_FORCE_INLINE PxU64 getSize() const { return mSize; }

	// Adopts ptr/size verbatim; does not free any previous allocation.
	PX_FORCE_INLINE void set(CUdeviceptr ptr, PxU64 size) { mPtr = ptr; mSize = size; }

	// Exchanges the contents of the two buffers.
	// Fix: swap the raw stored pointers (mPtr) instead of the values returned
	// by getDevicePtr(). The previous implementation wrote the 128-byte-rounded
	// pointer back via set(), losing the original allocation address whenever
	// mPtr was not already 128-byte aligned, so a later deallocate() would
	// operate on the wrong pointer. (assign() below already uses mPtr.)
	static void swapBuffer(PxgCudaBuffer& buf0, PxgCudaBuffer& buf1)
	{
		const CUdeviceptr tempPtr = buf0.mPtr;
		const PxU64 tempSize = buf0.mSize;
		buf0.mPtr = buf1.mPtr;
		buf0.mSize = buf1.mSize;
		buf1.mPtr = tempPtr;
		buf1.mSize = tempSize;
	}

	// Takes over b1's allocation after releasing this buffer's current one;
	// b1 is left empty. Both buffers must share allocator and stat group.
	void assign(PxgCudaBuffer& b1)
	{
		PX_ASSERT(mHeapMemoryAllocator == b1.mHeapMemoryAllocator);
		PX_ASSERT(mStatGroup == b1.mStatGroup);
		deallocate();
		mPtr = b1.mPtr;
		mSize = b1.mSize;
		b1.mPtr = 0;
		b1.mSize = 0;
	}

protected:
	CUdeviceptr mPtr;                    // raw (unaligned) device pointer; 0 when empty
	PxgHeapMemoryAllocator* mHeapMemoryAllocator;
	PxU64 mSize;                         // allocation size in bytes
	const PxsHeapStats::Enum mStatGroup; // heap-stats bucket this buffer accounts to
};
// Thin typed wrapper over PxgCudaBuffer: sizes are expressed in elements of T
// instead of bytes, and pointers are returned with the element type applied.
template <typename T>
class PxgTypedCudaBuffer : public PxgCudaBuffer
{
public:
	PxgTypedCudaBuffer(PxgHeapMemoryAllocatorManager* heapMemoryManager, PxsHeapStats::Enum statGroup)
		: PxgCudaBuffer(heapMemoryManager, statGroup)
	{ }

	// Allocates storage for nbElements objects of T (nbElements * sizeof(T) bytes).
	PX_FORCE_INLINE void allocateElements(const PxU64 nbElements, const char* filename, PxI32 line) { allocate(nbElements * sizeof(T), filename, line); }

	// Number of whole T elements that fit in the current allocation.
	PX_FORCE_INLINE PxU64 getNbElements() const { return mSize / sizeof(T); }

	// Aligned device pointer wrapped in a typed PxgDevicePointer.
	PX_FORCE_INLINE PxgDevicePointer<T> getTypedDevicePtr() const { return PxgDevicePointer<T>(getDevicePtr()); }

	// Aligned device pointer reinterpreted as T*.
	PX_FORCE_INLINE T* getTypedPtr() const { return reinterpret_cast<T*>(getDevicePtr()); }
};
template <unsigned int NbBuffers>
class PxgCudaBufferN
{
PxU8 mCudaArrays[sizeof(PxgCudaBuffer)*NbBuffers];
public:
PxgCudaBufferN(PxgHeapMemoryAllocatorManager* heapMemoryManager, PxsHeapStats::Enum statGroup)
{
PxgCudaBuffer* buffers = reinterpret_cast<PxgCudaBuffer*>(mCudaArrays);
for (PxU32 i = 0; i < NbBuffers; ++i)
{
PX_PLACEMENT_NEW(&buffers[i], PxgCudaBuffer)(heapMemoryManager, statGroup);
}
}
~PxgCudaBufferN()
{
PxgCudaBuffer* buffers = reinterpret_cast<PxgCudaBuffer*>(mCudaArrays);
for (PxU32 i = 0; i < NbBuffers; ++i)
{
buffers[i].~PxgCudaBuffer();
}
}
PxgCudaBuffer& operator [](PxU32 index) { PX_ASSERT(index < NbBuffers); return reinterpret_cast<PxgCudaBuffer*>(mCudaArrays)[index]; }
const PxgCudaBuffer& operator [](PxU32 index) const { PX_ASSERT(index < NbBuffers); return reinterpret_cast<const PxgCudaBuffer*>(mCudaArrays)[index]; }
PxgCudaBuffer* begin(){ return reinterpret_cast<PxgCudaBuffer*>(mCudaArrays); }
PxU32 size() { return NbBuffers; }
};
}
#endif

View File

@@ -0,0 +1,254 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_CUDA_HELPERS_H
#define PXG_CUDA_HELPERS_H
#include "foundation/PxPreprocessor.h"
#if PX_LINUX && PX_CLANG
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdocumentation"
#endif
#include <cuda.h>
#if PX_LINUX && PX_CLANG
#pragma clang diagnostic pop
#endif
#include "cudamanager/PxCudaContextManager.h"
#include "cudamanager/PxCudaContext.h"
#include "foundation/PxFoundation.h"
namespace physx
{
/**
* Templated static functions to simplify common CUDA operations.
*
* General guidelines:
*
* - If you want automatic context acquisition, use the functions that take the context manager as a parameter.
* - Otherwise, directly use the ones that take a cudaContext, but don't forget to acquire the context.
* - For allocations/deallocations, see PxgCudaMemoryAllocator. Use the functions provided there.
* - For anything outside the core SDK, use the helpers provided in the extensions.
*
*/
class PxgCudaHelpers
{
public:
	/**
	 * \brief Copies a device buffer to the host
	 *
	 * The cuda context needs to be acquired by the user!
	 */
	template<typename T>
	static void copyDToH(PxCudaContext& cudaContext, T* hostBuffer, const T* deviceBuffer, PxU64 numElements)
	{
		if (!deviceBuffer || !hostBuffer)
			return;

		PxU64 numBytes = numElements * sizeof(T);
		PxCUresult result = cudaContext.memcpyDtoH(hostBuffer, CUdeviceptr(deviceBuffer), numBytes);
		if (result != CUDA_SUCCESS)
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "copyDtoH set failed with error code %i!\n", PxI32(result));
	}

	/**
	 * \brief Copies a device buffer to the host
	 *
	 * The cuda context will get acquired automatically
	 */
	template<typename T>
	static void copyDToH(PxCudaContextManager& cudaContextManager, T* hostBuffer, const T* deviceBuffer, PxU64 numElements)
	{
		PxScopedCudaLock _lock(cudaContextManager);
		copyDToH(*cudaContextManager.getCudaContext(), hostBuffer, deviceBuffer, numElements);
	}

	/**
	 * \brief Copies a host buffer to the device
	 *
	 * The cuda context needs to be acquired by the user!
	 */
	template<typename T>
	static void copyHToD(PxCudaContext& cudaContext, T* deviceBuffer, const T* hostBuffer, PxU64 numElements)
	{
		if (!deviceBuffer || !hostBuffer)
			return;

		PxU64 numBytes = numElements * sizeof(T);
		PxCUresult result = cudaContext.memcpyHtoD(CUdeviceptr(deviceBuffer), hostBuffer, numBytes);
		if (result != CUDA_SUCCESS)
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "copyHtoD set failed with error code %i!\n", PxI32(result));
	}

	/**
	 * \brief Copies a host buffer to the device
	 *
	 * The cuda context will get acquired automatically
	 */
	template<typename T>
	static void copyHToD(PxCudaContextManager& cudaContextManager, T* deviceBuffer, const T* hostBuffer, PxU64 numElements)
	{
		PxScopedCudaLock _lock(cudaContextManager);
		copyHToD(*cudaContextManager.getCudaContext(), deviceBuffer, hostBuffer, numElements);
	}

	/**
	 * \brief Schedules device to host copy operation on the specified stream
	 *
	 * The cuda context needs to be acquired by the user!
	 */
	template<typename T>
	static void copyDToHAsync(PxCudaContext& cudaContext, T* hostBuffer, const T* deviceBuffer, PxU64 numElements, CUstream stream)
	{
		if (!deviceBuffer || !hostBuffer)
			return;

		PxU64 numBytes = numElements * sizeof(T);
		PxCUresult result = cudaContext.memcpyDtoHAsync(hostBuffer, CUdeviceptr(deviceBuffer), numBytes, stream);
		if (result != CUDA_SUCCESS)
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "copyDtoHAsync set failed with error code %i!\n", PxI32(result));
	}

	/**
	 * \brief Schedules device to host copy operation on the specified stream
	 *
	 * The cuda context will get acquired automatically
	 */
	template<typename T>
	static void copyDToHAsync(PxCudaContextManager& cudaContextManager, T* hostBuffer, const T* deviceBuffer, PxU64 numElements, CUstream stream)
	{
		PxScopedCudaLock _lock(cudaContextManager);
		copyDToHAsync(*cudaContextManager.getCudaContext(), hostBuffer, deviceBuffer, numElements, stream);
	}

	/**
	 * \brief Schedules host to device copy operation on the specified stream
	 *
	 * The cuda context needs to be acquired by the user!
	 */
	template<typename T>
	static void copyHToDAsync(PxCudaContext& cudaContext, T* deviceBuffer, const T* hostBuffer, PxU64 numElements, CUstream stream)
	{
		if (!deviceBuffer || !hostBuffer)
			return;

		PxU64 numBytes = numElements * sizeof(T);
		PxCUresult result = cudaContext.memcpyHtoDAsync(CUdeviceptr(deviceBuffer), hostBuffer, numBytes, stream);
		if (result != CUDA_SUCCESS)
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "copyHtoDAsync set failed with error code %i!\n", PxI32(result));
	}

	/**
	 * \brief Schedules host to device copy operation on the specified stream
	 *
	 * The cuda context will get acquired automatically
	 */
	template<typename T>
	static void copyHToDAsync(PxCudaContextManager& cudaContextManager, T* deviceBuffer, const T* hostBuffer, PxU64 numElements, CUstream stream)
	{
		PxScopedCudaLock _lock(cudaContextManager);
		copyHToDAsync(*cudaContextManager.getCudaContext(), deviceBuffer, hostBuffer, numElements, stream);
	}

	/**
	 * \brief Schedules device to device copy operation on the specified stream
	 *
	 * The cuda context needs to be acquired by the user!
	 */
	template<typename T>
	static void copyDToDAsync(PxCudaContext& cudaContext, T* dstDeviceBuffer, const T* srcDeviceBuffer, PxU64 numElements, CUstream stream)
	{
		if (!srcDeviceBuffer || !dstDeviceBuffer)
			return;

		PxU64 numBytes = numElements * sizeof(T);
		PxCUresult result = cudaContext.memcpyDtoDAsync(CUdeviceptr(dstDeviceBuffer), CUdeviceptr(srcDeviceBuffer), numBytes, stream);
		if (result != CUDA_SUCCESS)
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "copyDtoDAsync set failed with error code %i!\n", PxI32(result));
	}

	/**
	 * \brief Schedules device to device copy operation on the specified stream
	 *
	 * The cuda context will get acquired automatically
	 */
	template<typename T>
	static void copyDToDAsync(PxCudaContextManager& cudaContextManager, T* dstDeviceBuffer, const T* srcDeviceBuffer, PxU64 numElements, CUstream stream)
	{
		PxScopedCudaLock _lock(cudaContextManager);
		// Fix: pass the element count through unchanged. The overload above
		// multiplies by sizeof(T) itself; the previous code forwarded
		// numElements * sizeof(T), making the scheduled copy sizeof(T) times
		// too large (out-of-bounds reads/writes for any T larger than 1 byte).
		copyDToDAsync(*cudaContextManager.getCudaContext(), dstDeviceBuffer, srcDeviceBuffer, numElements, stream);
	}

	/**
	 * \brief Schedules a memset operation on the device on the specified stream. Only supported for 1 and 4 byte types.
	 *
	 * The cuda context needs to be acquired by the user!
	 */
	template<typename T>
	static void memsetAsync(PxCudaContext& cudaContext, T* dstDeviceBuffer, const T& value, PxU64 numElements, CUstream stream)
	{
		PX_COMPILE_TIME_ASSERT(sizeof(T) == sizeof(PxU32) || sizeof(T) == sizeof(PxU8));

		if (!dstDeviceBuffer)
			return;

		PxU64 numBytes = numElements * sizeof(T);
		PxCUresult result = CUDA_SUCCESS;

		// memsetD32Async takes a count of 32-bit words, so divide the byte
		// count by 4 for 4-byte types; memsetD8Async takes a byte count.
		if (sizeof(T) == sizeof(PxU32))
			result = cudaContext.memsetD32Async(CUdeviceptr(dstDeviceBuffer), reinterpret_cast<const PxU32&>(value), numBytes >> 2, stream);
		else
			result = cudaContext.memsetD8Async(CUdeviceptr(dstDeviceBuffer), reinterpret_cast<const PxU8&>(value), numBytes, stream);

		if (result != CUDA_SUCCESS)
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "Memset failed with error code %i!\n", PxI32(result));
	}

	/**
	 * \brief Schedules a memset operation on the device on the specified stream. Only supported for 1 byte or 4 byte data types.
	 *
	 * The cuda context will get acquired automatically
	 */
	template<typename T>
	static void memsetAsync(PxCudaContextManager& cudaContextManager, T* dstDeviceBuffer, const T& value, PxU64 numElements, CUstream stream)
	{
		PxScopedCudaLock _lock(cudaContextManager);
		memsetAsync(*cudaContextManager.getCudaContext(), dstDeviceBuffer, value, numElements, stream);
	}
};
}; // namespace physx
#endif

View File

@@ -0,0 +1,127 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_CUDA_MEMORY_ALLOCATOR_H
#define PXG_CUDA_MEMORY_ALLOCATOR_H
#include "foundation/PxPreprocessor.h"
#include "foundation/PxAllocator.h"
#include "foundation/PxSimpleTypes.h"
#include "foundation/PxUserAllocated.h"
#include "cudamanager/PxCudaContextManager.h"
namespace physx
{
class PxCudaContext;
// Whenever possible, use the macros provided below instead of these functions.
void* PxgCudaDeviceMemoryAllocate(PxCudaContext& cudaContext, size_t size, const char* filename, PxI32 line);
void PxgCudaDeviceMemoryDeallocate(PxCudaContext& cudaContext, void* ptr);
void* PxgPinnedMemoryAllocate(PxCudaContext& cudaContext, size_t size, const char* filename, PxI32 line);
void PxgPinnedMemoryDeallocate(PxCudaContext& cudaContext, void* ptr);
// AD: templated easy-access to the allocation functions:
// Typed device allocation: acquires the CUDA context, allocates
// numElements * sizeof(T) bytes and returns the result as T*.
template<typename T>
T* PxgCudaDeviceMemoryAllocate(PxCudaContextManager& cudaContextManager, PxU64 numElements, const char* filename, PxI32 line)
{
	PxScopedCudaLock ctxLock(cudaContextManager);
	const PxU64 numBytes = numElements * sizeof(T);
	void* mem = PxgCudaDeviceMemoryAllocate(*cudaContextManager.getCudaContext(), numBytes, filename, line);
	return reinterpret_cast<T*>(mem);
}
// Typed device deallocation: frees ptr (if non-null) under the CUDA context
// lock and resets it to NULL. Safe to call with a null pointer.
template<typename T>
void PxgCudaDeviceMemoryDeallocate(PxCudaContextManager& cudaContextManager, T*& ptr)
{
	if (!ptr)
		return;

	PxScopedCudaLock ctxLock(cudaContextManager);
	PxgCudaDeviceMemoryDeallocate(*cudaContextManager.getCudaContext(), ptr);
	ptr = NULL;
}
// Typed pinned-host allocation: acquires the CUDA context, allocates
// numElements * sizeof(T) bytes of pinned memory and returns it as T*.
template<typename T>
T* PxgPinnedMemoryAllocate(PxCudaContextManager& cudaContextManager, PxU64 numElements, const char* filename, PxI32 line)
{
	PxScopedCudaLock ctxLock(cudaContextManager);
	const PxU64 numBytes = numElements * sizeof(T);
	void* mem = PxgPinnedMemoryAllocate(*cudaContextManager.getCudaContext(), numBytes, filename, line);
	return reinterpret_cast<T*>(mem);
}
// Typed pinned-host deallocation: frees ptr (if non-null) under the CUDA
// context lock and resets it to NULL. Safe to call with a null pointer.
template<typename T>
void PxgPinnedMemoryDeallocate(PxCudaContextManager& cudaContextManager, T*& ptr)
{
	if (!ptr)
		return;

	PxScopedCudaLock ctxLock(cudaContextManager);
	PxgPinnedMemoryDeallocate(*cudaContextManager.getCudaContext(), ptr);
	ptr = NULL;
}
// Pinned Memory allocator - allocates a large block of memory and then suballocates to consumers.
// Can only be grown using reserveAndGrow - no copy will be performed and the grow operation is most
// likely a large allocation - think about performance.
// Grows linearly; the memory can only be released all at once at the end.
// Consider this a stack-based (bump) allocator for all intents and purposes.
//
// We use this for contact/patch/force streams.
class PxgPinnedHostLinearMemoryAllocator : public PxUserAllocated
{
public:
	PxgPinnedHostLinearMemoryAllocator(PxCudaContextManager* contextManager, const PxU64 size);
	~PxgPinnedHostLinearMemoryAllocator();

	// both of these reserve* operations will invalidate all existing allocations.
	void reserve(const PxU64 size);
	void reserveAndGrow(const PxU64 size);

	// Rewinds the bump pointer. Will invalidate all allocations.
	void reset();

	// Suballocates 'size' bytes with the given alignment from the base block.
	void* allocate(const PxU64 size, const PxU64 alignment);

private:
	void deallocate(); // will deallocate the large base allocation, not the individual chunks!
	PxCudaContext* mCudaContext;

public:
	PxU8* mStart;       // base address of the large pinned allocation
	PxU64 mCurrentSize; // bytes consumed so far by suballocations
	PxU64 mTotalSize;   // total capacity of the base allocation in bytes
};
}
// Convenience wrappers around the typed allocation helpers above; PX_FL
// supplies the caller's file/line for allocation tracking.
#define PX_DEVICE_MEMORY_ALLOC(T, cudaContextManager, numElements) PxgCudaDeviceMemoryAllocate<T>(cudaContextManager, numElements, PX_FL)
#define PX_DEVICE_MEMORY_FREE(cudaContextManager, deviceBuffer) PxgCudaDeviceMemoryDeallocate(cudaContextManager, deviceBuffer)
#define PX_PINNED_MEMORY_ALLOC(T, cudaContextManager, numElements) PxgPinnedMemoryAllocate<T>(cudaContextManager, numElements, PX_FL)
#define PX_PINNED_MEMORY_FREE(cudaContextManager, ptr) PxgPinnedMemoryDeallocate(cudaContextManager, ptr)
#endif

View File

@@ -0,0 +1,773 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_PAGED_FIRST_FIT_HOLE_ALLOCATOR_H
#define PXG_PAGED_FIRST_FIT_HOLE_ALLOCATOR_H
#include "foundation/PxArray.h"
namespace physx
{
namespace cudaMappedMemAllocatorInternal
{
// Dual-space pointer: tracks one mapped allocation through both its
// CPU-visible address (hostPtr) and its CUDA device address (devPtr).
// All arithmetic is in elements and updates both representations in
// lock-step (devPtr advances in bytes, hence * sizeof(PointedToT));
// the asserts verify the pair never goes out of sync.
template<typename PointedToT>
struct Pointer
{
	// Members intentionally left uninitialized for cheap default construction.
	PX_FORCE_INLINE Pointer() {}
	PX_FORCE_INLINE Pointer(const Pointer& ref) : hostPtr(ref.hostPtr), devPtr(ref.devPtr) {}
	// Null pointer in both address spaces.
	PX_FORCE_INLINE explicit Pointer(const PxZERO&) : hostPtr(NULL), devPtr(0) {}
	PX_FORCE_INLINE Pointer& operator=(const Pointer& ref)
	{
		if (&ref != this)
		{
			hostPtr = ref.hostPtr;
			devPtr = ref.devPtr;
		}
		return *this;
	}
	PX_FORCE_INLINE Pointer& operator+=(const ptrdiff_t& ref)
	{
		hostPtr += ref;
		devPtr += ref * sizeof(PointedToT);
		return *this;
	}
	PX_FORCE_INLINE Pointer& operator-=(const ptrdiff_t& ref)
	{
		hostPtr -= ref;
		devPtr -= ref * sizeof(PointedToT);
		return *this;
	}
	PX_FORCE_INLINE Pointer operator+(const ptrdiff_t& ref) const
	{
		Pointer ret;
		ret.hostPtr = hostPtr + ref;
		ret.devPtr = devPtr + ref * sizeof(PointedToT);
		return ret;
	}
	PX_FORCE_INLINE Pointer operator-(const ptrdiff_t& ref) const
	{
		Pointer ret;
		ret.hostPtr = hostPtr - ref;
		ret.devPtr = devPtr - ref * sizeof(PointedToT);
		return ret;
	}
	// Distance in elements; the assert checks the device-side byte distance agrees.
	PX_FORCE_INLINE ptrdiff_t operator-(const Pointer& ref) const
	{
		ptrdiff_t ret;
		ret = hostPtr - ref.hostPtr;
		PX_ASSERT(static_cast<ptrdiff_t>(ret * sizeof(PointedToT)) ==
			(static_cast<ptrdiff_t>(devPtr)-static_cast<ptrdiff_t>(ref.devPtr)));
		return ret;
	}
	// Ordering is defined by the device address; the assert checks the host
	// address would order the same way.
	PX_FORCE_INLINE bool operator<(const Pointer& ref) const
	{
		// Parenthesized for clarity and for consistency with the Pointer<void>
		// specialization; '<' binds tighter than '==', so behavior is unchanged.
		PX_ASSERT((hostPtr < ref.hostPtr) == (devPtr < ref.devPtr));
		return devPtr < ref.devPtr;
	}
	PX_FORCE_INLINE bool operator>(const Pointer& ref) const
	{
		return ref.operator<(*this);
	}
	PX_FORCE_INLINE bool operator<=(const Pointer& ref) const
	{
		return !(operator>(ref));
	}
	PX_FORCE_INLINE bool operator>=(const Pointer& ref) const
	{
		return !(operator<(ref));
	}
	PX_FORCE_INLINE bool operator==(const Pointer& ref) const
	{
		PX_ASSERT((hostPtr == ref.hostPtr) == (devPtr == ref.devPtr));
		return devPtr == ref.devPtr;
	}
	PX_FORCE_INLINE bool operator!=(const Pointer& ref) const
	{
		return !(operator==(ref));
	}
	//this is to allow NULL ptr comparison
	PX_FORCE_INLINE bool operator==(int i) const
	{
		PX_ASSERT((hostPtr == NULL) == (devPtr == 0));
		return i == 0 && devPtr == 0;
	}
	PX_FORCE_INLINE bool operator!=(int i) const
	{
		return !(operator==(i));
	}
	PointedToT* hostPtr;	// CPU-visible mapped address
	CUdeviceptr devPtr;		// matching device address
};
// void specialization: no element arithmetic (void has no size), only copy
// and comparison of the paired host/device addresses.
template<>
struct Pointer < void >
{
	// Default construction leaves both addresses uninitialized on purpose.
	PX_FORCE_INLINE Pointer() {}
	PX_FORCE_INLINE Pointer(const Pointer& ref) : hostPtr(ref.hostPtr), devPtr(ref.devPtr) {}
	// Null in both address spaces.
	PX_FORCE_INLINE explicit Pointer(const PxZERO&) : hostPtr(NULL), devPtr(0) {}
	PX_FORCE_INLINE Pointer& operator=(const Pointer& ref)
	{
		if (this != &ref)
		{
			devPtr = ref.devPtr;
			hostPtr = ref.hostPtr;
		}
		return *this;
	}
	// Ordering is decided by the device address; the asserts check the host
	// address would give the same answer.
	PX_FORCE_INLINE bool operator<(const Pointer& ref) const
	{
		PX_ASSERT((hostPtr < ref.hostPtr) == (devPtr < ref.devPtr));
		return devPtr < ref.devPtr;
	}
	PX_FORCE_INLINE bool operator>(const Pointer& ref) const
	{
		return ref < *this;
	}
	PX_FORCE_INLINE bool operator<=(const Pointer& ref) const
	{
		return !(*this > ref);
	}
	PX_FORCE_INLINE bool operator>=(const Pointer& ref) const
	{
		return !(*this < ref);
	}
	PX_FORCE_INLINE bool operator==(const Pointer& ref) const
	{
		PX_ASSERT((hostPtr == ref.hostPtr) == (devPtr == ref.devPtr));
		return devPtr == ref.devPtr;
	}
	PX_FORCE_INLINE bool operator!=(const Pointer& ref) const
	{
		return !(*this == ref);
	}
	// Permits comparison against the literal 0 / NULL.
	PX_FORCE_INLINE bool operator==(int i) const
	{
		PX_ASSERT((hostPtr == NULL) == (devPtr == 0));
		return i == 0 && devPtr == 0;
	}
	PX_FORCE_INLINE bool operator!=(int i) const
	{
		return !(*this == i);
	}
	void* hostPtr;
	CUdeviceptr devPtr;
};
// Free operator+ so "offset + pointer" works like the member "pointer + offset".
template<typename PointedToT>
Pointer<PointedToT> operator+(const ptrdiff_t& offset,
	const Pointer<PointedToT>& p)
{
	Pointer<PointedToT> result;
	result.devPtr = p.devPtr + offset * sizeof(PointedToT);
	result.hostPtr = p.hostPtr + offset;
	return result;
}
}
// Type-erases a typed dual pointer into its Pointer<void> form; both
// addresses are carried over unchanged.
template<typename PointedToT>
static PX_FORCE_INLINE cudaMappedMemAllocatorInternal::Pointer<void> castTo(const cudaMappedMemAllocatorInternal::Pointer<PointedToT>& toPtr)
{
	cudaMappedMemAllocatorInternal::Pointer<void> result;
	result.devPtr = toPtr.devPtr;
	result.hostPtr = static_cast<void *>(toPtr.hostPtr);
	return result;
}
// Restores a typed dual pointer from its Pointer<void> form; both addresses
// are carried over unchanged.
template<typename PointedToT>
static PX_FORCE_INLINE cudaMappedMemAllocatorInternal::Pointer<PointedToT> castTo(const cudaMappedMemAllocatorInternal::Pointer<void>& ref)
{
	cudaMappedMemAllocatorInternal::Pointer<PointedToT> result;
	result.devPtr = ref.devPtr;
	result.hostPtr = static_cast<PointedToT *>(ref.hostPtr);
	return result;
}
}
namespace physx
{
namespace cudaPagedFirstFitHoleAllocatorInternal
{
// Sized pointer used by PxgCudaPagedFirstFitHoleAllocator: wraps the backing
// allocator's pointer type and remembers the allocation's byte size (sz) so
// deallocate() knows how much to return. Any pointer arithmetic poisons the
// size with (size_t)-1 - such derived pointers must never be deallocated.
template<typename AllocT, typename PointedToT>
struct Pointer
{
	// Members intentionally left uninitialized for cheap default construction.
	PX_FORCE_INLINE Pointer() {}
	// Fixed: copy from ref.ptr - the member is named 'ptr'; the previous
	// 'ref.hostPtr' would fail to compile once this template is instantiated.
	PX_FORCE_INLINE Pointer(const Pointer& ref) : ptr(ref.ptr), sz(ref.sz) {}
	PX_FORCE_INLINE explicit Pointer(const PxZERO&) : ptr(PxZERO()), sz(0) {}
	PX_FORCE_INLINE Pointer& operator=(const Pointer& ref)
	{
		if (&ref != this)
		{
			// Fixed: was 'ref.hostPtr' (no such member).
			ptr = ref.ptr;
			sz = ref.sz;
		}
		// Fixed: must return *this, not 'this' (pointer/reference mismatch).
		return *this;
	}
	PX_FORCE_INLINE Pointer& operator+=(const ptrdiff_t& ref)
	{
		ptr += ref;
		sz = (size_t)-1;	// derived pointer: size no longer known
		return *this;
	}
	PX_FORCE_INLINE Pointer& operator-=(const ptrdiff_t& ref)
	{
		ptr -= ref;
		sz = (size_t)-1;
		return *this;
	}
	PX_FORCE_INLINE Pointer operator+(const ptrdiff_t& ref) const
	{
		Pointer ret;
		ret.ptr = ptr + ref;
		ret.sz = (size_t)-1;
		return ret;
	}
	PX_FORCE_INLINE Pointer operator-(const ptrdiff_t& ref) const
	{
		Pointer ret;
		ret.ptr = ptr - ref;
		ret.sz = (size_t)-1;
		return ret;
	}
	PX_FORCE_INLINE ptrdiff_t operator-(const Pointer& ref) const
	{
		ptrdiff_t ret;
		ret = ptr - ref.ptr;
		return ret;
	}
	PX_FORCE_INLINE bool operator<(const Pointer& ref) const
	{
		return ptr < ref.ptr;
	}
	PX_FORCE_INLINE bool operator>(const Pointer& ref) const
	{
		return ref.operator<(*this);
	}
	PX_FORCE_INLINE bool operator<=(const Pointer& ref) const
	{
		return !(operator>(ref));
	}
	PX_FORCE_INLINE bool operator>=(const Pointer& ref) const
	{
		return !(operator<(ref));
	}
	PX_FORCE_INLINE bool operator==(const Pointer& ref) const
	{
		return ptr == ref.ptr;
	}
	PX_FORCE_INLINE bool operator!=(const Pointer& ref) const
	{
		return !(operator==(ref));
	}
	//this is to allow NULL ptr comparison
	PX_FORCE_INLINE bool operator==(int i) const
	{
		return i == 0 && ptr == NULL;
	}
	PX_FORCE_INLINE bool operator!=(int i) const
	{
		return !(operator==(i));
	}
	typename AllocT::template PointerType<PointedToT>::type ptr;	// underlying allocator pointer
	size_t sz;	// allocation byte size, or (size_t)-1 for derived pointers
};
// void specialization: no arithmetic, only copy and comparison of the
// underlying allocator pointer plus the remembered size.
template<typename AllocT>
struct Pointer <AllocT, void >
{
	// Default construction leaves members uninitialized on purpose.
	PX_FORCE_INLINE Pointer() {}
	PX_FORCE_INLINE Pointer(const Pointer& ref) : ptr(ref.ptr), sz(ref.sz) {}
	PX_FORCE_INLINE explicit Pointer(const PxZERO&) : ptr(PxZERO()), sz(0) {}
	PX_FORCE_INLINE Pointer& operator=(const Pointer& ref)
	{
		if (this != &ref)
		{
			sz = ref.sz;
			ptr = ref.ptr;
		}
		return *this;
	}
	// Comparisons consider only the address, never the size.
	PX_FORCE_INLINE bool operator<(const Pointer& ref) const
	{
		return ptr < ref.ptr;
	}
	PX_FORCE_INLINE bool operator>(const Pointer& ref) const
	{
		return ref < *this;
	}
	PX_FORCE_INLINE bool operator<=(const Pointer& ref) const
	{
		return !(*this > ref);
	}
	PX_FORCE_INLINE bool operator>=(const Pointer& ref) const
	{
		return !(*this < ref);
	}
	PX_FORCE_INLINE bool operator==(const Pointer& ref) const
	{
		return ptr == ref.ptr;
	}
	PX_FORCE_INLINE bool operator!=(const Pointer& ref) const
	{
		return !(*this == ref);
	}
	// Permits comparison against the literal 0 / NULL.
	PX_FORCE_INLINE bool operator==(int i) const
	{
		return i == 0 && ptr == NULL;
	}
	PX_FORCE_INLINE bool operator!=(int i) const
	{
		return !(*this == i);
	}
	typename AllocT::template PointerType<void>::type ptr;
	size_t sz;
};
}
// Paged first-fit allocator with hole management.
// Pages are obtained from the backing allocator AllocT; free ranges ("holes")
// are kept in an address-sorted doubly linked list whose nodes live by index
// inside mHoles. allocate() carves the request out of the first hole that
// fits, deallocate() reinserts the range and merges it with adjacent holes.
// Retired Hole entries are recycled through a singly linked pool rooted at
// mFirstDeinitializedHoleIdx.
template<typename AllocT, size_t defaultPageBytesize = 1024, size_t holeAlignment = 256>
class PxgCudaPagedFirstFitHoleAllocator
{
	PX_NOCOPY(PxgCudaPagedFirstFitHoleAllocator)
public:
	//this is a workaround to enable allocators with typedefed pointer types
	template<typename PointedToT>
	struct PointerType
	{
		typedef cudaPagedFirstFitHoleAllocatorInternal::Pointer<AllocT ,PointedToT> type;
	};
	// All list indices start at -1, meaning "no hole".
	PxgCudaPagedFirstFitHoleAllocator(AllocT& alloc): mAlloc(alloc),
		mFirstHoleIdx(-1),
		mLastHoleIdx(-1),
		mFirstDeinitializedHoleIdx(-1)
	{}
	virtual ~PxgCudaPagedFirstFitHoleAllocator()
	{
		resetAndRelease();
	}
	// Returns all pages to the backing allocator and clears the hole lists.
	// Invalidates every pointer previously returned by allocate().
	void resetAndRelease()
	{
		for (typename PxArray<typename AllocT::template PointerType<PxU8>::type >::Iterator it = mPages.begin(), end = mPages.end(); it != end; ++it)
		{
			mAlloc.deallocate(castTo<PxU8>(*it));
		}
		mPages.resize(0);
		mHoles.resize(0);
		mFirstHoleIdx = -1;
		mLastHoleIdx = -1;
		mFirstDeinitializedHoleIdx = -1;
	}
	// First-fit allocation. The request is rounded up to holeAlignment; if no
	// existing hole is large enough a new page is added. Returns a zero
	// (null) pointer on failure.
	typename PointerType<void>::type allocate(size_t byteSize)
	{
		// round the request up to the hole alignment (power of two assumed)
		byteSize = (byteSize + holeAlignment - 1) & ~(holeAlignment - 1);
		// walk the address-sorted hole list until one is big enough
		PxI32 idx = mFirstHoleIdx;
		while (idx != -1)
		{
			Hole& hole = mHoles[(PxU32)idx];
			if (hole.byteSize >= byteSize)
				break;
			idx = hole.nextIndex;
		}
		if (idx == -1)
		{
			idx = addNewPage(byteSize);
			if (idx == -1)
			{
				return typename PointerType<void>::type(PxZERO());
			}
		}
		// carve the allocation from the front of the chosen hole
		Hole& hole = mHoles[(PxU32)idx];
		typename PointerType<void>::type ret;
		ret.ptr = castTo<PxU8>(hole.ptr);
		ret.sz = byteSize;
		hole.ptr += (ptrdiff_t)byteSize;
		PX_ASSERT(hole.byteSize >= byteSize);
		hole.byteSize -= byteSize;
		if (hole.byteSize == 0)
		{
			// hole fully consumed: unlink it from the list and recycle it
			PxI32& prevIdx = hole.prevIndex != -1 ? mHoles[(PxU32)hole.prevIndex].nextIndex : mFirstHoleIdx;
			PxI32& nextIdx = hole.nextIndex != -1 ? mHoles[(PxU32)hole.nextIndex].prevIndex : mLastHoleIdx;
			prevIdx = hole.nextIndex;
			nextIdx = hole.prevIndex;
			deallocateHole(idx);
		}
		return ret;
	}
	// Returns a range previously obtained from allocate() to the hole list.
	void deallocate(const typename PointerType<void>::type& ptr)
	{
		//don't try to deallocate pointers that were calculated, only those obtained from allocate()
		PX_ASSERT(ptr.sz != (size_t) -1);
		//we don't want to have zero-sized holes from deleting zero-sized allocations (which are valid)
		if (ptr.sz == 0 || ptr.sz == (size_t) -1)
			return;
		deallocateInternal(castTo<PxU8>(ptr.ptr), ptr.sz);
	}
#if PX_CHECKED
	// Debug-only structure validation: no zero-sized live holes, forward and
	// backward list walks agree, live + pooled hole counts sum to
	// mHoles.size(), and live holes are strictly ordered and non-adjacent
	// (i.e. fully coalesced).
	bool consistencyCheck()
	{
		for (PxU32 i = 0; i < mHoles.size(); ++i)
		{
			if (mHoles[i].ptr != NULL && mHoles[i].byteSize == 0)
			{
				PX_ASSERT(false);
				return false;
			}
		}
		// forward walk must end at the recorded tail
		PxI32 prevI = -1;
		PxU32 holeCtr = 0;
		for (PxI32 i = mFirstHoleIdx; i != -1; prevI = i, i = mHoles[(PxU32)i].nextIndex)
		{
			++holeCtr;
		}
		if (prevI != mLastHoleIdx)
		{
			PX_ASSERT(false);
			return false;
		}
		// backward walk must end at the recorded head and count the same
		prevI = -1;
		PxU32 holeCtr2 = 0;
		for (PxI32 i = mLastHoleIdx; i != -1; prevI = i, i = mHoles[(PxU32)i].prevIndex)
		{
			++holeCtr2;
		}
		if (prevI != mFirstHoleIdx)
		{
			PX_ASSERT(false);
			return false;
		}
		if (holeCtr2 != holeCtr)
		{
			PX_ASSERT(false);
			return false;
		}
		// pooled (recycled) entries plus live entries must account for all
		PxU32 holeCtr3 = 0;
		for (PxI32 i = mFirstDeinitializedHoleIdx; i != -1; i = mHoles[(PxU32)i].nextIndex)
		{
			++holeCtr3;
		}
		if (holeCtr2 + holeCtr3 != mHoles.size())
		{
			PX_ASSERT(false);
			return false;
		}
		// holes must be strictly increasing by address and never touch
		prevI = mFirstHoleIdx;
		if (prevI != -1)
		{
			for (PxI32 i = mHoles[(PxU32) prevI].nextIndex; i != -1; prevI = i, i = mHoles[(PxU32) i].nextIndex)
			{
				if (mHoles[(PxU32) prevI].ptr + (ptrdiff_t) mHoles[(PxU32) prevI].byteSize >= mHoles[(PxU32) i].ptr)
				{
					PX_ASSERT(false);
					return false;
				}
			}
		}
		return true;
	}
#endif
protected:
	// One contiguous free range. Linked into the address-sorted hole list
	// via prevIndex/nextIndex (indices into mHoles; -1 terminates).
	struct Hole
	{
		Hole(): ptr(PxZERO()), byteSize(0), prevIndex(-1), nextIndex(-1) {}
		Hole(PxI32 prev, PxI32 next):
			ptr(PxZERO()),
			byteSize(0),
			prevIndex(prev),
			nextIndex(next)
		{
		}
		// Prepares the entry for use as a live hole covering [p, p+sz).
		void initForUse(const typename AllocT::template PointerType<PxU8>::type& p, size_t sz, PxI32 prev, PxI32 next)
		{
			ptr = p;
			byteSize = sz;
			prevIndex = prev;
			nextIndex = next;
		}
		// Clears the entry and chains it into the recycled-hole pool.
		void initForPool(PxI32 next)
		{
			ptr = typename AllocT::template PointerType<PxU8>::type(PxZERO());
			byteSize = 0;
			prevIndex = -1;
			nextIndex = next;
		}
		typename AllocT::template PointerType<PxU8>::type ptr;
		size_t byteSize;
		PxI32 prevIndex;
		PxI32 nextIndex;
	};
	// Grabs a Hole entry, preferring the recycled pool over growing mHoles.
	PxI32 allocateHole()
	{
		PxI32 retIdx;
		if (mFirstDeinitializedHoleIdx != -1)
		{
			retIdx = mFirstDeinitializedHoleIdx;
			mFirstDeinitializedHoleIdx = mHoles[(PxU32) retIdx].nextIndex;
		}
		else
		{
			mHoles.pushBack(Hole(-1, -1));
			retIdx = (PxI32) mHoles.size() - 1;
		}
		return retIdx;
	}
	// Pushes a Hole entry onto the recycled pool.
	void deallocateHole(PxI32 idx)
	{
		PX_ASSERT(idx < (PxI32) mHoles.size());
		mHoles[(PxU32) idx].initForPool(mFirstDeinitializedHoleIdx);
		mFirstDeinitializedHoleIdx = idx;
	}
	// Locates the holes that would precede and follow 'ptr' in address
	// order. Either index may be -1 (insertion at list head/tail).
	PxPair<PxI32, PxI32> findPrevAndNextHoles(const typename AllocT::template PointerType<PxU8>::type& ptr)
	{
		PxI32 i = mFirstHoleIdx;
		while (i != -1 && ptr > mHoles[(PxU32) i].ptr)
			i = mHoles[(PxU32) i].nextIndex;
		if (i == -1)
		{
			return PxPair<PxI32, PxI32>(mLastHoleIdx, -1);
		}
		else
		{
			return PxPair<PxI32, PxI32>(mHoles[(PxU32) i].prevIndex, i);
		}
	}
	// Inserts [ptr, ptr+sz) into the hole list, coalescing with the
	// preceding and/or following hole when the ranges touch. Returns the
	// index of the hole that now covers the range.
	PxI32 deallocateInternal(const typename AllocT::template PointerType<PxU8>::type& ptr, size_t sz)
	{
		PxPair<PxI32, PxI32> prevAndNext = findPrevAndNextHoles(ptr);
		PxI32 newHole = -1;
		// merge with the preceding hole if it ends exactly at ptr
		if (prevAndNext.first != -1)
		{
			Hole& prevHole = mHoles[(PxU32) prevAndNext.first];
			PX_ASSERT(prevHole.ptr + (ptrdiff_t) prevHole.byteSize <= ptr);
			if (prevHole.ptr + (ptrdiff_t) prevHole.byteSize == ptr)
			{
				prevHole.byteSize += sz;
				newHole = prevAndNext.first;
			}
		}
		// otherwise link a fresh hole between prev and next
		if (newHole == -1)
		{
			newHole = allocateHole();
			Hole& hole = mHoles[(PxU32) newHole];
			hole.initForUse(ptr, sz, prevAndNext.first, prevAndNext.second);
			PxI32& prevIdx = prevAndNext.first != -1 ? mHoles[(PxU32) prevAndNext.first].nextIndex : mFirstHoleIdx;
			prevIdx = newHole;
			PxI32& nextIdx = prevAndNext.second != -1 ? mHoles[(PxU32) prevAndNext.second].prevIndex : mLastHoleIdx;
			nextIdx = newHole;
		}
		Hole& hole = mHoles[(PxU32) newHole];
		// merge with the following hole if the freed range ends exactly at it
		if (prevAndNext.second != -1)
		{
			Hole& nextHole = mHoles[(PxU32) prevAndNext.second];
			PX_ASSERT(ptr + (ptrdiff_t) sz <= nextHole.ptr);
			if (ptr + (ptrdiff_t) sz == nextHole.ptr)
			{
				hole.byteSize += nextHole.byteSize;
				hole.nextIndex = nextHole.nextIndex;
				PxI32& nextIdx = hole.nextIndex != -1 ? mHoles[(PxU32) hole.nextIndex].prevIndex : mLastHoleIdx;
				nextIdx = newHole;
				deallocateHole(prevAndNext.second);
			}
		}
		return newHole;
	}
	// Allocates a new page (at least defaultPageBytesize) from the backing
	// allocator and registers it as one big hole. Returns the hole index,
	// or -1 when the backing allocation failed.
	PxI32 addNewPage(size_t requestedAllocByteSize)
	{
		size_t sz = PxMax(requestedAllocByteSize, defaultPageBytesize);
		mPages.pushBack(castTo<PxU8>(mAlloc.allocate(sz)));
		PX_ASSERT(mPages.back() != 0);
		if (mPages.back() == 0)
		{
			mPages.popBack();
			return -1;
		}
		//by "deallocating" the newly allocated mem we put into the sorted free mem list - holes management inside
		PxI32 newHole = deallocateInternal(mPages.back(), sz);
		return newHole;
	}
	AllocT& mAlloc;
	PxArray<typename AllocT::template PointerType<PxU8>::type > mPages;	// pages owned by this allocator
	PxArray<Hole> mHoles;				// hole entries (live list + recycled pool share this storage)
	PxI32 mFirstHoleIdx;				// head of the address-sorted live hole list (-1 when empty)
	PxI32 mLastHoleIdx;					// tail of the live hole list (-1 when empty)
	PxI32 mFirstDeinitializedHoleIdx; //pool of holes
};
// Free operator+ for the "offset + pointer" ordering; the result carries a
// poisoned size ((size_t)-1) like all derived pointers.
// NOTE(review): the parameter type is a non-deduced context
// (typename ...::template PointerType<...>::type), so template argument
// deduction can never select this overload - it appears to be effectively
// unreachable; confirm before relying on it.
template<typename AllocT, size_t defaultPageBytesize, size_t holeAlignment, typename PointedToT>
typename PxgCudaPagedFirstFitHoleAllocator<AllocT, defaultPageBytesize, holeAlignment>::template PointerType<PointedToT>::type operator+(const ptrdiff_t& argL,
	const typename PxgCudaPagedFirstFitHoleAllocator<AllocT, defaultPageBytesize, holeAlignment>::template PointerType<PointedToT>::type& argR)
{
	typename PxgCudaPagedFirstFitHoleAllocator<AllocT, defaultPageBytesize, holeAlignment>::template PointerType<PointedToT>::type ret;
	ret.ptr = argR.ptr + argL;
	ret.sz = (size_t) -1;
	return ret;
}
}
#endif

View File

@@ -0,0 +1,188 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_CUDA_PAGED_LINEAR_ALLOCATOR_H
#define PXG_CUDA_PAGED_LINEAR_ALLOCATOR_H
#include "foundation/PxArray.h"
#include "foundation/PxMath.h"
#include "foundation/PxMutex.h"
namespace physx
{
// Paged linear (bump) allocator on top of a backing allocator AllocT.
// Allocations are carved sequentially out of pages; individual frees are not
// supported - use reset() to rewind (pages kept for reuse) or
// resetAndRelease() to also return the pages to AllocT.
// NOTE(review): mMutex is public and never used internally; callers appear
// to be responsible for locking - confirm at the call sites.
template<typename AllocT>
class PxgCudaPagedLinearAllocator
{
	PX_NOCOPY(PxgCudaPagedLinearAllocator)
public:
	PxgCudaPagedLinearAllocator(AllocT& alloc, const size_t defaultPageBytesize = 1024 * 1024 ) :
		mAlloc(alloc),
		mCurrOffsetBytes(0),
		mCurrPage(0),
		mCurrPageSize(0),	// fixed: was left uninitialized until the first page was added
		mDefaultPageBytesize(defaultPageBytesize)
	{}
	virtual ~PxgCudaPagedLinearAllocator()
	{
		resetAndRelease();
	}
	// Returns all pages to the backing allocator and clears all bookkeeping.
	void resetAndRelease()
	{
		reset();
		for (PxU32 i = 0; i < mPages.size(); ++i)
		{
			mAlloc.deallocate(mPages[i]);
		}
		mPages.resize(0);
		// fixed: mPagesSize was previously left populated here, so the size
		// array went out of sync with mPages once new pages were added after
		// a resetAndRelease(), corrupting the page-size lookups in allocate().
		mPagesSize.resize(0);
	}
	// Rewinds to the start of the first page without releasing any memory.
	void reset()
	{
		mCurrPage = 0;
		mCurrOffsetBytes = 0;
		mCurrPageSize = mPagesSize.size() == 0 ? 0 : mPagesSize[0];
	}
	//attention: no alignment!
	// Bump-allocates byteSize bytes; moves to (or adds) a page that can hold
	// the request. Returns NULL when the backing allocator fails.
	void* allocate(size_t byteSize)
	{
		bool outOfMem = false;
		bool empty = mPages.empty();
		// current page exhausted: advance to the next page (if any)
		if (!empty && (mCurrOffsetBytes + byteSize) >= mCurrPageSize)
		{
			mCurrOffsetBytes = 0;
			++mCurrPage;
			if (mCurrPage < mPages.size())
				mCurrPageSize = mPagesSize[mCurrPage];
			else
				mCurrPageSize = 0;
		}
		if (empty || mCurrOffsetBytes + byteSize >= mCurrPageSize)
		{
			//Let's first iterate through all the pages to find if any are large-enough
			bool found = false;
			for (PxU32 i = mCurrPage; i < mPages.size(); ++i)
			{
				if (mPagesSize[i] >= byteSize)
				{
					found = true;
					mCurrPage = i;
					mCurrPageSize = mPagesSize[i];
					break;
				}
			}
			if (!found)
				outOfMem = !addNewPage(byteSize);
		}
		if (outOfMem)
		{
			return NULL;
		}
		ptrdiff_t offs = (ptrdiff_t)mCurrOffsetBytes;
		mCurrOffsetBytes += byteSize;
		return mPages[mCurrPage] + offs;
	}
	// Allocates byteSize bytes aligned to 'alignment' (power of two) by
	// over-allocating and offsetting into the padded range.
	void* allocateAligned(size_t alignment, size_t byteSize)
	{
#if 0
		size_t pad = alignment - 1 + sizeof(size_t); // store offset for delete.
#else
		size_t pad = alignment - 1;
#endif
		size_t newByteSize = byteSize + pad;
		PxU8* basePtr = reinterpret_cast<PxU8*>(allocate(newByteSize));
#if 0
		size_t ptrAligningOffset = basePtr.getAligningOffset(alignment, sizeof(size_t));
		typename AllocT::template Pointer<PxU8> offsetPtr = basePtr + ptrAligningOffset;
		// wide mask
		((size_t*)ptr)[-1] = size_t(ptr - base); // store offset
#else
		// offset within [0, alignment) that brings basePtr to the alignment
		size_t alignOffs = alignment - (size_t(basePtr) & (alignment - 1));
		size_t ptrAligningOffset = (alignOffs & (alignment - 1));
		PxU8* offsetPtr = basePtr + (ptrdiff_t) ptrAligningOffset;
#endif
		return offsetPtr;
	}
	PxMutex mMutex;
protected:
	// Appends a page of at least mDefaultPageBytesize bytes and makes it the
	// current page. Returns false when the backing allocation failed.
	bool addNewPage(size_t requestedAllocByteSize)
	{
		const size_t size = PxMax(requestedAllocByteSize, mDefaultPageBytesize);
		mPages.pushBack(reinterpret_cast<PxU8*>(mAlloc.allocate(size, 0, PX_FL)));
		mPagesSize.pushBack(size);
		PX_ASSERT(mPages.back() != 0);
		if (mPages.back() == 0)
		{
			mPages.popBack();
			// keep the two arrays in sync on failure as well
			mPagesSize.popBack();
			return false;
		}
		mCurrOffsetBytes = 0;
		mCurrPage = mPages.size() - 1;
		mCurrPageSize = size;
		return true;
	}
	AllocT& mAlloc;
	PxArray<PxU8*> mPages;			// page base pointers
	PxArray<size_t> mPagesSize;		// byte size of each page (parallel to mPages)
	size_t mCurrOffsetBytes;		// bump offset within the current page
	PxU32 mCurrPage;				// index of the page currently being filled
	size_t mCurrPageSize;			// cached size of the current page
	size_t mDefaultPageBytesize;	// minimum size of a newly added page
};
}
#endif

View File

@@ -0,0 +1,116 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_CUDA_UTILS_H
#define PXG_CUDA_UTILS_H
#include "cuda.h"
#include "foundation/PxErrors.h"
#include "foundation/PxAssert.h"
#include "foundation/PxFoundation.h"
#include "foundation/PxPreprocessor.h"
#include "foundation/PxTime.h"
#include "cudamanager/PxCudaContext.h"
namespace physx
{
/**
Utility function to synchronize 2 streams. This causes dependentStream to wait for parentStream to complete its current queued workload before proceeding further.
A temporary (timing-disabled) event is created, recorded on parentStream, waited on by dependentStream, and destroyed again.
*/
PX_INLINE void synchronizeStreams(PxCudaContext* cudaContext, const CUstream& parentStream, const CUstream& dependentStream)
{
	CUevent ev = 0;
	// Fixed: event creation can fail too (e.g. on a dead context); bail out
	// early instead of recording/destroying an invalid event, matching the
	// error handling of the other two driver calls below.
	CUresult result = cudaContext->eventCreate(&ev, CU_EVENT_DISABLE_TIMING);
	if (result != CUDA_SUCCESS)
	{
		PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "SynchronizeStreams cuEventCreate failed with error %i\n", result);
		PX_ASSERT(result == CUDA_SUCCESS);
		return;
	}
	result = cudaContext->eventRecord(ev, parentStream);
	if (result != CUDA_SUCCESS)
		PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "SynchronizeStreams cuEventRecord failed with error %i\n", result);
	PX_ASSERT(result == CUDA_SUCCESS);
	result = cudaContext->streamWaitEvent(dependentStream, ev);
	if (result != CUDA_SUCCESS)
		PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "SynchronizeStreams cuStreamWaitEvent failed with error %i\n", result);
	PX_ASSERT(result == CUDA_SUCCESS);
	cudaContext->eventDestroy(ev);
}
/**
Utility function to synchronize 2 streams. This causes dependentStream to wait for parentStream to complete its current queued workload before proceeding further.
This overload reuses a caller-provided event, so no event is created or destroyed here.
*/
PX_INLINE void synchronizeStreams(PxCudaContext* cudaContext, CUstream& parentStream, CUstream& dependentStream, CUevent& ev)
{
	// Record the caller's event on the parent stream...
	CUresult status = cudaContext->eventRecord(ev, parentStream);
	if (status != CUDA_SUCCESS)
		PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "SynchronizeStreams cuEventRecord failed with error %i\n", status);
	PX_ASSERT(status == CUDA_SUCCESS);
	// ...and make the dependent stream wait on it.
	status = cudaContext->streamWaitEvent(dependentStream, ev);
	if (status != CUDA_SUCCESS)
		PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "SynchronizeStreams cuStreamWaitEvent failed with error %i\n", status);
	PX_ASSERT(status == CUDA_SUCCESS);
}
// Looks up the device-side address that corresponds to a host pointer from
// page-locked mapped memory and returns it as a plain void*.
PX_FORCE_INLINE void* getMappedDevicePtr(PxCudaContext* cudaContext, void* cpuPtr)
{
	CUdeviceptr mapped = 0;
	cudaContext->memHostGetDevicePointer(&mapped, cpuPtr, 0);
	return reinterpret_cast<void*>(mapped);
}
// const-pointer convenience wrapper around getMappedDevicePtr().
PX_FORCE_INLINE const void* getMappedDeviceConstPtr(PxCudaContext* cudaContext, const void* cpuPtr)
{
	void* nonConstPtr = const_cast<void*>(cpuPtr);
	return getMappedDevicePtr(cudaContext, nonConstPtr);
}
// Busy-waits until waitValue becomes non-zero (returns true) or
// timeoutValue seconds have elapsed (returns false).
PX_FORCE_INLINE bool spinWait(volatile PxU32& waitValue, const PxReal timeoutValue)
{
	PxTime timer;
	for (;;)
	{
		if (waitValue != 0)
			return true;
		if (PxReal(timer.peekElapsedSeconds()) >= timeoutValue)
			return false;
	}
}
}
#endif

View File

@@ -0,0 +1,54 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_DEVICE_POINTER_H
#define PXG_DEVICE_POINTER_H
#include "foundation/PxPreprocessor.h"
namespace physx
{
//This should be a basic pointer wrapper that has the same memory footprint as a raw pointer. Please don't add additional members to the struct.
template <typename T>
struct PxgDevicePointer
{
	// Raw CUDA device address; must stay the only member so the struct's
	// footprint matches a plain CUdeviceptr (checked by the compile-time
	// assert that follows the struct).
	CUdeviceptr mPtr;
	// Non-explicit on purpose: allows seamless construction from a raw
	// CUdeviceptr.
	PxgDevicePointer(CUdeviceptr ptr) : mPtr(ptr) {}
	operator CUdeviceptr& () { return mPtr; }
	operator CUdeviceptr() const { return mPtr; }
	// Reinterprets the stored device address as a typed pointer.
	T* getPointer() const { return reinterpret_cast<T*>(mPtr); }
};
PX_COMPILE_TIME_ASSERT(sizeof(PxgDevicePointer<PxU32>) == sizeof(CUdeviceptr));
}
#endif

View File

@@ -0,0 +1,178 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_HEAP_MEM_ALLOCATOR_H
#define PXG_HEAP_MEM_ALLOCATOR_H
#include "foundation/PxAssert.h"
#include "foundation/PxBitUtils.h"
#include "foundation/PxHashMap.h"
#include "foundation/PxArray.h"
#include "foundation/PxPool.h"
#include "foundation/PxMutex.h"
#include "PxsHeapMemoryAllocator.h"
#include "common/PxPhysXCommonConfig.h"
#if PX_DEBUG
#include "PxgMemoryTracker.h"
#endif
namespace physx
{
class PxsMemoryManager;
// Node of the intrusive doubly linked free-block list managed by Block.
// Stores the free slot's offset within its root allocation together with the
// index of that root.
class BlockHeader
{
public:
	PX_FORCE_INLINE BlockHeader(const PxU32 offset, PxU32 rootIndex) : mOffset(offset), mRootIndex(rootIndex), mPrev(NULL), mNext(NULL)
	{
	}
	// Re-initializes a pooled node for reuse; the node is unlinked (mPrev/mNext cleared).
	PX_FORCE_INLINE void initialize(const PxU32 rootIndex, const PxU32 offset)
	{
		mOffset = offset;
		mRootIndex = rootIndex;
		mPrev = NULL;
		mNext = NULL;
	}
	PxU32 mOffset;			// offset of the free slot within its root allocation
	PxU32 mRootIndex;		// index of the owning root allocation
	BlockHeader* mPrev;		// previous node in the free list (NULL at list head)
	BlockHeader* mNext;		// next node in the free list (NULL at list tail)
};
#define PXG_INVALID_BLOCK 0xFFFFFFFF
// One size class of the heap allocator: an intrusive doubly linked list of
// free BlockHeader nodes (mStartHeader..mEndHeader) plus the count of linked
// headers. insertBlockHeader/removeBlockHeader/findBuddy/isValid are defined
// in the .cpp.
class Block
{
public:
	// Fixed: also initialize mBlockSize and mBlockIndex, which were
	// previously left indeterminate until assigned externally
	// (mBlockIndex starts out invalid - see PXG_INVALID_BLOCK above).
	PX_FORCE_INLINE Block() : mStartHeader(NULL), mEndHeader(NULL), mHeaderSizes(0), mBlockSize(0), mBlockIndex(PXG_INVALID_BLOCK) {}
	PX_FORCE_INLINE bool isEmpty() { return mHeaderSizes == 0; }
	void insertBlockHeader(const PxU32 rootIndex, const PxU32 offset, PxPool<BlockHeader>& pool);
	void removeBlockHeader(BlockHeader* header, PxPool<BlockHeader>& pool);
	// NOTE(review): presumably returns the free header matching
	// offsetToFind/rootIndex (the "buddy" of a freed block) - confirm in the .cpp.
	BlockHeader* findBuddy(const PxU32 offsetToFind, const PxU32 rootIndex);
	BlockHeader* getFreeBlocks(){ return mEndHeader; }
	bool isValid();
	BlockHeader* mStartHeader;	// head of the free-header list
	BlockHeader* mEndHeader;	// tail of the free-header list
	PxU32 mHeaderSizes;			// number of headers currently linked
	PxU32 mBlockSize;
	PxU32 mBlockIndex;
};
// Bookkeeping record kept per live allocation (stored in
// PxgHeapMemoryAllocator::mHashMap, keyed by the allocation's address).
class AllocationValue
{
public:
	PX_FORCE_INLINE AllocationValue(const PxU32 blockIndex, const PxU32 rootIndex, const size_t byteSize, const int group)
		: mBlockIndex(blockIndex), mRootIndex(rootIndex), mByteSize(byteSize), mGroup(group)
	{}
	PxU32 mBlockIndex;	// block (size class) the allocation came from
	PxU32 mRootIndex;	// root allocation the block lives in
	size_t mByteSize;	// allocation size in bytes
	int mGroup;			// caller-supplied allocation group/category
};
// Record of an allocation handled outside the block scheme.
// NOTE(review): the exact criterion (presumably oversized requests) lives in
// the .cpp - confirm there.
struct ExceptionalAlloc
{
	void* address;	// start address of the allocation
	size_t size;	// size in bytes
};
// Heap allocator built on fixed-size blocks carved out of large root
// allocations obtained via a PxVirtualAllocatorCallback. Method bodies are
// defined in the corresponding .cpp.
class PxgHeapMemoryAllocator : public PxsHeapMemoryAllocator
{
public:
	PxgHeapMemoryAllocator(const PxU32 byteSize, PxVirtualAllocatorCallback* allocator);
	~PxgHeapMemoryAllocator();
	void initializeBlocks(const PxU32 rootIndex);
	//return a free block index
	PxU32 getNextFreeBlock(const PxU32 blockIndex, const PxU32 allocationSize, const char* file, const int line);
	// PxVirtualAllocatorCallback
	virtual void* allocate(const size_t byteSize, const int group, const char* file, const int line) PX_OVERRIDE;
	virtual void deallocate(void* ptr) PX_OVERRIDE;
	//~PxVirtualAllocatorCallback
	// Queues a pointer for deallocation; actually released by flushDeferredDeallocs().
	void deallocateDeferred(void* ptr);
	void flushDeferredDeallocs();
	PxU64 getTotalSize();
	PxsHeapStats& getHeapStats() { return mHeapStats; }
#if PX_DEBUG || PX_STOMP_ALLOCATED_MEMORY
	PxVirtualAllocatorCallback* getAllocator() { return mAllocator; } //Used for memcheck support
#endif
private:
	PxHashMap<void*, AllocationValue> mHashMap;//this is used to look up where the block is
	PxArray<Block> mBlocks; //this is used to store fixed-size slots (one entry per size class)
	PxVirtualAllocatorCallback* mAllocator;			// backing allocator for root allocations
	PxArray<void*> mRoots;							// large root allocations the blocks are carved from
	PxArray<ExceptionalAlloc> mExceptionalAllocs;	// allocations handled outside the block scheme
	PxArray<void*> deferredDeallocs;				// pointers queued by deallocateDeferred()
	PxU32 mAllocationSize;
	PxU32 mBitfield;
	PxU64 mTotalMem;
	PxsHeapStats mHeapStats;
	PxPool<BlockHeader> mBlockHeaderPool;			// pool backing the free-list nodes
	PxMutex mMutex;									// guards the allocator state
#if PX_DEBUG
	MemTracker mMemTracker;
#endif
	PX_NOCOPY(PxgHeapMemoryAllocator)
};
// Manager exposing the device-memory PxgHeapMemoryAllocator(s) and their
// aggregate statistics through the PxsHeapMemoryAllocatorManager interface.
class PxgHeapMemoryAllocatorManager : public PxsHeapMemoryAllocatorManager
{
public:
	// 'heapCapacity' presumably sizes the underlying heap(s); 'memoryManager'
	// supplies the allocation callbacks. TODO confirm against the .cpp.
	PxgHeapMemoryAllocatorManager(PxU32 heapCapacity, PxsMemoryManager* memoryManager);
	virtual ~PxgHeapMemoryAllocatorManager();
	// PxsHeapMemoryAllocatorManager
	virtual PxU64 getDeviceMemorySize() const PX_OVERRIDE PX_FINAL;
	virtual PxsHeapStats getDeviceHeapStats() const PX_OVERRIDE PX_FINAL;
	virtual void flushDeferredDeallocs() PX_OVERRIDE PX_FINAL;
	//~PxsHeapMemoryAllocatorManager
	// Device-memory allocator(s); raw pointer -- ownership/lifetime handled in the .cpp.
	PxgHeapMemoryAllocator* mDeviceMemoryAllocators;
};
}
#endif

View File

@@ -0,0 +1,104 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_INTRINSICS_H
#define PXG_INTRINSICS_H
#include "foundation/PxSimpleTypes.h"
#include "cuda.h"
#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
#define __STG_PTR "l"
#else
#define __STG_PTR "r"
#endif
namespace physx
{
// Cache-hinted load/store wrappers. On sm_35+ device compilation they map
// onto the CUDA cache-operator intrinsics; in all other configurations they
// degrade to plain loads/stores so shared code still compiles (e.g. host passes).
#if __CUDA_ARCH__ >= 350
template <typename T>
PX_FORCE_INLINE PX_CUDA_CALLABLE T Pxldca(const T& t) { return __ldca(&t); } // Load, cache at all levels
template <typename T>
PX_FORCE_INLINE PX_CUDA_CALLABLE T Pxldcg(const T& t) { return __ldcg(&t); } // Load, cache at global level (L2, not L1)
template <typename T>
PX_FORCE_INLINE PX_CUDA_CALLABLE T Pxldg(const T& t) { return __ldg(&t); } // Load via the read-only data cache
template <typename T>
PX_FORCE_INLINE PX_CUDA_CALLABLE T Pxldcs(const T& t) { return __ldcs(&t); } // Streaming load: data likely touched only once
template <typename T>
PX_FORCE_INLINE PX_CUDA_CALLABLE T Pxldlu(const T& t) { return __ldlu(&t); } // Last-use load
template <typename T>
PX_FORCE_INLINE PX_CUDA_CALLABLE T Pxldcv(const T& t) { return __ldcv(&t); } // Volatile-style load: don't cache, fetch again each access
// The cache-hinted store intrinsics are only available from CUDA 11 onwards.
#if __CUDACC_VER_MAJOR__ >= 11
template <typename T>
PX_FORCE_INLINE PX_CUDA_CALLABLE void Pxstwb(T* dst, const T& src) { __stwb(dst, src); } // Store, write-back all coherent levels
template <typename T>
PX_FORCE_INLINE PX_CUDA_CALLABLE void Pxstcg(T* dst, const T& src) { __stcg(dst, src); } // Store, cache at global level (L2, not L1)
template <typename T>
PX_FORCE_INLINE PX_CUDA_CALLABLE void Pxstcs(T* dst, const T& src) { __stcs(dst, src); } // Streaming store: data likely accessed only once
template <typename T>
PX_FORCE_INLINE PX_CUDA_CALLABLE void Pxstwt(T* dst, const T& src) { __stwt(dst, src); } // Write-through store (no write caching)
#else
// Pre-CUDA-11 device fallback: plain stores, no cache hints.
template <typename T>
PX_FORCE_INLINE PX_CUDA_CALLABLE void Pxstwb(T* dst, const T& src) { *dst = src; } // Write-back hint unavailable: plain store
template <typename T>
PX_FORCE_INLINE PX_CUDA_CALLABLE void Pxstcg(T* dst, const T& src) { *dst = src; } // Global-cache hint unavailable: plain store
template <typename T>
PX_FORCE_INLINE PX_CUDA_CALLABLE void Pxstcs(T* dst, const T& src) { *dst = src; } // Streaming hint unavailable: plain store
template <typename T>
PX_FORCE_INLINE PX_CUDA_CALLABLE void Pxstwt(T* dst, const T& src) { *dst = src; } // Write-through hint unavailable: plain store
#endif
#else
// Host (or pre-sm_35) fallback: the hints are purely advisory, so plain
// loads/stores are behaviorally equivalent.
template <typename T>
PX_FORCE_INLINE PX_CUDA_CALLABLE T Pxldca(const T& t) { return t; } // Plain load (cache-all hint unavailable)
template <typename T>
PX_FORCE_INLINE PX_CUDA_CALLABLE T Pxldcg(const T& t) { return t; } // Plain load (global-cache hint unavailable)
template <typename T>
PX_FORCE_INLINE PX_CUDA_CALLABLE T Pxldg(const T& t) { return t; } // Plain load (read-only-cache hint unavailable)
template <typename T>
PX_FORCE_INLINE PX_CUDA_CALLABLE T Pxldcs(const T& t) { return t; } // Plain load (streaming hint unavailable)
template <typename T>
PX_FORCE_INLINE PX_CUDA_CALLABLE T Pxldlu(const T& t) { return t; } // Plain load (last-use hint unavailable)
template <typename T>
PX_FORCE_INLINE PX_CUDA_CALLABLE T Pxldcv(const T& t) { return t; } // Plain load (no-cache hint unavailable)
template <typename T>
PX_FORCE_INLINE PX_CUDA_CALLABLE void Pxstwb(T* dst, const T& src) { *dst = src; } // Plain store (write-back hint unavailable)
template <typename T>
PX_FORCE_INLINE PX_CUDA_CALLABLE void Pxstcg(T* dst, const T& src) { *dst = src; } // Plain store (global-cache hint unavailable)
template <typename T>
PX_FORCE_INLINE PX_CUDA_CALLABLE void Pxstcs(T* dst, const T& src) { *dst = src; } // Plain store (streaming hint unavailable)
template <typename T>
PX_FORCE_INLINE PX_CUDA_CALLABLE void Pxstwt(T* dst, const T& src) { *dst = src; } // Plain store (write-through hint unavailable)
#endif
}
#endif

View File

@@ -0,0 +1,47 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_KERNEL_INDICES_H
#define PXG_KERNEL_INDICES_H
namespace physx
{
// Compile-time kernel identifiers. The X-macro include expands every
// KERNEL_DEF(id, name) entry of PxgKernelNames.h into one enumerator, so the
// enumerator order matches the kernel-name table exactly; KERNEL_COUNT is the
// total number of kernels.
// NOTE(review): PxgKernelNames.h carries its own include guard, which limits
// the X-macro to one expansion per translation unit -- confirm this is intended.
struct PxgKernelIds
{
	enum
	{
#define KERNEL_DEF(id, name) id,
#include "PxgKernelNames.h"
#undef KERNEL_DEF
KERNEL_COUNT
	};
};
}
#endif

View File

@@ -0,0 +1,618 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_KERNEL_NAMES_H
#define PXG_KERNEL_NAMES_H
///////////////////////////////////////////////
//simulation controller kernels
///////////////////////////////////////////////
KERNEL_DEF(MERGE_AABBMGR_HANDLES, "mergeChangedAABBMgrHandlesLaunch")
KERNEL_DEF(UPDATE_BODY_EXTERNAL_VELOCITIES, "updateBodyExternalVelocitiesLaunch")
KERNEL_DEF(UPDATE_SHAPES, "updateShapesLaunch")
KERNEL_DEF(UPDATE_BODIES, "updateBodiesLaunch")
KERNEL_DEF(UPDATE_BODIES_DIRECT_API, "updateBodiesLaunchDirectAPI")
KERNEL_DEF(NEW_ARTICULATIONS, "newArticulationsLaunch")
KERNEL_DEF(UPDATE_ARTICULATIONS, "updateArticulationsLaunch")
KERNEL_DEF(UPDATE_JOINTS, "updateJointsLaunch")
KERNEL_DEF(UPDATE_TRANSFORMCACHE_AND_BOUNDARRAY, "updateTransformCacheAndBoundArrayLaunch")
KERNEL_DEF(MERGE_TRANSFORMCACHE_AND_BOUNDARRAY_CHANGES, "mergeTransformCacheAndBoundArrayChanges")
KERNEL_DEF(UPDATE_AABBMGR_HANDLES, "updateChangedAABBMgrHandlesLaunch")
KERNEL_DEF(COMPUTE_FROZEN_UNFROZEN_HISTOGRAM, "computeFrozenAndUnfrozenHistogramLaunch")
KERNEL_DEF(OUTPUT_FROZEN_UNFROZEN_HISTOGRAM, "outputFrozenAndUnfrozenHistogram")
KERNEL_DEF(CREATE_FROZEN_UNFROZEN_ARRAY, "createFrozenAndUnfrozenArray")
//////////////////////////////////////////////
//broad phase kernels
/////////////////////////////////////////////
KERNEL_DEF(BP_TRANSLATE_AABBS, "translateAABBsLaunch")
KERNEL_DEF(BP_MARK_DELETEDPAIRS, "markRemovedPairsLaunch")
KERNEL_DEF(BP_UPDATE_DELETEDPAIRS, "markRemovedPairsProjectionsLaunch")
KERNEL_DEF(BP_UPDATE_UPDATEDPAIRS, "markUpdatedPairsLaunch")
KERNEL_DEF(BP_UPDATE_UPDATEDPAIRS2, "markUpdatedPairsLaunch2")
KERNEL_DEF(BP_UPDATE_CREATEDPAIRS, "markCreatedPairsLaunch")
KERNEL_DEF(BP_INITIALIZE_SAPBOX, "initializeSapBox1DLaunch")
KERNEL_DEF(BP_COMPUTE_ENDPT_HISTOGRAM, "computeEndPtsHistogram")
KERNEL_DEF(BP_OUTPUT_ENDPT_HISTOGRAM, "outputEndPtsHistogram")
KERNEL_DEF(BP_CREATE_REGIONS, "createRegionsKernel")
KERNEL_DEF(BP_COMPUTE_START_REGION_HISTOGRAM, "computeStartRegionsHistogram")
KERNEL_DEF(BP_OUTPUT_START_REGION_HISTOGRAM, "outputStartRegionsHistogram")
KERNEL_DEF(BP_COMPUTE_REGION_HISTOGRAM, "computeRegionsHistogram")
KERNEL_DEF(BP_OUTPUT_REGION_HISTOGRAM, "outputRegionsHistogram")
KERNEL_DEF(BP_WRITEOUT_ACTIVE_HISTOGRAM, "writeOutStartAndActiveRegionHistogram")
KERNEL_DEF(BP_COMPUTE_ACTIVE_HISTOGRAM, "computeStartAndActiveRegionHistogram")
KERNEL_DEF(BP_OUTPUT_ACTIVE_HISTOGRAM, "outputOrderedActiveRegionHistogram")
KERNEL_DEF(BP_COMPUTE_OVERLAPCHECKS_HISTOGRAM, "computeOverlapChecksForRegionsHistogram")
KERNEL_DEF(BP_OUTPUT_OVERLAPCHECKS_HISTOGRAM, "outputOverlapChecksForRegionHistogram")
KERNEL_DEF(BP_CLEAR_NEWFLAG, "clearNewFlagLaunch")
KERNEL_DEF(BP_INITIALIZE_RANKS, "initializeRadixRanks")
KERNEL_DEF(BP_UDPATE_HANDLES, "updateHandles")
KERNEL_DEF(BP_COMPUTE_INCREMENTAL_CMP_COUNTS1, "computeIncrementalComparisonHistograms_Stage1")
KERNEL_DEF(BP_COMPUTE_INCREMENTAL_CMP_COUNTS2, "computeIncrementalComparisonHistograms_Stage2")
KERNEL_DEF(BP_INCREMENTAL_SAP, "performIncrementalSAP")
KERNEL_DEF(BP_GENERATE_FOUNDPAIR_NEWBOUNDS, "generateFoundPairsForNewBoundsRegion")
KERNEL_DEF(BP_WRITEOUT_OVERLAPCHECKS_HISTOGRAM_NEWBOUNDS, "writeOutOverlapChecksForInsertedBoundsRegionsHistogram")
KERNEL_DEF(BP_ACCUMULATE_REPORT_STAGE_1, "accumulateReportsStage_1")
KERNEL_DEF(BP_ACCUMULATE_REPORT_STAGE_2, "accumulateReportsStage_2")
KERNEL_DEF(BP_COPY_REPORTS, "copyReports")
///////////////////////////////////////////////
//narrow phase kernels
//////////////////////////////////////////////
KERNEL_DEF(FINISH_CONTACTS_KERNEL, "finishContactsKernel")
KERNEL_DEF(MEM_COPY_BALANCED_KERNEL, "MemCopyBalanced")
KERNEL_DEF(REMOVE_CONTACT_MANAGERS_1, "removeContactManagers_Stage1")
KERNEL_DEF(REMOVE_CONTACT_MANAGERS_2, "removeContactManagers_Stage2")
KERNEL_DEF(REMOVE_CONTACT_MANAGERS_3, "removeContactManagers_Stage3")
KERNEL_DEF(REMOVE_CONTACT_MANAGERS_4, "removeContactManagers_Stage4")
KERNEL_DEF(REMOVE_CONTACT_MANAGERS_5, "removeContactManagers_Stage5")
KERNEL_DEF(COMPACT_LOST_FOUND_PAIRS_1, "prepareLostFoundPairs_Stage1")
KERNEL_DEF(COMPACT_LOST_FOUND_PAIRS_2, "prepareLostFoundPairs_Stage2")
KERNEL_DEF(SPHERE_KERNEL_MAIN, "sphereNphase_Kernel")
KERNEL_DEF(BOX_BOX_KERNEL_MAIN, "boxBoxNphase_Kernel")
KERNEL_DEF(CONVEX_PLANE_KERNEL_MAIN, "convexPlaneNphase_Kernel")
KERNEL_DEF(CONVEXCORE_PLANE_KERNEL_MAIN, "convexCorePlaneNphase_Kernel")
KERNEL_DEF(CONVEXCORE_CONVEX_KERNEL_MAIN, "convexCoreConvexNphase_Kernel")
KERNEL_DEF(CONVEXCORE_TRIMESH_KERNEL32_MAIN, "convexCoreTrimeshNphase_Kernel32")
KERNEL_DEF(CONVEXCORE_TETMESH_KERNEL32_MAIN, "convexCoreTetmeshNphase_Kernel32")
KERNEL_DEF(CONVEXCORE_CLOTHMESH_KERNEL32_MAIN, "convexCoreClothmeshNphase_Kernel32")
KERNEL_DEF(CONVEX_CONVEX_KERNEL_EARLY_OUT, "convexConvexNphase_stage1Kernel")
KERNEL_DEF(CONVEX_CONVEX_KERNEL_MAIN, "convexConvexNphase_stage2Kernel")
KERNEL_DEF(REMOVE_CONTACT_MANAGERS_5_CVXTRI, "removeContactManagers_Stage5_CvxTri")
KERNEL_DEF(INITIALIZE_MANIFOLDS, "initializeManifolds")
KERNEL_DEF(CONVEX_TRIMESH_MIDPHASE, "midphaseGeneratePairs")
KERNEL_DEF(CONVEX_TRIMESH_CORE, "convexTrimeshNarrowphase")
KERNEL_DEF(CONVEX_TRIMESH_SORT_TRIANGLES, "sortTriangleIndices")
KERNEL_DEF(CONVEX_TRIMESH_POST_PROCESS, "convexTrimeshPostProcess")
KERNEL_DEF(CONVEX_HEIGHTFIELD_POST_PROCESS, "convexHeightfieldPostProcess")
KERNEL_DEF(CONVEX_TRIMESH_CORRELATE, "convexTrimeshCorrelate")
KERNEL_DEF(CONVEX_TRIMESH_FINISHCONTACTS, "convexTrimeshFinishContacts")
KERNEL_DEF(CONVEX_HEIGHTFIELD_MIDPHASE, "convexHeightFieldMidphase")
KERNEL_DEF(CONVEX_HEIGHTFIELD_CORE, "convexHeightfieldNarrowphase")
KERNEL_DEF(SPHERE_TRIMESH_CORE, "sphereTrimeshNarrowphase")
KERNEL_DEF(SPHERE_HEIGHTFIELD_CORE, "sphereHeightfieldNarrowphase")
KERNEL_DEF(TRIMESH_PLANE_CORE, "trimeshPlaneNarrowphase")
KERNEL_DEF(TRIMESH_HEIGHTFIELD_CORE, "trimeshHeightfieldNarrowphase")
KERNEL_DEF(TRIMESH_TRIMESH_CORE, "triangleTriangleCollision")
KERNEL_DEF(TRIMESH_TRIMESH_OVERLAP, "triangleTriangleOverlaps")
KERNEL_DEF(EVALUATE_POINT_DISTANCES_SDF, "evaluatePointDistancesSDFBatch")
KERNEL_DEF(UPDATE_FRICTION_PATCHES, "updateFrictionPatches")
////////////////////////////////////////////////////////////////
//solver kernels
///////////////////////////////////////////////////////////////
KERNEL_DEF(PRE_INTEGRATION, "preIntegrationLaunch")
KERNEL_DEF(CONTACT_CONSTRAINT_PREPREP_BLOCK, "constraintContactBlockPrePrepLaunch")
KERNEL_DEF(JOINT_CONSTRAINT_PREPREP, "constraint1DPrePrepLaunch")
KERNEL_DEF(JOINT_CONSTRAINT_PREPREP_BLOCK, "constraint1DBlockPrePrepLaunch")
KERNEL_DEF(JOINT_CONSTRAINT_PREPARE_BLOCK_PARALLEL, "jointConstraintBlockPrepareParallelLaunch")
KERNEL_DEF(CONTACT_CONSTRAINT_PREPARE_BLOCK_PARALLEL, "contactConstraintBlockPrepareParallelLaunch")
KERNEL_DEF(ZERO_BODIES, "ZeroBodies")
KERNEL_DEF(SOLVE_BLOCK_PARTITION, "solveBlockPartition")
KERNEL_DEF(CONCLUDE_BLOCKS, "concludeBlocks")
KERNEL_DEF(WRITEBACK_BLOCKS, "writebackBlocks")
KERNEL_DEF(WRITE_BACK_BODIES, "writeBackBodies")
KERNEL_DEF(COMPUTE_AVERAGE_VELOCITY, "computeAverageSolverBodyVelocity")
KERNEL_DEF(PROPAGATE_BODY_VELOCITY, "propagateSolverBodyVelocity")
KERNEL_DEF(INITIALIZE_INPUT_AND_RANKS_B, "initialRanksAndBodyIndexB")
KERNEL_DEF(INITIALIZE_INPUT_AND_RANKS_A, "initialRanksAndBodyIndexA")
KERNEL_DEF(RADIXSORT_SINGLEBLOCK, "bodyInputAndRanksSingleBlockLaunch")
KERNEL_DEF(RADIXSORT_CALCULATERANKS, "bodyInputAndRanksBlocksLaunch")
KERNEL_DEF(REORGANIZE_THRESHOLDSTREAM, "reorganizeThresholdElements")
KERNEL_DEF(COMPUTE_ACCUMULATED_THRESHOLDSTREAM, "computeAccumulateThresholdStream")
KERNEL_DEF(OUTPUT_ACCUMULATED_THRESHOLDSTREAM, "outputAccumulateThresholdStream")
KERNEL_DEF(WRITEOUT_ACCUMULATEDFORCEPEROBJECT, "writeoutAccumulatedForcePerObject")
KERNEL_DEF(COMPUTE_EXCEEDEDFORCE_THRESHOLDELEMENT_INDICE, "computeExceededForceThresholdElementIndice")
KERNEL_DEF(OUTPUT_EXCEEDEDFORCE_THRESHOLDELEMENT_INDICE, "outputExceededForceThresholdElementIndice")
KERNEL_DEF(SET_THRESHOLDELEMENT_MASK, "setThresholdElementsMask")
KERNEL_DEF(COMPUTE_THRESHOLDELEMENT_MASK_INDICES, "computeThresholdElementMaskIndices")
KERNEL_DEF(OUTPUT_THRESHOLDELEMENT_MASK_INDICES, "outputThresholdPairsMaskIndices")
KERNEL_DEF(CREATE_FORCECHANGE_THRESHOLDELEMENTS, "createForceChangeThresholdElements")
KERNEL_DEF(SOLVE_UNIFIED, "solveBlockUnified")
KERNEL_DEF(PROPAGATE_STATIC_SOLVER_VELOCITIES, "propagateStaticSolverBodyVelocities")
////////////////////////////////////////////////////////////////
//integration kernels
///////////////////////////////////////////////////////////////
KERNEL_DEF(INTEGRATE_CORE_PARALLEL, "integrateCoreParallelLaunch")
KERNEL_DEF(CLEAR_FRICTION_PATCH_COUNTS, "clearFrictionPatchCounts")
KERNEL_DEF(DMA_CHANGED_ELEMS, "dmaBackChangedElems")
KERNEL_DEF(BP_SIGNAL_COMPLETE, "bpSignalComplete")
KERNEL_DEF(DMA_CONSTRAINT_RESIDUAL, "dmaConstraintResidual")
KERNEL_DEF(DMA_ARTICULATION_RESIDUAL, "dmaArticulationResidual")
////////////////////////////////////////////////////////////////
//articulation kernels
//////////////////////////////////////////////////////////////
KERNEL_DEF(ARTI_DMA_DATA, "dmaBackArticulationDataLaunch")
KERNEL_DEF(ARTI_STATIC_BATCH_PREP_FIRST, "artiSumInternalContactAndJointBatches1Launch")
KERNEL_DEF(ARTI_STATIC_BATCH_PREP_SECOND, "artiSumInternalContactAndJointBatches2Launch")
KERNEL_DEF(ARTI_SOLVE_INTERNAL_CONSTRAINTS, "artiSolveInternalConstraints1T")
KERNEL_DEF(ARTI_SOLVE_INTERNAL_CONSTRAINTS_TGS, "artiSolveInternalConstraintsTGS1T")
KERNEL_DEF(ARTI_SOLVE_INTERNAL_TENDON_AND_MIMIC_JOINT, "artiSolveInternalTendonAndMimicJointConstraints1T")
KERNEL_DEF(ARTI_COMPUTE_UNCONSTRAINED, "computeUnconstrainedVelocities1TLaunch")
KERNEL_DEF(ARTI_COMPUTE_SPATIAL_PARTIAL, "computeUnconstrainedSpatialInertiaLaunchPartial1T")
KERNEL_DEF(ARTI_COMPUTE_UNCONSTRAINED_SPATIAL_INERTIA, "computeUnconstrainedSpatialInertiaLaunch1T")
KERNEL_DEF(ARTI_COMPUTE_MASS_MATRIX, "computeMassMatrix1T")
KERNEL_DEF(ARTI_COMPUTE_UNCONSTRAINED_ACCEL, "computeUnconstrainedAccelerationsLaunch1T")
KERNEL_DEF(ARTI_SAVE_VELOCITY_PGS, "artiSaveVelocity1TPGS")
KERNEL_DEF(ARTI_UPDATE_BODIES, "updateBodiesLaunch1T")
KERNEL_DEF(ARTI_UPDATE_BODIES2, "updateBodiesLaunch_Part2")
KERNEL_DEF(ARTI_UPDATE_BODIES3, "updateBodiesLaunch_Part3")
KERNEL_DEF(ARTI_SETUP_INTERNAL, "setupInternalConstraintLaunch1T")
KERNEL_DEF(ARTI_CONTACT_PREP, "artiContactConstraintBlockPrepareLaunch")
KERNEL_DEF(ARTI_JOINT_PREP, "artiJointConstraintBlockPrepareParallelLaunch")
KERNEL_DEF(ARTI_SOLVE_BLOCK_PARTITION, "artiSolveBlockPartition")
KERNEL_DEF(ARTI_UPDATE_KINEMATIC, "artiUpdateKinematic")
KERNEL_DEF(ARTI_STEP_TGS, "stepArticulation1TTGS")
KERNEL_DEF(ARTI_CONTACT_PREP_TGS, "artiTGSContactConstraintBlockPrepareLaunch")
KERNEL_DEF(ARTI_JOINT_PREP_TGS, "artiTGSJointConstraintBlockPrepareParallelLaunch")
KERNEL_DEF(ARTI_OUTPUT_VELOCITY, "artiOutputVelocity")
KERNEL_DEF(ARTI_PUSH_IMPULSE, "artiPushImpulse")
KERNEL_DEF(ARTI_COMPUTE_DEPENDENCIES, "artiComputeDependencies")
KERNEL_DEF(ARTI_PROPAGATE_IMPULSE_PGS, "artiPropagateImpulses2PGS")
KERNEL_DEF(ARTI_PROPAGATE_IMPULSE_TGS, "artiPropagateImpulses2TGS")
KERNEL_DEF(ARTI_PROPAGATE_VELOCITY, "artiPropagateVelocity")
KERNEL_DEF(ARTI_PROPAGATE_VELOCITY_TGS, "artiPropagateVelocityTGS")
KERNEL_DEF(ARTI_SUM_SELF, "artiSumSelfContactAndJointBatches")
KERNEL_DEF(ARTI_PROPAGATE_RIGID_IMPULSES_AND_SOLVE_SELF, "artiPropagateRigidImpulsesAndSolveSelfConstraints1T")
KERNEL_DEF(ARTI_PROPAGATE_RIGID_IMPULSES_AND_SOLVE_SELF_TGS, "artiPropagateRigidImpulsesAndSolveSelfConstraintsTGS1T")
KERNEL_DEF(ARTI_APPLY_TGS_SUBSTEP_FORCES, "artiApplyTgsSubstepForces")
////////////////////////////////////////////////////////////////
//TGS solver kernels
///////////////////////////////////////////////////////////////
KERNEL_DEF(ZERO_BODIES_TGS, "ZeroBodiesTGS")
KERNEL_DEF(CONCLUDE_BLOCKS_TGS, "concludeBlocksTGS")
KERNEL_DEF(WRITEBACK_BLOCKS_TGS, "writebackBlocksTGS")
KERNEL_DEF(WRITE_BACK_BODIES_TGS, "writeBackBodiesTGS")
KERNEL_DEF(COMPUTE_AVERAGE_VELOCITY_TGS, "computeAverageSolverBodyVelocityTGS")
KERNEL_DEF(INTEGRATE_CORE_PARALLEL_TGS, "integrateCoreParallelLaunchTGS")
KERNEL_DEF(INIT_STATIC_KINEMATICS, "initStaticKinematics")
KERNEL_DEF(TGS_PRE_INTEGRATION, "preIntegrationLaunchTGS")
KERNEL_DEF(TGS_INIT_SOLVER_VELS, "initializeSolverVelocitiesTGS")
KERNEL_DEF(TGS_JOINT_CONSTRAINT_PREPARE_BLOCK_PARALLEL, "jointConstraintBlockPrepareParallelLaunchTGS")
KERNEL_DEF(TGS_CONTACT_CONSTRAINT_PREPARE_BLOCK_PARALLEL, "contactConstraintBlockPrepareParallelLaunchTGS")
KERNEL_DEF(PROPAGATE_AVERAGE_VELOCITY_TGS, "propagateAverageSolverBodyVelocityTGS")
KERNEL_DEF(PROPAGATE_STATIC_SOLVER_VELOCITIES_TGS, "propagateStaticSolverBodyVelocitiesTGS")
KERNEL_DEF(APPLY_TGS_SUBSTEP_GRAVITY, "applyTGSSubstepGravity")
KERNEL_DEF(MARK_ACTIVE_SLAB_PGS, "markActiveSlabPGS")
KERNEL_DEF(MARK_ACTIVE_SLAB_TGS, "markActiveSlabTGS")
//////////////////////////////////////////////////////////////////
//radix sort kernels
//////////////////////////////////////////////////////////////////
KERNEL_DEF(RS_MULTIBLOCK, "radixSortMultiBlockLaunch")
KERNEL_DEF(RS_CALCULATERANKS_MULTIBLOCK, "radixSortMultiCalculateRanksLaunch")
KERNEL_DEF(RS_MULTIBLOCK_COUNT, "radixSortMultiBlockLaunchWithCount")
KERNEL_DEF(RS_CALCULATERANKS_MULTIBLOCK_COUNT, "radixSortMultiCalculateRanksLaunchWithCount")
KERNEL_DEF(RS_MULTIBLOCK_NO_COUNT, "radixSortMultiBlockLaunchWithoutCount")
KERNEL_DEF(RS_CALCULATERANKS_MULTIBLOCK_NO_COUNT, "radixSortMultiCalculateRanksLaunchWithoutCount")
KERNEL_DEF(RS_COPY_HIGH_32BITS, "radixSortCopyHigh32Bits")
KERNEL_DEF(RS_DOUBLE_COPY_HIGH_32BITS2, "radixSortDoubleCopyHigh32Bits")
KERNEL_DEF(RS_COPY_VALUE, "radixSortCopy")
KERNEL_DEF(RS_DOUBLE_COPY_VALUE, "radixSortDoubleCopy")
KERNEL_DEF(RS_COPY_BITS2, "radixSortCopyBits2")
KERNEL_DEF(RS_COPY_VALUE2, "radixSortCopy2")
/////////////////////////////////////////////////////////////////
//accumulate rigid body delta velocity kernels
//those are shared by soft body and particle system
////////////////////////////////////////////////////////////////
KERNEL_DEF(ACCUMULATE_DELTAVEL_RIGIDBODY_FIRST, "accumulateDeltaVRigidFirstLaunch")
KERNEL_DEF(ACCUMULATE_DELTAVEL_RIGIDBODY_SECOND, "accumulateDeltaVRigidSecondLaunch")
KERNEL_DEF(ACCUMULATE_DELTAVEL_RIGIDBODY_SECOND_MULTI1, "accumulateDeltaVRigidSecondLaunchMultiStage1")
KERNEL_DEF(ACCUMULATE_DELTAVEL_RIGIDBODY_SECOND_MULTI2, "accumulateDeltaVRigidSecondLaunchMultiStage2")
KERNEL_DEF(ACCUMULATE_DELTAVEL_RIGIDBODY_MULTI_CLEAR, "clearDeltaVRigidSecondLaunchMulti")
KERNEL_DEF(RIGID_SUM_STATIC_CONTACT1, "rigidSumInternalContactAndJointBatches1")
KERNEL_DEF(RIGID_SUM_STATIC_CONTACT2, "rigidSumInternalContactAndJointBatches2")
KERNEL_DEF(RIGID_SOLVE_STATIC_CONSTRAINTS, "solveStaticBlock")
KERNEL_DEF(RIGID_SOLVE_STATIC_CONSTRAINTS_TGS, "solveStaticBlockTGS")
KERNEL_DEF(RIGID_SOLVE_WHOLE_ISLAND_TGS, "solveWholeIslandTGS")
////////////////////////////////////////////////////////////////
//particle system kernels
///////////////////////////////////////////////////////////////
KERNEL_DEF(PS_UPDATE_UNSORTED_ARRAY, "ps_updateUnsortedArrayLaunch")
KERNEL_DEF(PS_UPDATE_BUFFER_DATA, "ps_updateUserBufferLaunch")
KERNEL_DEF(PS_UPDATE_DIFFUSE_UNSORTED_ARRAY, "ps_updateUnsortedDiffuseArrayLaunch")
KERNEL_DEF(PS_PREINTEGRATION, "ps_preIntegrateLaunch")
KERNEL_DEF(PS_PRE_DIFFUSE_INTEGRATION, "ps_preDiffuseIntegrateLaunch")
KERNEL_DEF(PS_UPDATE_BOUND_FRIST, "ps_updateBoundFirstPassLaunch")
KERNEL_DEF(PS_UPDATE_BOUND_SECOND, "ps_updateBoundSecondPassLaunch")
KERNEL_DEF(PS_CALCULATE_HASH, "ps_calculateHashLaunch")
KERNEL_DEF(PS_CALCULATE_HASH_FOR_DIFFUSE_PARTICLES, "ps_calculateHashForDiffuseParticlesLaunch")
KERNEL_DEF(PS_REORDER_PARTICLE_FIND_CELLSTARTEND, "ps_reorderDataAndFindCellStartLaunch")
KERNEL_DEF(PS_PARTICLE_SELF_COLLISION, "ps_selfCollisionLaunch")
KERNEL_DEF(PS_PRIMITIVES_BOUND_FIRST, "ps_primitivesBoundFirstPassLaunch")
KERNEL_DEF(PS_PRIMITIVES_BOUND_SECOND, "ps_primitivesBoundSecondPassLaunch")
KERNEL_DEF(PS_PRIMITIVES_COLLISION, "ps_primitivesCollisionLaunch")
KERNEL_DEF(PS_CONVEX_COLLISION, "ps_convexCollisionLaunch")
KERNEL_DEF(PS_PRIMITIVES_DIFFUSE_COLLISION, "ps_primitivesDiffuseCollisionLaunch")
KERNEL_DEF(PS_CONVEX_DIFFUSE_COLLISION, "ps_convexDiffuseCollisionLaunch")
KERNEL_DEF(PS_TRIMESH_COLLISION, "ps_meshCollisonLaunch")
KERNEL_DEF(PS_SDF_TRIMESH_COLLISION, "ps_sdfMeshCollisonLaunch")
KERNEL_DEF(PS_HEIGHTFIELD_COLLISION, "ps_heightfieldCollisonLaunch")
KERNEL_DEF(PS_REORDER_PRIMITIVE_CONTACTS, "ps_reorderPrimitiveContactsLaunch")
KERNEL_DEF(PS_CONTACT_PREPARE, "ps_contactPrepareLaunch")
KERNEL_DEF(PS_SOLVE_PC_PARTICLE, "ps_solvePCOutputParticleDeltaVLaunch")
KERNEL_DEF(PS_SOLVE_PC_RIGID, "ps_solvePCOutputRigidDeltaVLaunch")
KERNEL_DEF(PS_SOLVE_ONEWAY_CONTACTS, "ps_solveOneWayContactDeltaVLaunch")
KERNEL_DEF(PS_FIND_RANGESTARTEND_PARTICLE_FIRST, "ps_findStartEndParticleFirstLaunch")
KERNEL_DEF(PS_FIND_RANGESTARTEND_PARTICLE_SECONE, "ps_findStartEndParticleSecondLaunch")
KERNEL_DEF(PS_ACCUMULATE_DELTAVEL_PARTICLE, "ps_accumulateDeltaVParticleLaunch")
KERNEL_DEF(PS_UPDATE_PARTICLE, "ps_updateParticleLaunch")
KERNEL_DEF(PS_INTEGRATION, "ps_integrateLaunch")
KERNEL_DEF(PS_CALCULATE_DENSITIES_AND_POTENTIALS, "ps_calculateDensityAndPotentialLaunch")
KERNEL_DEF(PS_SOLVE_DENSITIES, "ps_solveDensityLaunch")
KERNEL_DEF(PS_APPLY_DELTAS, "ps_applyDeltaLaunch")
KERNEL_DEF(PS_VORTICITY_CONFINEMENT, "ps_vorticityConfinementLaunch")
KERNEL_DEF(PS_SOLVE_VELOCITIES, "ps_solveVelocityLaunch")
KERNEL_DEF(PS_UPDATE_REMAP_VERTS, "ps_updateRemapVertsLaunch")
KERNEL_DEF(PS_SOLVE_SPRINGS, "ps_solveSpringsLaunch")
KERNEL_DEF(PS_INITIALIZE_SPRINGS, "ps_initializeSpringsLaunch")
KERNEL_DEF(PS_AVERAGEVERTS, "ps_averageVertsLaunch")
KERNEL_DEF(PS_AERODYNAMIC_1, "ps_solveAerodynamics1Launch")
KERNEL_DEF(PS_AERODYNAMIC_2, "ps_solveAerodynamics2Launch")
KERNEL_DEF(PS_CALCULATE_INFLATABLE_VOLUME, "ps_calculateInflatableVolume")
KERNEL_DEF(PS_SOLVE_INFLATABLE_VOLUME, "ps_solveInflatableVolume")
KERNEL_DEF(PS_SOLVE_SHAPES, "ps_solveShapes")
KERNEL_DEF(PS_PREP_RIGID_ATTACHMENTS, "ps_rigidAttachmentPrepareLaunch")
KERNEL_DEF(PS_SOLVE_RIGID_ATTACHMENTS, "ps_solveRigidAttachmentsLaunch")
KERNEL_DEF(PS_UPDATE_VOLUME_BOUND, "ps_update_volume_bound")
KERNEL_DEF(PS_UPDATE_SPRING, "ps_updateSprings")
KERNEL_DEF(PS_ACCUMULATE_STATIC_DENSITY, "ps_accumulateStaticDensity")
KERNEL_DEF(PS_ACCUMULATE_RIGID_DENSITY, "ps_accumulateRigidDensity")
KERNEL_DEF(PS_DIFFUSE_PARTICLES_ONE_WAY_COLLISION, "ps_diffuseParticleOneWayCollision")
KERNEL_DEF(PS_DIFFUSE_PARTICLES_UPDATE_PBF, "ps_diffuseParticleUpdatePBF")
KERNEL_DEF(PS_DIFFUSE_PARTICLES_COMPACT, "ps_diffuseParticleCompact")
KERNEL_DEF(PS_DIFFUSE_PARTICLES_CREATE, "ps_diffuseParticleCreate")
KERNEL_DEF(PS_DIFFUSE_PARTICLES_COPY, "ps_diffuseParticleCopy")
KERNEL_DEF(PS_DIFFUSE_PARTICLES_SUM, "ps_diffuseParticleSum")
KERNEL_DEF(PS_FIND_RANGESTARTEND_FEM_FIRST, "ps_findStartEndFEMFirstLaunch")
KERNEL_DEF(PS_RANGESTARTEND_FEM_SECONE, "ps_findStartEndFEMSecondLaunch")
KERNEL_DEF(PS_ACCUMULATE_FEM_PARTICLE_DELTA, "ps_accumulateFEMParticleDeltaVLaunch")
KERNEL_DEF(PS_STEP_PARTICLES, "ps_stepParticlesLaunch")
KERNEL_DEF(PS_SOLVE_PC_PARTICLE_TGS, "ps_solvePCOutputParticleDeltaVTGSLaunch")
KERNEL_DEF(PS_SOLVE_PC_RIGID_TGS, "ps_solvePCOutputRigidDeltaVTGSLaunch")
KERNEL_DEF(PS_SOLVE_RIGID_ATTACHMENTS_TGS, "ps_solveRigidAttachmentsTGSLaunch")
KERNEL_DEF(PS_FINALIZE_PARTICLES, "ps_finalizeParticlesLaunch")
KERNEL_DEF(PS_UPDATE_MATERIALS, "ps_updateMaterials")
//////////////////////////////////////////////////////////
//fem shared kernel names
//////////////////////////////////////////////////////////
KERNEL_DEF(FEM_ATTACHMENT_CONSTRAINT_PREP, "femAttachmentPrepareLaunch")
//////////////////////////////////////////////////////////
//softbody kernel names
//////////////////////////////////////////////////////////
KERNEL_DEF(SB_SIM_PREINTEGRATION, "sb_gm_preIntegrateLaunch")
KERNEL_DEF(SB_REFITBOUND, "sb_refitBoundLaunch")
KERNEL_DEF(SB_MIDPHASE_PRIMITIVES, "sb_midphaseGeneratePairsLaunch")
KERNEL_DEF(SB_PRIMITIVES_CG, "sb_primitiveContactGenLaunch")
KERNEL_DEF(SB_SB_MIDPHASE, "sb_sbMidphaseGeneratePairsLaunch")
KERNEL_DEF(SB_PS_MIDPHASE, "sb_psMidphaseGeneratePairsLaunch")
KERNEL_DEF(SB_PS_CG, "sb_psContactGenLaunch")
KERNEL_DEF(SB_CLOTH_MIDPHASE, "sb_clothMidphaseGeneratePairsLaunch")
KERNEL_DEF(SB_CLOTH_CG, "sb_clothContactGenLaunch")
KERNEL_DEF(SB_CLOTH_VERT_MIDPHASE, "sb_clothVertMidphaseGeneratePairsLaunch")
KERNEL_DEF(SB_CLOTH_VERT_CG, "sb_clothVertContactGenLaunch")
KERNEL_DEF(SB_MESH_MIDPHASE, "sb_meshMidphaseGeneratePairsLaunch")
KERNEL_DEF(SB_SDF_MESH_MIDPHASE, "sb_sdfMeshMidphaseGeneratePairsLaunch")
KERNEL_DEF(SB_MESH_CG, "sb_meshContactGenLaunch")
KERNEL_DEF(SB_HF_MIDPHASE, "sb_heightfieldMidphaseGeneratePairsLaunch")
KERNEL_DEF(SB_HF_CG, "sb_heightfieldContactGenLaunch")
KERNEL_DEF(SB_SELFCOLLISION_MIDPHASE, "sb_selfCollisionMidphaseGeneratePairsLaunch")
KERNEL_DEF(SB_REORDER_PS_CONTACTS, "sb_reorderPSContactsLaunch")
KERNEL_DEF(SB_RIGID_ATTACHMENT_CONSTRAINT_PREP, "sb_rigidAttachmentPrepareLaunch")
KERNEL_DEF(SB_RS_CONTACTPREPARE, "sb_rigidContactPrepareLaunch")
KERNEL_DEF(SB_SS_CONTACTPREPARE, "sb_softbodyContactPrepareLaunch")
KERNEL_DEF(SB_SC_CONTACTPREPARE, "sb_clothContactPrepareLaunch")
KERNEL_DEF(SB_SP_CONTACTPREPARE, "sb_particleContactPrepareLaunch")
KERNEL_DEF(SB_SOLVE_RIGID_SOFT_ATTACHMENT, "sb_solveRigidSoftAttachmentLaunch")
KERNEL_DEF(SB_SOLVE_SOFTBODY_ATTACHMENT_DELTA, "sb_solveOutputSoftBodyAttachmentDeltaVLaunch")
KERNEL_DEF(SB_SOLVE_CLOTH_ATTACHMENT_DELTA, "sb_solveOutputClothAttachmentDeltaVLaunch")
KERNEL_DEF(SB_SOLVE_PARTICLE_ATTACHMENT_DELTA, "sb_solveOutputParticleAttachmentDeltaVLaunch")
KERNEL_DEF(SB_QUERY_RIGID_SOFT_REFERENCE_COUNT, "sb_queryRigidSoftContactReferenceCountLaunch")
KERNEL_DEF(SB_SOLVE_RIGID_SOFT_COLLISION, "sb_solveRigidSoftCollisionLaunch")
// X-macro table: each KERNEL_DEF(ENUM_ID, "cudaEntryPointName") entry pairs a
// kernel enum identifier with the exported name of its CUDA entry point.
// The including file presumably defines KERNEL_DEF to expand this list into an
// enum and/or a name-lookup table -- the string literals must exactly match the
// kernel function names in the compiled CUDA modules, so never edit them here.
KERNEL_DEF(SB_SOLVE_SOFT_SOFT_BOTH_DELTA, "sb_solveOutputSSDeltaVLaunch")
KERNEL_DEF(SB_SOLVE_SOFT_CLOTH_BOTH_DELTA, "sb_solveOutputSCDeltaVLaunch")
KERNEL_DEF(SB_SOLVE_PARTICLE_SOFT_DELTA, "sb_solveOutputSPDeltaVLaunch")
KERNEL_DEF(SB_SOLVE_PARTICLE_PARTICLE_DELTA, "sb_solveOutputParticleDeltaVLaunch")
KERNEL_DEF(SB_GM_CP_SOLVE_TETRA, "sb_gm_cp_solveTetrahedronsPartitionLaunch")
KERNEL_DEF(SB_GM_CP_SOLVE_TETRA_JACOBI_PARTITION, "sb_gm_cp_solveTetrahedronsJacobiPartitionLaunch")
KERNEL_DEF(SB_GM_UPDATETETMODELVERTS, "sb_gm_updateTetModelVertsLaunch")
KERNEL_DEF(SB_GM_ZERO_TETMULTIPLIERS, "sb_gm_zeroTetMultipliers")
KERNEL_DEF(SB_GM_CP_AVERAGEVERTS, "sb_gm_cp_averageVertsLaunch")
KERNEL_DEF(SB_UPDATETETRAROTATIONS, "sb_updateTetrahedraRotationsLaunch")
KERNEL_DEF(SB_GM_UPDATETETRAROTATIONS, "sb_gm_updateTetrahedraRotationsLaunch")
KERNEL_DEF(SB_GM_APPLY_EXTERNAL_DELTAS, "sb_gm_applyExternalDeltasLaunch")
KERNEL_DEF(SB_GM_APPLY_DEFORMATION_DELTAS, "sb_gm_applyDeformationDeltasLaunch")
KERNEL_DEF(SB_OTHER_CONTACT_REMAP_TO_SIM, "sb_other_contact_remap_to_simLaunch")
KERNEL_DEF(SB_FEM_CONTACT_REMAP_TO_SIM, "sb_fem_contact_remap_to_simLaunch")
KERNEL_DEF(SB_GM_STEPSOFTBODY, "sb_gm_stepSoftbodyLaunch")
KERNEL_DEF(SB_SOLVE_RIGID_SOFT_ATTACHMENT_TGS, "sb_solveRigidSoftAttachmentLaunchTGS")
KERNEL_DEF(SB_QUERY_RIGID_SOFT_REFERENCE_COUNT_TGS, "sb_queryRigidSoftContactReferenceCountLaunchTGS")
KERNEL_DEF(SB_SOLVE_RIGID_SOFT_COLLISION_TGS, "sb_solveRigidSoftCollisionLaunchTGS")
KERNEL_DEF(SB_SOLVE_SOFT_SOFT_BOTH_DELTA_TGS, "sb_solveOutputSSDeltaVLaunchTGS")
KERNEL_DEF(SB_CALC_STRESS, "sb_calculateStressLaunch")
KERNEL_DEF(SB_PLASTIC_DEFORM, "sb_plasticDeformLaunch")
KERNEL_DEF(SB_INIT_PLASTIC_DEFORM, "sb_initPlasticDeformLaunch")
KERNEL_DEF(SB_PLASTIC_DEFORM2, "sb_plasticDeformLaunch2")
KERNEL_DEF(SB_COPY_OR_APPLY_SOFTBODY_DATA_DEPRECATED, "sb_copyOrApplySoftBodyDataDEPRECATED")
KERNEL_DEF(SB_GM_FINALIZE_VELOCITIES, "sb_gm_finalizeVelocitiesLaunch")
KERNEL_DEF(SB_SLEEPING, "sb_sleeping")
//////////////////////////////////////////////////////////
// Direct API
//////////////////////////////////////////////////////////
KERNEL_DEF(COMPRESS_CONTACT_STAGE_1, "compressContactStage1")
KERNEL_DEF(COMPRESS_CONTACT_STAGE_2, "compressContactStage2")
KERNEL_DEF(COMPUTE_ARTI_DENSE_JACOBIANS, "computeArtiDenseJacobians")
KERNEL_DEF(COMPUTE_ARTI_MASS_MATRICES, "computeArtiMassMatrices")
KERNEL_DEF(COMPUTE_ARTI_GRAVITY_FORCES, "computeArtiGravityForces")
KERNEL_DEF(COMPUTE_ARTI_CENTRIFUGAL_FORCES, "computeArtiCentrifugalForces")
KERNEL_DEF(COMPUTE_ARTI_COM, "computeArtiCOM")
KERNEL_DEF(COMPUTE_ARTI_CENTROIDAL_MATRICES, "computeArtiCentroidalMomentumMatrices")
KERNEL_DEF(APPLY_PARTICLE_BUFFER_DATA_DEPRECATED, "applyParticleBufferDataDEPRECATED")
KERNEL_DEF(RIGID_DYNAMIC_GET_GLOBAL_POSE, "getRigidDynamicGlobalPose")
KERNEL_DEF(RIGID_DYNAMIC_GET_LINVEL, "getRigidDynamicLinearVelocity")
KERNEL_DEF(RIGID_DYNAMIC_GET_ANGVEL, "getRigidDynamicAngularVelocity")
KERNEL_DEF(RIGID_DYNAMIC_GET_LINACCEL, "getRigidDynamicLinearAcceleration")
KERNEL_DEF(RIGID_DYNAMIC_GET_ANGACCEL, "getRigidDynamicAngularAcceleration")
KERNEL_DEF(RIGID_DYNAMIC_SET_GLOBAL_POSE, "setRigidDynamicGlobalPose")
KERNEL_DEF(RIGID_DYNAMIC_SET_LINVEL, "setRigidDynamicLinearVelocity")
KERNEL_DEF(RIGID_DYNAMIC_SET_ANGVEL, "setRigidDynamicAngularVelocity")
KERNEL_DEF(RIGID_DYNAMIC_SET_FORCE, "setRigidDynamicForce")
KERNEL_DEF(RIGID_DYNAMIC_SET_TORQUE, "setRigidDynamicTorque")
KERNEL_DEF(ARTI_GET_DOF_STATES, "getArtiDofStates")
KERNEL_DEF(ARTI_GET_TRANSFORM_STATES, "getArtiTransformStates")
KERNEL_DEF(ARTI_GET_VELOCITY_STATES, "getArtiVelocityStates")
KERNEL_DEF(ARTI_GET_SPATIAL_FORCE_STATES, "getArtiSpatialForceStates")
KERNEL_DEF(ARTI_SET_DOF_STATES, "setArtiDofStates")
KERNEL_DEF(ARTI_SET_ROOT_GLOBAL_POSE_STATE, "setArtiRootGlobalPoseState")
KERNEL_DEF(ARTI_SET_ROOT_VELOCITY_STATE, "setArtiRootVelocityState")
KERNEL_DEF(ARTI_SET_LINK_FORCE_STATE, "setArtiLinkForceState")
KERNEL_DEF(ARTI_SET_LINK_TORQUE_STATE, "setArtiLinkTorqueState")
KERNEL_DEF(ARTI_SET_TENDON_STATE, "setArtiTendonState")
KERNEL_DEF(ARTI_SET_SPATIAL_TENDON_ATTACHMENT_STATE, "setArtiSpatialTendonAttachmentState")
KERNEL_DEF(ARTI_SET_FIXED_TENDON_JOINT_STATE, "setArtiFixedTendonJointState")
KERNEL_DEF(ARTI_GET_TENDON_STATE, "getArtiTendonState")
KERNEL_DEF(ARTI_GET_SPATIAL_TENDON_ATTACHMENT_STATE, "getArtiSpatialTendonAttachmentState")
KERNEL_DEF(ARTI_GET_FIXED_TENDON_JOINT_STATE, "getArtiFixedTendonJointState")
KERNEL_DEF(D6_JOINT_GET_FORCE, "getD6JointForces")
KERNEL_DEF(D6_JOINT_GET_TORQUE, "getD6JointTorques")
//////////////////////////////////////////////////////////
// Aggregate kernels
/////////////////////////////////////////////////////////
KERNEL_DEF(UPDATE_DIRTY_AGGREGATE, "updateDirtyAggregate")
KERNEL_DEF(UPDATE_AGGREGATE_BOUND, "updateAggregateBounds")
KERNEL_DEF(MARK_AGGREGATE_BOUND_BITMAP, "markAggregateBoundsUpdatedBitmap")
KERNEL_DEF(AGG_SORT_UPDATE_PROJECTIONS, "sortAndUpdateAggregateProjections")
KERNEL_DEF(AGG_SELF_COLLISION, "doSelfCollision")
KERNEL_DEF(AGG_ADD_AGGPAIRS_STAGE_1, "addAggPairsStage1")
KERNEL_DEF(AGG_ADD_AGGPAIRS_STAGE_2, "addAggPairsStage2")
KERNEL_DEF(AGG_PAIR_COLLISION, "doAggPairCollisions")
KERNEL_DEF(AGG_REMOVE_AGGPAIRS_STAGE_1, "removeAggPairsStage1")
KERNEL_DEF(AGG_REMOVE_AGGPAIRS_STAGE_2, "removeAggPairsStage2")
KERNEL_DEF(AGG_REMOVE_AGGPAIRS_STAGE_3, "removeAggPairsStage3")
KERNEL_DEF(AGG_COPY_REPORTS, "aggCopyReports")
KERNEL_DEF(CLEAR_DIRTY_AGGS, "clearDirtyAggregates")
KERNEL_DEF(COPY_USER_DATA, "copyUserData")
KERNEL_DEF(AGG_MARK_ADDED_DELETED_AGGREGATED_BOUNDS, "markAddedAndDeletedAggregatedBounds")
//////////////////////////////////////////////////////////
//FEM-cloth kernel names
//////////////////////////////////////////////////////////
KERNEL_DEF(CLOTH_SIM_PREINTEGRATION, "cloth_preIntegrateLaunch")
KERNEL_DEF(CLOTH_REFIT_BOUND, "cloth_refitBoundLaunch")
KERNEL_DEF(CLOTH_MIDPHASE_PRIMITIVES, "cloth_midphaseGeneratePairsLaunch")
KERNEL_DEF(CLOTH_SPHERE_CG, "cloth_SphereContactGenLaunch")
KERNEL_DEF(CLOTH_BOX_TRIANGLE_CG, "cloth_boxTriangleContactGenLaunch")
KERNEL_DEF(CLOTH_CONVEX_CG, "cloth_convexContactGenLaunch")
KERNEL_DEF(CLOTH_VERT_SPHERE_CG, "cloth_SphereVertexContactGenLaunch")
KERNEL_DEF(CLOTH_PLANE_VERTEX_CG, "cloth_planeVertContactGenLaunch")
KERNEL_DEF(CLOTH_MIDPHASE_VERTEX_PRIMS, "cloth_midphaseGenerateVertexPairsLaunch")
KERNEL_DEF(CLOTH_BOX_VERTEX_COLLISION, "cloth_boxVertexContactGenLaunch")
KERNEL_DEF(CLOTH_CONVEX_VERTEX_COLLISION, "cloth_convexVertexContactGenLaunch")
KERNEL_DEF(CLOTH_SELFCOLLISION_MIDPHASE_VT, "cloth_selfCollisionMidphaseVTLaunch")
KERNEL_DEF(CLOTH_SELFCOLLISION_MIDPHASE_EE, "cloth_selfCollisionMidphaseEELaunch")
KERNEL_DEF(CLOTH_DIFFERENTCLOTHCOLLISION_MIDPHASE_VT, "cloth_differentClothCollisionVTLaunch")
KERNEL_DEF(CLOTH_DIFFERENTCLOTHCOLLISION_MIDPHASE_EE, "cloth_differentClothCollisionEELaunch")
KERNEL_DEF(CLOTH_CLOTH_MIDPHASE, "cloth_clothMidphaseGeneratePairsLaunch")
KERNEL_DEF(CLOTH_PS_MIDPHASE, "cloth_psMidphaseGeneratePairsLaunch")
KERNEL_DEF(CLOTH_PS_CG, "cloth_psContactGenLaunch")
KERNEL_DEF(CLOTH_MESH_MIDPHASE, "cloth_meshMidphaseGeneratePairsLaunch")
KERNEL_DEF(CLOTH_MESH_CG, "cloth_meshContactGenLaunch")
KERNEL_DEF(CLOTH_SDF_MESH_CG, "cloth_sdfMeshContactGenLaunch")
KERNEL_DEF(CLOTH_MIDPHASE_VERTEX_MESH, "cloth_midphaseVertexMeshLaunch")
KERNEL_DEF(CLOTH_MESH_VERTEX_CG, "cloth_meshVertexContactGenLaunch")
KERNEL_DEF(CLOTH_HF_MIDPHASE, "cloth_heightfieldMidphaseGeneratePairsLaunch")
KERNEL_DEF(CLOTH_HF_CG, "cloth_heightfieldContactGenLaunch")
KERNEL_DEF(CLOTH_MIDPHASE_VERTEX_HF, "cloth_midphaseVertexHeightfieldLaunch")
KERNEL_DEF(CLOTH_HF_VERTEX_CG, "cloth_heightfieldVertexContactGenLaunch")
KERNEL_DEF(CLOTH_RIGID_ATTACHMENT_CONSTRAINT_PREP, "cloth_rigidAttachmentPrepareLaunch")
KERNEL_DEF(CLOTH_RIGID_CONTACTPREPARE, "cloth_rigidContactPrepareLaunch")
KERNEL_DEF(CLOTH_CLOTH_CONTACTPREPARE, "cloth_clothContactPrepareLaunch")
KERNEL_DEF(CLOTH_PARTICLE_CONTACTPREPARE, "cloth_particleContactPrepareLaunch")
KERNEL_DEF(CLOTH_SIM_NONSHARED_TRIANGLE_ENERGY_SOLVE, "cloth_solveNonSharedTriangleEnergyLaunch")
KERNEL_DEF(CLOTH_SIM_NONSHARED_TRIANGLE_ENERGY_SOLVE_CLUSTER, "cloth_solveNonSharedTriangleEnergyClusterLaunch")
KERNEL_DEF(CLOTH_SIM_TRIANGLEPAIR_ENERGY_SOLVE, "cloth_solveTrianglePairEnergyLaunch")
KERNEL_DEF(CLOTH_SIM_TRIANGLEPAIR_ENERGY_SOLVE_CLUSTER, "cloth_solveTrianglePairEnergyClusterLaunch")
KERNEL_DEF(CLOTH_SIM_TRIANGLEPAIR_AVERAGE_VERTS, "cloth_averageTrianglePairVertsLaunch")
KERNEL_DEF(CLOTH_SIM_STEP, "cloth_stepLaunch")
KERNEL_DEF(CLOTH_QUERY_RIGID_CLOTH_REFERENCE_COUNT, "cloth_queryRigidClothContactReferenceCountLaunch")
KERNEL_DEF(CLOTH_SOLVE_RIGID_CLOTH_COLLISION, "cloth_solveRigidClothCollisionLaunch")
KERNEL_DEF(CLOTH_SOLVE_CLOTH_VT_COLLISION, "cloth_solveClothClothDeltaVTLaunch")
KERNEL_DEF(CLOTH_SOLVE_CLOTH_EE_COLLISION, "cloth_solveClothClothDeltaEELaunch")
KERNEL_DEF(CLOTH_QUERY_CLOTH_CONTACT_VT_COUNT, "cloth_queryClothClothContactVTCountLaunch")
KERNEL_DEF(CLOTH_QUERY_CLOTH_CONTACT_EE_COUNT, "cloth_queryClothClothContactEECountLaunch")
KERNEL_DEF(CLOTH_APPLY_EXTERNAL_DELTAS, "cloth_applyExternalDeltasLaunch")
KERNEL_DEF(CLOTH_UPDATE_CLOTH_CONTACT_VALIDITY, "cloth_updateClothContactValidityLaunch")
KERNEL_DEF(CLOTH_PARTICLE_CLOTH_DELTA, "cloth_solveCPOutputClothDeltaVLaunch")
KERNEL_DEF(CLOTH_PARTICLE_PARTICLE_DELTA, "cloth_solveCPOutputParticleDeltaVLaunch")
KERNEL_DEF(CLOTH_SOLVE_RIGID_CLOTH_ATTACHMENT, "cloth_solveRigidClothAttachmentLaunch")
KERNEL_DEF(CLOTH_SOLVE_ATTACHMENT_CLOTH_CLOTH_DELTA, "cloth_solveOutputAttachmentClothClothDeltaVLaunch")
KERNEL_DEF(CLOTH_FINALIZE_VELOCITIES, "cloth_finalizeVelocitiesLaunch")
KERNEL_DEF(CLOTH_SOLVE_RIGID_CLOTH_COLLISION_TGS, "cloth_solveRigidClothCollisionTGSLaunch")
KERNEL_DEF(CLOTH_QUERY_RIGID_CLOTH_REFERENCE_COUNT_TGS, "cloth_queryRigidClothContactReferenceCountTGSLaunch")
KERNEL_DEF(CLOTH_SOLVE_RIGID_CLOTH_ATTACHMENT_TGS, "cloth_solveRigidClothAttachmentTGSLaunch")
KERNEL_DEF(CLOTH_SLEEPING, "cloth_sleeping")
KERNEL_DEF(CLOTH_IN_PLANE_DAMPING, "cloth_accumulateInPlaneDampingDeltaVelocity")
KERNEL_DEF(CLOTH_BENDING_DAMPING, "cloth_accumulateBendingDampingDeltaVelocity")
KERNEL_DEF(CLOTH_APPLY_ACCUMULATED_DAMPING, "cloth_applyAccumulatedDeltaVelocity")
//////////////////////////////////////////////////////////////
//FEM common kernel
////////////////////////////////////////////////////////////
KERNEL_DEF(FEM_REORDER_RS_CONTACTS, "fem_reorderRigidContactsLaunch")
KERNEL_DEF(CLAMP_MAX_VALUE, "clampMaxValue")
KERNEL_DEF(CLAMP_MAX_VALUES, "clampMaxValues")
//////////////////////////////////////////////////////////////
// Gpu Extension Kernels
//////////////////////////////////////////////////////////////
KERNEL_DEF(scanPerBlockKernel, "scanPerBlockKernel")
KERNEL_DEF(scanPerBlockKernel4x4, "scanPerBlockKernel4x4")
KERNEL_DEF(addBlockSumsKernel, "addBlockSumsKernel")
KERNEL_DEF(addBlockSumsKernel4x4, "addBlockSumsKernel4x4")
KERNEL_DEF(radixFourBitCountPerBlockKernel, "radixFourBitCountPerBlockKernel")
KERNEL_DEF(radixFourBitReorderKernel, "radixFourBitReorderKernel")
KERNEL_DEF(reorderKernel, "reorderKernel")
KERNEL_DEF(smoothPositionsLaunch, "smoothPositionsLaunch")
KERNEL_DEF(calculateAnisotropyLaunch, "calculateAnisotropyLaunch")
KERNEL_DEF(anisotropyKernel, "anisotropyKernel")
KERNEL_DEF(smoothPositionsKernel, "smoothPositionsKernel")
KERNEL_DEF(iso_ComputeParticleDensityUsingSDF, "iso_ComputeParticleDensityUsingSDF")
KERNEL_DEF(iso_ComputeParticleDensityUsingSDFSparse, "iso_ComputeParticleDensityUsingSDFSparse")
KERNEL_DEF(iso_ComputeParticleDensity, "iso_ComputeParticleDensity")
KERNEL_DEF(iso_ComputeParticleDensitySparse, "iso_ComputeParticleDensitySparse")
KERNEL_DEF(iso_CountCellVerts, "iso_CountCellVerts")
KERNEL_DEF(iso_CountCellVertsSparse, "iso_CountCellVertsSparse")
KERNEL_DEF(iso_CountCellVertsDC, "iso_CountCellVertsDC")
KERNEL_DEF(iso_CountCellVertsDCSparse, "iso_CountCellVertsDCSparse")
KERNEL_DEF(iso_CreateVerts, "iso_CreateVerts")
KERNEL_DEF(iso_CreateVertsSparse, "iso_CreateVertsSparse")
KERNEL_DEF(iso_CreateVertsDC, "iso_CreateVertsDC")
KERNEL_DEF(iso_CreateVertsDCSparse, "iso_CreateVertsDCSparse")
KERNEL_DEF(iso_CountTriIds, "iso_CountTriIds")
KERNEL_DEF(iso_CountTriIdsSparse, "iso_CountTriIdsSparse")
KERNEL_DEF(iso_CountTriIdsDC, "iso_CountTriIdsDC")
KERNEL_DEF(iso_CountTriIdsDCSparse, "iso_CountTriIdsDCSparse")
KERNEL_DEF(iso_CreateTriIds, "iso_CreateTriIds")
KERNEL_DEF(iso_CreateTriIdsSparse, "iso_CreateTriIdsSparse")
KERNEL_DEF(iso_CreateTriIdsDC, "iso_CreateTriIdsDC")
KERNEL_DEF(iso_CreateTriIdsDCSparse, "iso_CreateTriIdsDCSparse")
KERNEL_DEF(iso_SmoothVerts, "iso_SmoothVerts")
KERNEL_DEF(iso_AverageVerts, "iso_AverageVerts")
KERNEL_DEF(iso_SmoothNormals, "iso_SmoothNormals")
KERNEL_DEF(iso_SmoothNormalsSparse, "iso_SmoothNormalsSparse")
KERNEL_DEF(iso_SmoothNormalsNormalize, "iso_SmoothNormalsNormalize")
KERNEL_DEF(iso_SmoothNormalsNormalizeSparse, "iso_SmoothNormalsNormalizeSparse")
KERNEL_DEF(iso_ComputeNormals, "iso_ComputeNormals")
KERNEL_DEF(iso_ComputeNormalsSparse, "iso_ComputeNormalsSparse")
KERNEL_DEF(iso_NormalizeNormals, "iso_NormalizeNormals")
KERNEL_DEF(iso_NormalizeNormalsSparse, "iso_NormalizeNormalsSparse")
KERNEL_DEF(iso_GridFilterGauss, "iso_GridFilterGauss")
KERNEL_DEF(iso_GridFilterGaussSparse, "iso_GridFilterGaussSparse")
KERNEL_DEF(iso_GridFilterDilateErode, "iso_GridFilterDilateErode")
KERNEL_DEF(iso_GridFilterDilateErodeSparse, "iso_GridFilterDilateErodeSparse")
KERNEL_DEF(sg_SparseGridCalcSubgridHashes, "sg_SparseGridCalcSubgridHashes")
KERNEL_DEF(sg_SparseGridMarkRequiredNeighbors, "sg_SparseGridMarkRequiredNeighbors")
KERNEL_DEF(sg_SparseGridSortedArrayToDelta, "sg_SparseGridSortedArrayToDelta")
KERNEL_DEF(sg_SparseGridGetUniqueValues, "sg_SparseGridGetUniqueValues")
KERNEL_DEF(sg_SparseGridClearDensity, "sg_SparseGridClearDensity")
KERNEL_DEF(sg_SparseGridBuildSubgridNeighbors, "sg_SparseGridBuildSubgridNeighbors")
KERNEL_DEF(sg_MarkSubgridEndIndices, "sg_MarkSubgridEndIndices")
KERNEL_DEF(sg_ReuseSubgrids, "sg_ReuseSubgrids")
KERNEL_DEF(sg_AddReleasedSubgridsToUnusedStack, "sg_AddReleasedSubgridsToUnusedStack")
KERNEL_DEF(sg_AllocateNewSubgrids, "sg_AllocateNewSubgrids")
KERNEL_DEF(util_InterleaveBuffers, "interleaveBuffers")
KERNEL_DEF(util_InterpolateSkinnedClothVertices, "interpolateSkinnedClothVertices")
KERNEL_DEF(util_InterpolateSkinnedSoftBodyVertices, "interpolateSkinnedSoftBodyVertices")
KERNEL_DEF(util_ComputeNormals, "normalVectorsAreaWeighted")
KERNEL_DEF(util_NormalizeNormals, "normalizeNormals")
KERNEL_DEF(util_ZeroNormals, "zeroNormals")
//BVH construction kernels
KERNEL_DEF(bvh_ComputeTriangleBounds, "bvhComputeTriangleBounds")
KERNEL_DEF(bvh_ComputeTotalBounds, "bvhComputeTotalBounds")
KERNEL_DEF(bvh_ComputeTotalInvEdges, "bvhComputeTotalInvEdges")
KERNEL_DEF(bvh_CalculateMortonCodes, "bvhCalculateMortonCodes")
KERNEL_DEF(bvh_CalculateKeyDeltas, "bvhCalculateKeyDeltas")
KERNEL_DEF(bvh_CalculateKeyDeltasSquaredDistance, "bvhCalculateKeyDeltasSquaredDistance")
KERNEL_DEF(bvh_BuildLeaves, "bvhBuildLeaves")
KERNEL_DEF(bvh_BuildHierarchy, "bvhBuildHierarchy")
KERNEL_DEF(bvh_BuildHierarchyAndWindingClusters, "bvhBuildHierarchyAndWindingClusters")
//SDF Construction kernels
KERNEL_DEF(sdf_CalculateDenseGridBlocks, "sdfCalculateDenseGridBlocks")
KERNEL_DEF(sdf_CalculateDenseGridHybrid, "sdfCalculateDenseGridHybrid")
KERNEL_DEF(sdf_PopulateBackgroundSDF, "sdfPopulateBackgroundSDF")
KERNEL_DEF(sdf_MarkRequiredSdfSubgrids, "sdfMarkRequiredSdfSubgrids")
KERNEL_DEF(sdf_PopulateSdfSubgrids, "sdfPopulateSdfSubgrids")
KERNEL_DEF(sdf_CountHoles, "sdfCountHoles")
KERNEL_DEF(sdf_FindHoles, "sdfFindHoles")
KERNEL_DEF(sdf_ApplyHoleCorrections, "sdfApplyHoleCorrections")
KERNEL_DEF(sdf_CalculateDenseGridPointCloud, "sdfCalculateDenseGridPointCloud")
#endif

View File

@@ -0,0 +1,51 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_KERNEL_WRANGLER_H
#define PXG_KERNEL_WRANGLER_H
#include "foundation/PxPreprocessor.h"
#include "PxsKernelWrangler.h"
#include "foundation/PxArray.h"
namespace physx
{
class PxCudaContextManager;
class KernelWrangler;
class PxErrorCallback;
// GPU-side kernel wrangler manager, specializing the generic
// PxsKernelWranglerManager for CUDA.
// NOTE(review): ctor/dtor are defined elsewhere; presumably the ctor resolves
// the kernel entry points listed in the KERNEL_DEF table against the given
// CUDA context -- confirm in the implementation file.
class PxgCudaKernelWranglerManager : public PxsKernelWranglerManager
{
public:
	// cudaContextManager: CUDA context the kernels belong to.
	// errorCallback: sink for reporting kernel load/lookup failures.
	PxgCudaKernelWranglerManager(PxCudaContextManager& cudaContextManager, PxErrorCallback& errorCallback);
	~PxgCudaKernelWranglerManager();
};
}
#endif

View File

@@ -0,0 +1,70 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
#ifndef PXG_MEM_COPY_DISPATCHER
#define PXG_MEM_COPY_DISPATCHER
#include "foundation/PxUserAllocated.h"
#include "foundation/PxPinnedArray.h"
#include "PxgCudaBuffer.h"
namespace physx
{
class PxCudaContext;
class KernelWrangler;
// One queued memory-copy command: copy 'size' bytes from 'src' to 'dst'.
// Plain aggregate -- member order matters; it is mirrored into device memory
// by PxgMemCopyDispatcher, so do not reorder or pad.
struct PxgPtrPair
{
	void* src;   // source address
	void* dst;   // destination address
	size_t size; // number of bytes to copy
};
class PxgMemCopyDispatcher : public PxUserAllocated
{
PxPinnedArray<PxgPtrPair> mPinnedCopyBuffer;
PxgCudaBuffer mDeviceCopyCommands;
size_t mMaxSize;
public:
PxgMemCopyDispatcher(PxgHeapMemoryAllocatorManager* gpuHeapAllocator, PxVirtualAllocatorCallback* hostAllocator) :
mPinnedCopyBuffer(PxVirtualAllocator(hostAllocator)), mDeviceCopyCommands(gpuHeapAllocator, PxsHeapStats::eOTHER),
mMaxSize(0)
{
}
void addCommand(PxgPtrPair& command)
{
mPinnedCopyBuffer.pushBack(command);
mMaxSize = PxMax(mMaxSize, command.size);
}
void flushCommands(CUstream stream, PxCudaContext* cudaContext, KernelWrangler* kernelWrangler);
};
}
#endif

View File

@@ -0,0 +1,54 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_MEMORY_ALLOCATOR_H
#define PXG_MEMORY_ALLOCATOR_H
#include "PxsMemoryManager.h"
#include "foundation/PxArray.h"
namespace physx
{
class PxCudaContextManager;
class PxCudaContext;
// PT: this is for GPU, see createDefaultMemoryManager for CPU
PxsMemoryManager* createPxgMemoryManager(PxCudaContextManager* cudaContextManager);
// Common base for CUDA-backed allocator callbacks: caches the context manager
// and its raw CUDA context for use by derived allocators.
// NOTE(review): mCudaContext is presumably fetched from contextManager in the
// ctor (defined elsewhere) -- confirm in the implementation file.
class PxgCudaAllocatorCallbackBase : public PxVirtualAllocatorCallback, public PxUserAllocated
{
public:
	PxgCudaAllocatorCallbackBase(PxCudaContextManager* contextManager);
	virtual ~PxgCudaAllocatorCallbackBase() {}
	PxCudaContextManager* mContextManager; // owning CUDA context manager (non-owning pointer)
	PxCudaContext* mCudaContext;           // cached raw context
};
}
#endif

View File

@@ -0,0 +1,57 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_RADIXSORT_DESC_H
#define PXG_RADIXSORT_DESC_H
#include "foundation/PxSimpleTypes.h"
#define NUM_RADIX_SORT_DESC 2
namespace physx
{
// Descriptor consumed by the GPU radix-sort kernels: one pass reads 'count'
// keys/ranks from the input arrays and writes the reordered result to the
// output arrays (input/output are swapped between passes by the caller).
struct PxgRadixSortDesc
{
	PxU32* inputKeys;        // keys to sort in this pass
	PxU32* inputRanks;       // rank (original index) accompanying each key
	PxU32* outputKeys;       // reordered keys written here
	PxU32* outputRanks;      // ranks reordered to follow their keys
	PxU32* radixBlockCounts; // stores each radix digit's total count across the different blocks
	PxU32 count;
};
// Variant used when the key count lives in device memory: 'numKeys' points at
// the count, presumably overriding the inherited 'count' field -- confirm
// against the radix-sort kernels that consume this descriptor.
struct PxgRadixSortBlockDesc : public PxgRadixSortDesc
{
public:
	PxU32* numKeys; // device pointer to the number of keys to sort
};
}
#endif

View File

@@ -0,0 +1,53 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef PXG_RADIX_SORT_KERNEL_INDICES_H
#define PXG_RADIX_SORT_KERNEL_INDICES_H
namespace physx
{
// Thread-block dimension used when launching the radix-sort kernels.
struct PxgRadixSortKernelBlockDim
{
	enum
	{
		RADIX_SORT = 1024, // threads per block
	};
};
// Grid dimension used when launching the radix-sort kernels.
struct PxgRadixSortKernelGridDim
{
	enum
	{
		RADIX_SORT = 32, // blocks per launch
	};
};
}
#endif

View File

@@ -0,0 +1,796 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
/*
This file implements common mathematical operations on vector types
(float3, float4 etc.) since these are not provided as standard by CUDA.
The syntax is modelled on the Cg standard library.
*/
#ifndef CUTIL_MATH_H
#define CUTIL_MATH_H
#include "foundation/PxPreprocessor.h"
#if PX_LINUX && PX_CLANG
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdocumentation"
#endif
#include <vector_types.h>
#include "vector_functions.h" // why does this pull in the runtime api?
#if PX_LINUX && PX_CLANG
#pragma clang diagnostic pop
#endif
////////////////////////////////////////////////////////////////////////////////
typedef unsigned int uint;
typedef unsigned short ushort;
typedef unsigned char uchar;
// float functions
////////////////////////////////////////////////////////////////////////////////
#if !PX_CUDA_COMPILER
#ifndef PX_MIN_MAX
#define PX_MIN_MAX
// Host-compiler fallbacks: when this header is not compiled by nvcc, supply
// the min/max overloads that the CUDA toolchain would otherwise provide.
// The PX_MIN_MAX guard keeps multiple includers from redefining them.
#if PX_VC
// float overloads only needed on MSVC -- other host compilers get them from
// their math headers (TODO confirm).
PX_FORCE_INLINE __device__ __host__ float fmaxf(float a, float b) { return a > b ? a : b; }
PX_FORCE_INLINE __device__ __host__ float fminf(float a, float b) { return a < b ? a : b; }
#endif
PX_FORCE_INLINE __device__ __host__ int max(int a, int b) { return a > b ? a : b; }
PX_FORCE_INLINE __device__ __host__ unsigned int max(unsigned int a, unsigned int b) { return a > b ? a : b; }
PX_FORCE_INLINE __device__ __host__ int min(int a, int b) { return a < b ? a : b; }
PX_FORCE_INLINE __device__ __host__ unsigned int min(unsigned int a, unsigned int b) { return a < b ? a : b; }
#endif
#endif
// lerp
// Linear interpolation: returns a at t == 0 and b at t == 1.
inline __device__ __host__ float lerp(float a, float b, float t)
{
	const float delta = b - a;
	return a + t * delta;
}
// Bilinear interpolation over the unit square: fXY is the value at corner
// (x, y); tx/ty are the interpolation weights along x and y.
template <typename T>
__device__ inline T bilerp(const T f00, const T f10, const T f01, const T f11, const float tx, const float ty)
{
	// Interpolate along x on both rows, then blend the rows along y.
	const T row0 = lerp(f00, f10, tx);
	const T row1 = lerp(f01, f11, tx);
	return lerp(row0, row1, ty);
}
// Trilinear interpolation over the unit cube: fXYZ is the value at corner
// (x, y, z); tx/ty/tz are the interpolation weights along each axis.
template <typename T>
__device__ inline T trilerp(const T f000, const T f100, const T f010, const T f110, const T f001, const T f101, const T f011, const T f111,
	const float tx, const float ty, const float tz)
{
	// Bilinearly interpolate each z-slice, then blend the slices along z.
	const T slice0 = bilerp(f000, f100, f010, f110, tx, ty);
	const T slice1 = bilerp(f001, f101, f011, f111, tx, ty);
	return lerp(slice0, slice1, tz);
}
// clamp
// Clamp f into [a, b] (assumes a <= b; with NaN bounds behavior follows fminf/fmaxf).
inline __device__ __host__ float clamp(float f, float a, float b)
{
	const float upperBounded = fminf(f, b);
	return fmaxf(a, upperBounded);
}
// int2 functions
////////////////////////////////////////////////////////////////////////////////
// Componentwise arithmetic operators for CUDA's int2. All take operands by
// value (int2 is a trivially copyable 8-byte struct).
// addition
inline __host__ __device__ int2 operator+(int2 a, int2 b)
{
	return make_int2(a.x + b.x, a.y + b.y);
}
inline __host__ __device__ void operator+=(int2 &a, int2 b)
{
	a.x += b.x; a.y += b.y;
}
// subtract
inline __host__ __device__ int2 operator-(int2 a, int2 b)
{
	return make_int2(a.x - b.x, a.y - b.y);
}
inline __host__ __device__ void operator-=(int2 &a, int2 b)
{
	a.x -= b.x; a.y -= b.y;
}
// multiply (componentwise, and scalar on either side)
inline __host__ __device__ int2 operator*(int2 a, int2 b)
{
	return make_int2(a.x * b.x, a.y * b.y);
}
inline __host__ __device__ int2 operator*(int2 a, int s)
{
	return make_int2(a.x * s, a.y * s);
}
inline __host__ __device__ int2 operator*(int s, int2 a)
{
	return make_int2(a.x * s, a.y * s);
}
inline __host__ __device__ void operator*=(int2 &a, int s)
{
	a.x *= s; a.y *= s;
}
// float2 functions
////////////////////////////////////////////////////////////////////////////////
// additional constructors
// Splat: both components set to s.
inline __host__ __device__ float2 make_float2(float s)
{
	return make_float2(s, s);
}
// Componentwise int -> float conversion.
inline __host__ __device__ float2 make_float2(int2 a)
{
	return make_float2(float(a.x), float(a.y));
}
// addition
inline __host__ __device__ float2 operator+(float2 a, float2 b)
{
	return make_float2(a.x + b.x, a.y + b.y);
}
inline __host__ __device__ void operator+=(float2 &a, float2 b)
{
	a.x += b.x; a.y += b.y;
}
// subtract
inline __host__ __device__ float2 operator-(float2 a, float2 b)
{
	return make_float2(a.x - b.x, a.y - b.y);
}
inline __host__ __device__ void operator-=(float2 &a, float2 b)
{
	a.x -= b.x; a.y -= b.y;
}
// multiply (componentwise, and scalar on either side)
inline __host__ __device__ float2 operator*(float2 a, float2 b)
{
	return make_float2(a.x * b.x, a.y * b.y);
}
inline __host__ __device__ float2 operator*(float2 a, float s)
{
	return make_float2(a.x * s, a.y * s);
}
inline __host__ __device__ float2 operator*(float s, float2 a)
{
	return make_float2(a.x * s, a.y * s);
}
inline __host__ __device__ void operator*=(float2 &a, float s)
{
	a.x *= s; a.y *= s;
}
// divide
inline __host__ __device__ float2 operator/(float2 a, float2 b)
{
	return make_float2(a.x / b.x, a.y / b.y);
}
// vector / scalar: multiply by reciprocal (one division, two multiplies; may
// differ from a.x/s in the last ulp).
inline __host__ __device__ float2 operator/(float2 a, float s)
{
	float inv = 1.0f / s;
	return a * inv;
}
// scalar / vector, componentwise: (s / a.x, s / a.y).
// BUGFIX: this overload previously computed a * (1.0f / s), i.e. a/s -- the
// vector/scalar result -- instead of s/a. This is a known defect inherited
// from the old cutil_math.h; CUDA's helper_math.h ships the corrected form
// used here. NOTE(review): audit callers in case any relied on the old
// (wrong) behavior.
inline __host__ __device__ float2 operator/(float s, float2 a)
{
	return make_float2(s / a.x, s / a.y);
}
// in-place vector / scalar (via reciprocal, see operator/ above)
inline __host__ __device__ void operator/=(float2 &a, float s)
{
	float inv = 1.0f / s;
	a *= inv;
}
// lerp
// Componentwise linear interpolation: a at t == 0, b at t == 1.
inline __device__ __host__ float2 lerp(float2 a, float2 b, float t)
{
	return a + t*(b-a);
}
// clamp
// Componentwise clamp against scalar bounds...
inline __device__ __host__ float2 clamp(float2 v, float a, float b)
{
	return make_float2(clamp(v.x, a, b), clamp(v.y, a, b));
}
// ...and against per-component bounds.
inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b)
{
	return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
}
// dot product
inline __host__ __device__ float dot(float2 a, float2 b)
{
	return a.x * b.x + a.y * b.y;
}
// length (Euclidean norm)
inline __host__ __device__ float length(float2 v)
{
	return sqrtf(dot(v, v));
}
// normalize
// NOTE(review): divides by the length with no zero check -- a zero vector
// yields inf/NaN components; callers must guarantee non-zero input.
inline __host__ __device__ float2 normalize(float2 v)
{
	float invLen = 1.0f / sqrtf(dot(v, v));
	return v * invLen;
}
// floor (componentwise)
inline __host__ __device__ float2 floor(const float2 v)
{
	return make_float2(floorf(v.x), floorf(v.y));
}
//validate
// Despite the name, this is an exact componentwise equality test (no epsilon).
inline __host__ __device__ bool validate(const float2 v0, const float2 v1)
{
	return (v0.x == v1.x && v0.y == v1.y);
}
// float3 functions
////////////////////////////////////////////////////////////////////////////////
// additional constructors
// float3: additional constructors, min/max, and component-wise arithmetic operators.
inline __host__ __device__ float3 make_float3(float s)
{
    return make_float3(s, s, s);
}
inline __host__ __device__ float3 make_float3(float2 v)
{
    return make_float3(v.x, v.y, 0.0f);
}
inline __host__ __device__ float3 make_float3(float2 v, float z)
{
    return make_float3(v.x, v.y, z);
}
inline __host__ __device__ float3 make_float3(float4 v)
{
    return make_float3(v.x, v.y, v.z); // the w component is dropped
}
inline __host__ __device__ float3 make_float3(int3 v)
{
    return make_float3(float(v.x), float(v.y), float(v.z));
}
// component-wise min / max
static __inline__ __host__ __device__ float3 fminf(float3 lhs, float3 rhs)
{
    return make_float3(fminf(lhs.x, rhs.x), fminf(lhs.y, rhs.y), fminf(lhs.z, rhs.z));
}
static __inline__ __host__ __device__ float3 fmaxf(float3 lhs, float3 rhs)
{
    return make_float3(fmaxf(lhs.x, rhs.x), fmaxf(lhs.y, rhs.y), fmaxf(lhs.z, rhs.z));
}
// addition
inline __host__ __device__ float3 operator+(float3 lhs, float3 rhs)
{
    return make_float3(lhs.x + rhs.x, lhs.y + rhs.y, lhs.z + rhs.z);
}
inline __host__ __device__ void operator+=(float3 &lhs, float3 rhs)
{
    lhs = lhs + rhs;
}
// subtraction
inline __host__ __device__ float3 operator-(float3 lhs, float3 rhs)
{
    return make_float3(lhs.x - rhs.x, lhs.y - rhs.y, lhs.z - rhs.z);
}
inline __host__ __device__ void operator-=(float3 &lhs, float3 rhs)
{
    lhs = lhs - rhs;
}
// multiplication: component-wise and by scalar
inline __host__ __device__ float3 operator*(float3 lhs, float3 rhs)
{
    return make_float3(lhs.x * rhs.x, lhs.y * rhs.y, lhs.z * rhs.z);
}
inline __host__ __device__ float3 operator*(float3 v, float scalar)
{
    return make_float3(v.x * scalar, v.y * scalar, v.z * scalar);
}
inline __host__ __device__ float3 operator*(float scalar, float3 v)
{
    return v * scalar;
}
inline __host__ __device__ void operator*=(float3 &v, float scalar)
{
    v.x *= scalar;
    v.y *= scalar;
    v.z *= scalar;
}
// division: component-wise; the scalar form multiplies by the reciprocal
inline __host__ __device__ float3 operator/(float3 lhs, float3 rhs)
{
    return make_float3(lhs.x / rhs.x, lhs.y / rhs.y, lhs.z / rhs.z);
}
inline __host__ __device__ float3 operator/(float3 v, float scalar)
{
    const float recip = 1.0f / scalar;
    return v * recip;
}
// Scalar divided by vector: component-wise s / a.
// BUG FIX: the previous implementation returned a * (1/s), i.e. a / s — it
// duplicated operator/(float3, float) instead of dividing the scalar by each
// component (matches CUDA's helper_math.h definition of this overload).
inline __host__ __device__ float3 operator/(float s, float3 a)
{
    return make_float3(s / a.x, s / a.y, s / a.z);
}
inline __host__ __device__ void operator/=(float3 &v, float scalar)
{
    const float recip = 1.0f / scalar;
    v *= recip;
}
// linear interpolation: yields a at t == 0 and b at t == 1
inline __device__ __host__ float3 lerp(float3 a, float3 b, float t)
{
    return a + t*(b-a);
}
// component-wise clamp against scalar and per-component bounds
inline __device__ __host__ float3 clamp(float3 v, float lo, float hi)
{
    return make_float3(clamp(v.x, lo, hi), clamp(v.y, lo, hi), clamp(v.z, lo, hi));
}
inline __device__ __host__ float3 clamp(float3 v, float3 lo, float3 hi)
{
    return make_float3(clamp(v.x, lo.x, hi.x), clamp(v.y, lo.y, hi.y), clamp(v.z, lo.z, hi.z));
}
// dot product
inline __host__ __device__ float dot(float3 u, float3 w)
{
    return u.x * w.x + u.y * w.y + u.z * w.z;
}
// right-handed cross product
inline __host__ __device__ float3 cross(float3 u, float3 w)
{
    return make_float3(u.y*w.z - u.z*w.y, u.z*w.x - u.x*w.z, u.x*w.y - u.y*w.x);
}
// Euclidean length
inline __host__ __device__ float length(float3 u)
{
    return sqrtf(dot(u, u));
}
// unit-length copy (no guard against zero-length input)
inline __host__ __device__ float3 normalize(float3 u)
{
    return u * (1.0f / sqrtf(dot(u, u)));
}
// component-wise floor
inline __host__ __device__ float3 floor(const float3 u)
{
    return make_float3(floorf(u.x), floorf(u.y), floorf(u.z));
}
// exact component-wise equality test
inline __host__ __device__ bool validate(const float3 v0, const float3 v1)
{
    return (v0.x == v1.x && v0.y == v1.y && v0.z == v1.z);
}
// float4 functions
////////////////////////////////////////////////////////////////////////////////
// additional constructors
// float4: additional constructors, min/max, and component-wise arithmetic operators.
inline __host__ __device__ float4 make_float4(float s)
{
    return make_float4(s, s, s, s);
}
inline __host__ __device__ float4 make_float4(float3 v)
{
    return make_float4(v.x, v.y, v.z, 0.0f);
}
inline __host__ __device__ float4 make_float4(float3 v, float w)
{
    return make_float4(v.x, v.y, v.z, w);
}
inline __host__ __device__ float4 make_float4(int4 v)
{
    return make_float4(float(v.x), float(v.y), float(v.z), float(v.w));
}
// component-wise min / max
static __inline__ __host__ __device__ float4 fminf(float4 lhs, float4 rhs)
{
    return make_float4(fminf(lhs.x, rhs.x), fminf(lhs.y, rhs.y), fminf(lhs.z, rhs.z), fminf(lhs.w, rhs.w));
}
static __inline__ __host__ __device__ float4 fmaxf(float4 lhs, float4 rhs)
{
    return make_float4(fmaxf(lhs.x, rhs.x), fmaxf(lhs.y, rhs.y), fmaxf(lhs.z, rhs.z), fmaxf(lhs.w, rhs.w));
}
// addition
inline __host__ __device__ float4 operator+(float4 lhs, float4 rhs)
{
    return make_float4(lhs.x + rhs.x, lhs.y + rhs.y, lhs.z + rhs.z, lhs.w + rhs.w);
}
inline __host__ __device__ void operator+=(float4 &lhs, float4 rhs)
{
    lhs = lhs + rhs;
}
// subtraction
inline __host__ __device__ float4 operator-(float4 lhs, float4 rhs)
{
    return make_float4(lhs.x - rhs.x, lhs.y - rhs.y, lhs.z - rhs.z, lhs.w - rhs.w);
}
inline __host__ __device__ void operator-=(float4 &lhs, float4 rhs)
{
    lhs = lhs - rhs;
}
// multiplication by scalar
inline __host__ __device__ float4 operator*(float4 v, float scalar)
{
    return make_float4(v.x * scalar, v.y * scalar, v.z * scalar, v.w * scalar);
}
inline __host__ __device__ float4 operator*(float scalar, float4 v)
{
    return v * scalar;
}
inline __host__ __device__ void operator*=(float4 &v, float scalar)
{
    v.x *= scalar;
    v.y *= scalar;
    v.z *= scalar;
    v.w *= scalar;
}
// division: component-wise; the scalar form multiplies by the reciprocal
inline __host__ __device__ float4 operator/(float4 lhs, float4 rhs)
{
    return make_float4(lhs.x / rhs.x, lhs.y / rhs.y, lhs.z / rhs.z, lhs.w / rhs.w);
}
inline __host__ __device__ float4 operator/(float4 v, float scalar)
{
    const float recip = 1.0f / scalar;
    return v * recip;
}
// Scalar divided by vector: component-wise s / a.
// BUG FIX: the previous implementation returned a * (1/s), i.e. a / s — it
// duplicated operator/(float4, float) instead of dividing the scalar by each
// component (matches CUDA's helper_math.h definition of this overload).
inline __host__ __device__ float4 operator/(float s, float4 a)
{
    return make_float4(s / a.x, s / a.y, s / a.z, s / a.w);
}
inline __host__ __device__ void operator/=(float4 &v, float scalar)
{
    const float recip = 1.0f / scalar;
    v *= recip;
}
// linear interpolation: yields a at t == 0 and b at t == 1
inline __device__ __host__ float4 lerp(float4 a, float4 b, float t)
{
    return a + t*(b-a);
}
// component-wise clamp against scalar and per-component bounds
inline __device__ __host__ float4 clamp(float4 v, float lo, float hi)
{
    return make_float4(clamp(v.x, lo, hi), clamp(v.y, lo, hi), clamp(v.z, lo, hi), clamp(v.w, lo, hi));
}
inline __device__ __host__ float4 clamp(float4 v, float4 lo, float4 hi)
{
    return make_float4(clamp(v.x, lo.x, hi.x), clamp(v.y, lo.y, hi.y), clamp(v.z, lo.z, hi.z), clamp(v.w, lo.w, hi.w));
}
// 4-component dot product (includes w)
inline __host__ __device__ float dot(float4 u, float4 w)
{
    return u.x * w.x + u.y * w.y + u.z * w.z + u.w * w.w;
}
// Euclidean length
inline __host__ __device__ float length(float4 u)
{
    return sqrtf(dot(u, u));
}
// unit-length copy (no guard against zero-length input)
inline __host__ __device__ float4 normalize(float4 u)
{
    return u * (1.0f / sqrtf(dot(u, u)));
}
// component-wise floor
inline __host__ __device__ float4 floor(const float4 u)
{
    return make_float4(floorf(u.x), floorf(u.y), floorf(u.z), floorf(u.w));
}
// exact component-wise equality test
inline __host__ __device__ bool validate(const float4 v0, const float4 v1)
{
    return (v0.x == v1.x && v0.y == v1.y && v0.z == v1.z && v0.w == v1.w);
}
// int3 functions
////////////////////////////////////////////////////////////////////////////////
// additional constructors
// int3: additional constructors, min/max, and component-wise arithmetic operators.
inline __host__ __device__ int3 make_int3(int s)
{
    return make_int3(s, s, s);
}
inline __host__ __device__ int3 make_int3(float3 v)
{
    return make_int3(int(v.x), int(v.y), int(v.z)); // truncates toward zero
}
// component-wise min / max
inline __host__ __device__ int3 min(int3 lhs, int3 rhs)
{
    return make_int3(min(lhs.x, rhs.x), min(lhs.y, rhs.y), min(lhs.z, rhs.z));
}
inline __host__ __device__ int3 max(int3 lhs, int3 rhs)
{
    return make_int3(max(lhs.x, rhs.x), max(lhs.y, rhs.y), max(lhs.z, rhs.z));
}
// addition
inline __host__ __device__ int3 operator+(int3 lhs, int3 rhs)
{
    return make_int3(lhs.x + rhs.x, lhs.y + rhs.y, lhs.z + rhs.z);
}
inline __host__ __device__ void operator+=(int3 &lhs, int3 rhs)
{
    lhs = lhs + rhs;
}
// subtraction
inline __host__ __device__ int3 operator-(int3 lhs, int3 rhs)
{
    return make_int3(lhs.x - rhs.x, lhs.y - rhs.y, lhs.z - rhs.z);
}
inline __host__ __device__ void operator-=(int3 &lhs, int3 rhs)
{
    lhs = lhs - rhs;
}
// multiplication: component-wise and by scalar
inline __host__ __device__ int3 operator*(int3 lhs, int3 rhs)
{
    return make_int3(lhs.x * rhs.x, lhs.y * rhs.y, lhs.z * rhs.z);
}
inline __host__ __device__ int3 operator*(int3 v, int s)
{
    return make_int3(v.x * s, v.y * s, v.z * s);
}
inline __host__ __device__ int3 operator*(int s, int3 v)
{
    return v * s;
}
inline __host__ __device__ void operator*=(int3 &v, int s)
{
    v.x *= s;
    v.y *= s;
    v.z *= s;
}
// division: component-wise and by scalar (integer division)
inline __host__ __device__ int3 operator/(int3 lhs, int3 rhs)
{
    return make_int3(lhs.x / rhs.x, lhs.y / rhs.y, lhs.z / rhs.z);
}
inline __host__ __device__ int3 operator/(int3 v, int s)
{
    return make_int3(v.x / s, v.y / s, v.z / s);
}
// Scalar divided by vector: component-wise s / a.
// BUG FIX: the previous implementation returned make_int3(a.x / s, ...), i.e.
// a / s — it duplicated operator/(int3, int) instead of dividing the scalar by
// each component (matches CUDA's helper_math.h definition of this overload).
inline __host__ __device__ int3 operator/(int s, int3 a)
{
    return make_int3(s / a.x, s / a.y, s / a.z);
}
inline __host__ __device__ void operator/=(int3 &v, int s)
{
    v.x /= s;
    v.y /= s;
    v.z /= s;
}
// scalar integer clamp to [lo, hi]
inline __device__ __host__ int clamp(int f, int lo, int hi)
{
    return max(lo, min(f, hi));
}
// component-wise clamp against scalar and per-component bounds
inline __device__ __host__ int3 clamp(int3 v, int lo, int hi)
{
    return make_int3(clamp(v.x, lo, hi), clamp(v.y, lo, hi), clamp(v.z, lo, hi));
}
inline __device__ __host__ int3 clamp(int3 v, int3 lo, int3 hi)
{
    return make_int3(clamp(v.x, lo.x, hi.x), clamp(v.y, lo.y, hi.y), clamp(v.z, lo.z, hi.z));
}
// exact component-wise equality test
inline __host__ __device__ bool validate(const int3 v0, const int3 v1)
{
    return (v0.x == v1.x && v0.y == v1.y && v0.z == v1.z);
}
// uint3 functions
////////////////////////////////////////////////////////////////////////////////
// additional constructors
// uint3/uint4: additional constructors, min/max, and component-wise arithmetic operators.
inline __host__ __device__ uint3 make_uint3(uint s)
{
    return make_uint3(s, s, s);
}
inline __host__ __device__ uint4 make_uint4(uint s)
{
    return make_uint4(s, s, s, s);
}
inline __host__ __device__ uint3 make_uint3(float3 v)
{
    return make_uint3(uint(v.x), uint(v.y), uint(v.z)); // truncates toward zero
}
// component-wise min / max
inline __host__ __device__ uint3 min(uint3 lhs, uint3 rhs)
{
    return make_uint3(min(lhs.x, rhs.x), min(lhs.y, rhs.y), min(lhs.z, rhs.z));
}
inline __host__ __device__ uint3 max(uint3 lhs, uint3 rhs)
{
    return make_uint3(max(lhs.x, rhs.x), max(lhs.y, rhs.y), max(lhs.z, rhs.z));
}
// addition
inline __host__ __device__ uint3 operator+(uint3 lhs, uint3 rhs)
{
    return make_uint3(lhs.x + rhs.x, lhs.y + rhs.y, lhs.z + rhs.z);
}
inline __host__ __device__ void operator+=(uint3 &lhs, uint3 rhs)
{
    lhs = lhs + rhs;
}
// subtraction (unsigned: wraps on underflow)
inline __host__ __device__ uint3 operator-(uint3 lhs, uint3 rhs)
{
    return make_uint3(lhs.x - rhs.x, lhs.y - rhs.y, lhs.z - rhs.z);
}
inline __host__ __device__ void operator-=(uint3 &lhs, uint3 rhs)
{
    lhs = lhs - rhs;
}
// multiplication: component-wise and by scalar
inline __host__ __device__ uint3 operator*(uint3 lhs, uint3 rhs)
{
    return make_uint3(lhs.x * rhs.x, lhs.y * rhs.y, lhs.z * rhs.z);
}
inline __host__ __device__ uint3 operator*(uint3 v, uint s)
{
    return make_uint3(v.x * s, v.y * s, v.z * s);
}
inline __host__ __device__ uint3 operator*(uint s, uint3 v)
{
    return v * s;
}
inline __host__ __device__ void operator*=(uint3 &v, uint s)
{
    v.x *= s;
    v.y *= s;
    v.z *= s;
}
// division: component-wise and by scalar (integer division)
inline __host__ __device__ uint3 operator/(uint3 lhs, uint3 rhs)
{
    return make_uint3(lhs.x / rhs.x, lhs.y / rhs.y, lhs.z / rhs.z);
}
inline __host__ __device__ uint3 operator/(uint3 v, uint s)
{
    return make_uint3(v.x / s, v.y / s, v.z / s);
}
// Scalar divided by vector: component-wise s / a.
// BUG FIX: the previous implementation returned make_uint3(a.x / s, ...), i.e.
// a / s — it duplicated operator/(uint3, uint) instead of dividing the scalar
// by each component (matches CUDA's helper_math.h definition of this overload).
inline __host__ __device__ uint3 operator/(uint s, uint3 a)
{
    return make_uint3(s / a.x, s / a.y, s / a.z);
}
inline __host__ __device__ void operator/=(uint3 &v, uint s)
{
    v.x /= s;
    v.y /= s;
    v.z /= s;
}
// scalar unsigned clamp to [lo, hi]
inline __device__ __host__ uint clamp(uint f, uint lo, uint hi)
{
    return max(lo, min(f, hi));
}
// component-wise clamp against scalar and per-component bounds
inline __device__ __host__ uint3 clamp(uint3 v, uint lo, uint hi)
{
    return make_uint3(clamp(v.x, lo, hi), clamp(v.y, lo, hi), clamp(v.z, lo, hi));
}
inline __device__ __host__ uint3 clamp(uint3 v, uint3 lo, uint3 hi)
{
    return make_uint3(clamp(v.x, lo.x, hi.x), clamp(v.y, lo.y, hi.y), clamp(v.z, lo.z, hi.z));
}
// exact component-wise equality test
inline __host__ __device__ bool validate(const uint3 v0, const uint3 v1)
{
    return (v0.x == v1.x && v0.y == v1.y && v0.z == v1.z);
}
//other
// Word/byte packing helpers.
// high and low 16-bit halves of a 32-bit value
inline __device__ __host__ ushort u32High(uint val)
{
    return ushort(val >> 16);
}
inline __device__ __host__ ushort u32Low(uint val)
{
    return ushort(val & 0xFFFFu);
}
// recombine two 16-bit halves into a 32-bit value
inline __device__ __host__ uint merge(ushort hi, ushort lo)
{
    return (uint(hi) << 16) | uint(lo);
}
// high and low 8-bit halves of a 16-bit value
inline __device__ __host__ uchar u16High(ushort val)
{
    return uchar(val >> 8);
}
inline __device__ __host__ uchar u16Low(ushort val)
{
    return uchar(val & 0xFFu);
}
// recombine two 8-bit halves into a 16-bit value
inline __device__ __host__ ushort merge(uchar hi, uchar lo)
{
    return ushort((ushort(hi) << 8) | ushort(lo));
}
#endif

View File

@@ -0,0 +1,75 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef MATHS_EXTENSIONS_H
#define MATHS_EXTENSIONS_H
#include "cutil_math.h"
#include "foundation/PxSimpleTypes.h"
#include "foundation/PxQuat.h"
#include "foundation/PxVec3.h"
namespace physx
{
// Dot product of the xyz parts of two float4s (w components are ignored).
PX_CUDA_CALLABLE PX_FORCE_INLINE PxReal dot3(const float4& x, const float4& y)
{
    return x.x * y.x + x.y * y.y + x.z * y.z;
}
// Rotates the xyz part of v by quaternion r; the w of the result is 0.
// Expanded quaternion sandwich product using doubled components and a
// (w^2 - 1/2) scale factor.
PX_CUDA_CALLABLE PX_FORCE_INLINE float4 rotate(const PxQuat& r, const float4& v)
{
    const PxF32 dx = 2.0f * v.x;
    const PxF32 dy = 2.0f * v.y;
    const PxF32 dz = 2.0f * v.z;
    const PxF32 w2 = r.w * r.w - 0.5f;
    const PxF32 d2 = (r.x * dx + r.y * dy + r.z * dz);
    const PxF32 rx = (dx * w2 + (r.y * dz - r.z * dy) * r.w + r.x * d2);
    const PxF32 ry = (dy * w2 + (r.z * dx - r.x * dz) * r.w + r.y * d2);
    const PxF32 rz = (dz * w2 + (r.x * dy - r.y * dx) * r.w + r.z * d2);
    return make_float4(rx, ry, rz, 0.f);
}
// Cross product of the xyz parts; the w of the result is 0.
PX_CUDA_CALLABLE PX_FORCE_INLINE float4 cross3(const float4& a, const float4& b)
{
    return make_float4(a.y * b.z - a.z * b.y,
                       a.z * b.x - a.x * b.z,
                       a.x * b.y - a.y * b.x,
                       0.f);
}
// Component-wise negation (including w).
PX_CUDA_CALLABLE PX_FORCE_INLINE float4 operator - (const float4& v)
{
    return make_float4(-v.x, -v.y, -v.z, -v.w);
}
}
#endif

View File

@@ -0,0 +1,106 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
#ifndef PXG_MEM_COPY_BALANCED_CU
#define PXG_MEM_COPY_BALANCED_CU
#include "foundation/PxMath.h"
#include <assert.h>
#include <stdio.h>
#include "PxgCopyManager.h"
#include "PxgCommonDefines.h"
using namespace physx;
extern "C" __host__ void initCommonKernels0() {}
// Copies the descriptor desc[blockIdx.x] (source -> dest, 'bytes' long) with all
// threads of the block cooperating on a word-wise copy.
// NOTE(review): threadIdx.x / threadIdx.y usage implies a (WARP_SIZE, warpsPerBlock)
// block layout, and bytes is divided by 4 without a remainder check — confirm both
// at the launch site.
template<PxU32 warpsPerBlock>
__device__ void copyBalanced(
	PxgCopyManager::CopyDesc* PX_RESTRICT desc, /* Input */
	PxU32 count /* Input */
)
{
	__shared__ PxgCopyManager::CopyDesc copyDesc[warpsPerBlock];
	if (blockIdx.x < count)
	{
		const PxU32 idxInWarp = threadIdx.x;
		const PxU32 warpIdxInBlock = threadIdx.y;
		// Lane 0 of each warp stages the block's descriptor into this warp's
		// shared-memory slot; __syncwarp() suffices because each warp reads
		// only its own slot afterwards.
		if (idxInWarp == 0)
		{
			PxgCopyManager::CopyDesc d = desc[blockIdx.x];
			copyDesc[warpIdxInBlock] = d;
		}
		__syncwarp();
		PxU32* srcPtr = reinterpret_cast<PxU32*>(copyDesc[warpIdxInBlock].source);
		PxU32* dstPtr = reinterpret_cast<PxU32*>(copyDesc[warpIdxInBlock].dest);
		PxU32 size = copyDesc[warpIdxInBlock].bytes / 4; //Size is in bytes, we're reading words...
		// Flattened thread index within the block.
		PxU32 groupThreadIdx = threadIdx.x + threadIdx.y * WARP_SIZE;
		// Block-strided word copy: thread i copies words i, i + blockSize, ...
		for (PxU32 a = groupThreadIdx; a < size; a += WARP_SIZE * warpsPerBlock)
		{
			PxU32 sourceVal = srcPtr[a];
			dstPtr[a] = sourceVal;
		}
	}
}
// Kernel entry point: forwards to the shared copy implementation with the
// configured warps-per-block count.
extern "C"
__global__
void MemCopyBalanced(PxgCopyManager::CopyDesc* PX_RESTRICT desc, PxU32 count)
{
	copyBalanced<COPY_KERNEL_WARPS_PER_BLOCK>(desc, count);
}
// Clamps *value down to maxValue.
extern "C" __global__ void clampMaxValue(PxU32* value, const PxU32 maxValue)
{
	const PxU32 current = *value;
	if (current > maxValue)
		*value = maxValue;
}
// temporary clamping function for contact counts: will be generalized in the future.
// Clamps each of the three counters down to maxValue.
extern "C" __global__ void clampMaxValues(PxU32* value0, PxU32* value1, PxU32* value2, const PxU32 maxValue)
{
	PxU32* values[3] = { value0, value1, value2 };
	for (PxU32 i = 0; i < 3; ++i)
	{
		if (*values[i] > maxValue)
			*values[i] = maxValue;
	}
}
#endif

View File

@@ -0,0 +1,128 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __CU_MEMORY_ALLOCATOR_CUH__
#define __CU_MEMORY_ALLOCATOR_CUH__
#include "cutil_math.h"
#include "stdio.h"
// Simple bump allocator over a caller-provided (typically shared-memory) buffer.
// State is public so ScratchMemoryMarker can save and restore the cursor.
class ScratchMemoryAllocator
{
public:
	__device__ ScratchMemoryAllocator(uchar* mem, uint allocatedSize)
		: startPtr(mem), totalAllocatedSize(allocatedSize), currentSize(0)
	{
	}

	// Hands out requestedSize bytes from the cursor, or NULL when the buffer
	// is exhausted (the cursor is left untouched in that case).
	template <typename T>
	__device__ T* alloc(uint requestedSize)
	{
		if (totalAllocatedSize < (currentSize + requestedSize))
		{
			printf("alloc out of sharedMemory !\n");
			return NULL;
		}
		T* result = reinterpret_cast<T*>(startPtr + currentSize);
		currentSize += requestedSize;
		return result;
	}

	// As alloc(), but the returned pointer is aligned to 'alignment' (assumed to
	// be a power of two); any padding needed to reach the aligned address is
	// consumed from the buffer as well.
	template <typename T>
	__device__ T* allocAligned(uint requestedSize, size_t alignment = 4)
	{
		const size_t baseAddress = size_t(startPtr + currentSize);
		const size_t alignedAddress = (baseAddress + size_t(alignment - 1)) & (~(size_t(alignment - 1)));
		const uint paddingBytes = uint(alignedAddress - baseAddress);
		const uint totalBytes = requestedSize + paddingBytes;
		if (totalAllocatedSize < (currentSize + totalBytes))
		{
			printf("allocAligned out of sharedMemory allocating %i bytes!\n", requestedSize);
			return NULL;
		}
		currentSize += totalBytes;
		return reinterpret_cast<T*>(alignedAddress);
	}

	uchar* startPtr;			// base of the managed buffer
	uint totalAllocatedSize;	// capacity in bytes
	uint currentSize;			// bump cursor: bytes already handed out
};
// RAII guard: remembers the allocator cursor at construction and rewinds to it
// on destruction (or earlier via reset()), releasing everything allocated since.
class ScratchMemoryMarker
{
	uint savedSize;						// cursor value to restore
	ScratchMemoryAllocator& allocator;
public:
	__device__ ScratchMemoryMarker(ScratchMemoryAllocator& allocator_) : savedSize(allocator_.currentSize), allocator(allocator_)
	{
	}
	__device__ ~ScratchMemoryMarker()
	{
		allocator.currentSize = savedSize;
	}
	__device__ void reset()
	{
		allocator.currentSize = savedSize;
	}
};
// Array whose first SharedCapacity elements live in an externally supplied
// buffer (shBuff) and whose remaining Capacity - SharedCapacity elements live
// in member storage.
// NOTE(review): requires Capacity > SharedCapacity — equal values would declare
// a zero-length array, which is non-standard; confirm all instantiations.
template <typename Type, int SharedCapacity, int Capacity>
class HybridSharedArray
{
	Type* sharedBuffer;						// first SharedCapacity elements (not owned)
	Type locBuff[Capacity - SharedCapacity];	// overflow elements beyond the shared portion
public:
	HybridSharedArray(Type* shBuff) : sharedBuffer(shBuff)
	{
	}
	// Indices below SharedCapacity resolve to the external buffer, the rest to local storage.
	PX_FORCE_INLINE const Type& operator[] (const uint index) const { return index < SharedCapacity ? sharedBuffer[index] : locBuff[index-SharedCapacity];}
	PX_FORCE_INLINE Type& operator[] (const uint index) { return index < SharedCapacity ? sharedBuffer[index] : locBuff[index - SharedCapacity]; }
};
#endif

View File

@@ -0,0 +1,762 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
#ifndef __CU_RADIX_SORT_CUH__
#define __CU_RADIX_SORT_CUH__
#include "foundation/PxPreprocessor.h"
#include "vector_types.h"
#include "PxgRadixSortDesc.h"
#include "PxgRadixSortKernelIndices.h"
#include "PxgCommonDefines.h"
#include "stdio.h"
#include "reduction.cuh"
#include "foundation/PxMath.h"
#include <assert.h>
using namespace physx;
#define RADIX_SIZE 16
#define RADIX_ACCUM_SIZE 8
// Extracts the 4-bit digit starting at startBit from each component of input.
static __device__ uint4 getRadix(uint4 input, const PxU32 startBit)
{
	uint4 digit;
	digit.x = (input.x >> startBit) & 0xF;
	digit.y = (input.y >> startBit) & 0xF;
	digit.z = (input.z >> startBit) & 0xF;
	digit.w = (input.w >> startBit) & 0xF;
	return digit;
}
// Scalar variant: the 4-bit digit of input starting at startBit.
static __device__ PxU32 getRadix(PxU32 input, const PxU32 startBit)
{
	return (input >> startBit) & 0xF;
}
//accumulated each individual warps to a responding radix
//radix0[0, 7] ==> radixSum[7], radix1[8, 15] ==> radixSum[15], radix2[16, 23] ==>radixSum[23], radix3[24, 31] ==> radixSum[31],
//radix4[32, 39] ==> radixSum[39], radix5[40, 47] ==>radixSum[47], radix6[48, 55] ==>radixSum[55], radix7[56, 63] ==>radixSum[63]
//
// Segmented prefix sum within a warp: lanes are grouped into runs of
// WARP_PERBLOCK_SIZE consecutive lanes, and each group is scanned with
// shuffle-based doubling steps; the guard on radixIndex keeps sums from
// crossing group boundaries. Returns the scan result minus 'value'
// (callers pass 0 to get the inclusive scan).
// NOTE(review): the 'radixSum' parameter is neither read nor written here —
// presumably kept for signature symmetry; confirm before removing.
template <PxU32 WARP_PERBLOCK_SIZE>
static __device__ PxU32 scanRadixWarps(const PxU32 threadIndexInWarp, PxU32* radixSum, const PxU32 originalVal, const PxU32 value)
{
	const PxU32 idx = threadIdx.x;
	// Lane's position within its WARP_PERBLOCK_SIZE-wide group.
	const PxU32 radixIndex = threadIndexInWarp & (WARP_PERBLOCK_SIZE-1);
	int val = originalVal;
	for(PxU32 a = 1; a < WARP_PERBLOCK_SIZE; a*=2)
	{
		int temp = __shfl_sync(FULL_MASK, val, idx-a);
		// Only accumulate when the source lane is inside the same group.
		if(radixIndex >= a)
		{
			val += temp;
		}
	}
	return val - value;
}
// For each radix digit handled by this warp, scans the per-block counts in
// gData (gridDim.x entries per digit) via warpScanAddWriteToSharedMem into
// sData, and stores the digit's grand total into accumulateBuffer[i].
// NOTE(review): one warp covers all per-block entries of a digit, which
// presumes gridDim.x == WARP_SIZE — confirm with the launch configuration.
template <PxU32 WARP_PERBLOCK_SIZE>
static __device__ void scanRadixes(const PxU32 warpIndexInBlock, const PxU32 threadIndexInWarp, PxU32* PX_RESTRICT gData, PxU32* PX_RESTRICT sData, PxU32* PX_RESTRICT accumulateBuffer)
{
	// Warps of the block stride over the RADIX_SIZE digits.
	for(PxU32 i=warpIndexInBlock; i<RADIX_SIZE; i+=WARP_PERBLOCK_SIZE)
	{
		const PxU32 radixSumIndex = i*gridDim.x + threadIndexInWarp;
		const PxU32 value = gData[radixSumIndex];
		PxU32 output = warpScanAddWriteToSharedMem<WARP_SIZE>(FULL_MASK, radixSumIndex, threadIndexInWarp, sData, value, value);
		// The last lane's scan result plus its own value is the digit's total
		// across all blocks (assuming the helper returns the exclusive sum —
		// NOTE(review): confirm against warpScanAddWriteToSharedMem).
		if(threadIndexInWarp == (WARP_SIZE-1))
			accumulateBuffer[i] = output + value;
	}
}
//there are 256 threads in a block and therefore 8 warp in a block
// Pads the tail components of the last uint4 with 0xFFffFFff sentinels so that
// keys beyond 'count' sort to the end. 'id' is the flattened index of
// keyValue.x; returns the number of valid keys left in this uint4.
__device__ inline PxU32 sanitizeKeys(uint4& keyValue, PxU32 id, const PxU32 count)
{
	const PxU32 goodVals = count - id;
	if (goodVals < 4)
	{
		// badVals is in [1, 4]; 4 (a fully out-of-range uint4) is left untouched,
		// exactly as the original switch (which had no case for it) behaved.
		const PxU32 badVals = 4 - goodVals;
		if (badVals <= 3)
		{
			if (badVals >= 3)
				keyValue.y = 0xFFffFFff;
			if (badVals >= 2)
				keyValue.z = 0xFFffFFff;
			keyValue.w = 0xFFffFFff;
		}
	}
	return goodVals;
}
// Per-thread digit histogram for the 4-bit radix passes: 16 counters packed
// two per PxU32 (16 bits each) to halve the register footprint — see the
// packing comments in radixSortWarp.
struct RadixAccum
{
	PxU32 radixAccum[RADIX_ACCUM_SIZE];
};
// Histogram pass: each thread reads uint4s of keys with stride 'stride',
// sanitizes the tail, extracts the 4-bit digit at startBit, and accumulates
// per-digit counts into the packed radixAccum histogram.
__device__ inline void radixSortWarp(const uint4* PX_RESTRICT gInputKeys, const PxU32 idx, const PxU32 count, const PxU32 stride,
	const PxU32 startBit, const PxU32 startIdx, const PxU32 totalCount, RadixAccum& radixAccum)
{
	//HISTOGRAM-KEYS && SCAN-BUCKET
	uint4 keyValue;
	uint4 radix;
	for(PxU32 i = idx; i < count; i += stride)
	{
		const PxU32 gInputIdx = i + startIdx;
		keyValue = gInputKeys[gInputIdx];
		const PxU32 nbVals = sanitizeKeys(keyValue, gInputIdx * 4, totalCount);
		radix = getRadix(keyValue, startBit);
		//each thread read 4 elements. We store the each element's radix[0, 15] into a local array. The code should be
		//radixAccum[radix.x]++; radixAccum[radix.y]++; radixAccum[radix.z]++; radixAccum[radix.w]++;
		//However, in order to save register, each radixAccum can store 2 radix's accumulation result. Therefore, radixAccum is
		//half of the size of RADIX_SIZE and each radix has 16 bits in the radixAccum.
		//The for loop is used to trick the compiler to keep radixAccum array in registers
		#pragma unroll
		for (PxU32 bit = 0; bit < 16; bit += 2)
		{
			// A digit equal to 'bit' contributes 1 << 0 (low half), 'bit'+1
			// contributes 1 << 16 (high half); any other digit produces a shift
			// amount >= 32. NOTE(review): this relies on such oversized shifts
			// yielding 0 — true for the GPU shift instruction, but UB in
			// standard C++, so this accumulation is device-only; confirm.
			PxU32 accum = (1u << ((radix.x - bit) << 4));
			accum += (1u << ((radix.y - bit) << 4));
			accum += (1u << ((radix.z - bit) << 4));
			accum += (1u << ((radix.w - bit) << 4));
			radixAccum.radixAccum[bit / 2] += accum;
		}
	}
}
// First radix-sort pass: builds per-block histograms of the 4-bit digit at
// startBit and writes one count per (digit, block) pair into gRadixCount.
// NOTE(review): gInputRanks is unused here — presumably kept so all radix-sort
// kernels share a signature; confirm before removing.
template <PxU32 WARP_PERBLOCK_SIZE>
__device__ inline void radixSortSingleBlock(const uint4* PX_RESTRICT gInputKeys, const uint4* PX_RESTRICT gInputRanks, const PxU32 gNumKeys, const PxU32 startBit, PxU32* gRadixCount)
{
	const PxU32 nbBlocks = PxgRadixSortKernelGridDim::RADIX_SORT;
	PX_COMPILE_TIME_ASSERT(nbBlocks == 32);
	__shared__ PxU32 sRadixSum[RADIX_SIZE*WARP_PERBLOCK_SIZE];
	//the number of inputKeys is random. However, this algorithm sort inputKeys at a time, so that we need to initialize the number of keys properly
	// Number of uint4s covering gNumKeys individual keys (rounded up).
	const PxU32 numKeys = (gNumKeys+3)/4;
	const PxU32 totalBlockRequired = (numKeys + (blockDim.x-1))/ blockDim.x;
	const PxU32 numIterationPerBlock = (totalBlockRequired + (nbBlocks-1))/ nbBlocks;
	//This identifies which warp a specific thread is in, we treat all warps in all blocks as a flatten warp array
	//and we are going to index the work based on that
	//const PxU32 warpIndex = blockIdx.x * blockStride + threadIdx.x/WARP_SIZE;
	const PxU32 warpIndexInBlock = threadIdx.x/WARP_SIZE;
	//This identifies which thread within a warp a specific thread is
	const PxU32 threadIndexInWarp = threadIdx.x&(WARP_SIZE-1);
	const PxU32 idx = threadIdx.x;
	// Zero the packed per-thread histogram (2 digits per entry, 16 bits each).
	RadixAccum radixAccum;
	#pragma unroll
	for(PxU32 i=0; i< RADIX_ACCUM_SIZE; ++i)
	{
		radixAccum.radixAccum[i] = 0;
	}
	// This block's contiguous slice of the uint4 key array.
	const PxU32 inputKeyIndex = PxMin(numIterationPerBlock * blockIdx.x * blockDim.x, numKeys);
	const PxU32 endIndex = PxMin(inputKeyIndex + numIterationPerBlock*blockDim.x, numKeys);
	const PxU32 count = endIndex - inputKeyIndex;
	radixSortWarp(gInputKeys, idx, count, WARP_SIZE*WARP_PERBLOCK_SIZE, startBit, inputKeyIndex, gNumKeys, radixAccum);
	PxU32 accumValue = 0;
	// Reduce the packed per-thread histograms across each warp; lanes 0/1
	// unpack the low/high 16-bit halves (digits i and i+1) into sRadixSum.
	#pragma unroll
	for(PxU32 i=0; i<RADIX_SIZE; i+=2)
	{
		const PxU32 accum = radixAccum.radixAccum[i/2];
		const PxU32 val = warpScanAdd<WARP_SIZE>(FULL_MASK, idx, threadIndexInWarp, (PxU32*)NULL, accum, accumValue);
		const PxU32 val2 = __shfl_sync(FULL_MASK, (int)val, (WARP_SIZE - 1));// getLastElementValueInAWarp(val, idx, sData, WARP_SIZE);
		if(threadIndexInWarp < 2)
		{
			sRadixSum[(i+threadIndexInWarp)*WARP_PERBLOCK_SIZE + idx/WARP_SIZE ] = (val2 >> (threadIndexInWarp*16)) & 0xFFFF;
		}
	}
	__syncthreads();
	//unsigned mask_warpIndexInBlock = __ballot_sync(syncMask, warpIndexInBlock < (WARP_PERBLOCK_SIZE / 2));
	// Combine the per-warp digit counts into per-block totals and publish one
	// count per (digit, block) pair to global memory.
	if (warpIndexInBlock < (WARP_PERBLOCK_SIZE/2))
	{
		const PxU32 originalValue = sRadixSum[idx];
		const PxU32 output = scanRadixWarps<WARP_PERBLOCK_SIZE>(threadIndexInWarp, sRadixSum, originalValue, 0);//output is the value should be in sRadixSum[idx]
		if((idx & (WARP_PERBLOCK_SIZE-1)) == (WARP_PERBLOCK_SIZE-1))
		{
			//copy to global memory
			//const PxU32 gRadixIndex = blockIdx.x * RADIX_SIZE + idx;
			const PxU32 gRadixIndex = blockIdx.x + idx/WARP_PERBLOCK_SIZE * gridDim.x;
			//gRadixCount have 16 radix and each radix has 32 blocks. Each block in gRadixCount store the numbers of elements in each radix
			gRadixCount[ gRadixIndex ] = output;
		}
	}
}
// Rank-and-scatter phase of the multi-block radix sort pass.
// Consumes the per-block, per-radix histograms in gRadixCount (written by the
// counting phase), turns them into global write offsets, then re-reads the
// keys, ranks each one within its 4-bit digit (at 'startBit') and scatters
// keys + ranks to the output buffers in digit order.
//  - gInputKeys/gInputRanks: keys and their current ranks, read as uint4 (4 keys at a time)
//  - gNumOfKeys: number of PxU32 keys (NOT the number of uint4 elements)
//  - gRadixCount: per-radix, per-block counts (each radix row holds gridDim.x block entries)
//  - gOutputKeys/gOutputRanks: destinations for this pass
template <PxU32 WARP_PERBLOCK_SIZE>
__device__ inline void radixSortCalculateRanks(const uint4* PX_RESTRICT gInputKeys, const uint4* PX_RESTRICT gInputRanks, const PxU32 gNumOfKeys, const PxU32 startBit, PxU32* gRadixCount, PxU32* gOutputKeys, PxU32* gOutputRanks)
{
	// The pass is hard-wired for a 32-block launch (see the gRadixIndex math in the count phase).
	const PxU32 nbBlocks = PxgRadixSortKernelGridDim::RADIX_SORT;
	PX_COMPILE_TIME_ASSERT(nbBlocks == 32);

	// Running global start offset of each radix for this block ("how many 0 before 1").
	__shared__ PxU32 sRadixSumBetweenBlocks[RADIX_SIZE];
	// One backing allocation carved into several views below.
	__shared__ PxU32 sRadixCountBetweenBlocks[RADIX_SIZE * (WARP_PERBLOCK_SIZE +2) + WARP_PERBLOCK_SIZE * WARP_SIZE];
	PxU32* sBuckets = &sRadixCountBetweenBlocks[0];						// WARP_PERBLOCK_SIZE*WARP_SIZE scratch for the warp scans
	PxU32* sRadixSum = sBuckets + WARP_PERBLOCK_SIZE * WARP_SIZE;		// RADIX_SIZE*WARP_PERBLOCK_SIZE per-warp digit counts/offsets
	PxU32* sRadixSumSum = sRadixSum + RADIX_SIZE*WARP_PERBLOCK_SIZE;	// RADIX_SIZE per-tile digit runsum
	PxU32* sRadixCount = sRadixSumSum + RADIX_SIZE;						// RADIX_SIZE per-tile digit totals

	// Staging area: one tile of keys/ranks, locally sorted by digit before the global flush.
	__shared__ PxU32 sKeys[WARP_SIZE *WARP_PERBLOCK_SIZE * 4];
	__shared__ PxU32 sRanks[WARP_SIZE * WARP_PERBLOCK_SIZE * 4];

	//in the CPU we pass an array of PxU32 as the source inputKeys, therefore, we need to get the correct number of keys in GPU
	const PxU32 numKeys = (gNumOfKeys+3)/4;	// number of uint4 elements, rounded up

	const PxU32 idx = threadIdx.x;
	//const PxU32 gridThreadIdx = idx + blockIdx.x * blockDim.x;
	const PxU32 warpIndexInBlock = threadIdx.x/WARP_SIZE;
	//This identifies which thread within a warp a specific thread is
	const PxU32 threadIndexInWarp = threadIdx.x&(WARP_SIZE-1);

	// Scan the per-block histograms into sRadixCountBetweenBlocks / sRadixSumBetweenBlocks
	// (helper defined earlier in this file).
	scanRadixes<WARP_PERBLOCK_SIZE>(warpIndexInBlock, threadIndexInWarp, gRadixCount, sRadixCountBetweenBlocks, sRadixSumBetweenBlocks);

	__syncthreads();

	//accumulate total numbers of each radix in each warp inside the same block
	unsigned mask_idx = __ballot_sync(FULL_MASK, idx < RADIX_SIZE);
	if(idx < RADIX_SIZE)
	{
		// Scan the RADIX_SIZE totals and add this block's cross-block entry so that
		// sRadixSumBetweenBlocks[r] becomes the global write position of this
		// block's first key carrying digit r.
		const PxU32 value = sRadixSumBetweenBlocks[idx];
		const PxU32 output = warpScanAdd<RADIX_SIZE>(mask_idx, idx, threadIndexInWarp, sRadixSumBetweenBlocks, value, value);
		sRadixSumBetweenBlocks[idx] = output + sRadixCountBetweenBlocks[idx * nbBlocks + blockIdx.x];
	}
	__syncthreads();

	// Same uint4 tiling as the counting phase: this block walks its contiguous
	// range of numIterationPerBlock tiles.
	const PxU32 totalBlockRequired = (numKeys + (blockDim.x-1))/ blockDim.x;
	const PxU32 numIterationPerBlock = (totalBlockRequired + (nbBlocks-1))/ nbBlocks;

	for(PxU32 i=0; i<numIterationPerBlock; ++i)
	{
		const PxU32 inputKeyIndex = i*WARP_SIZE*WARP_PERBLOCK_SIZE + idx + numIterationPerBlock * blockIdx.x * blockDim.x;

		uint4 keyValue;
		uint4 radix;
		uint4 keyIndex;
		uint4 radixOffset;	// per-key local rank inside the tile, built up below
		radixOffset.x = radixOffset.y = radixOffset.z = radixOffset.w = 0;

		//read 4 elements at a time
		if(inputKeyIndex < numKeys)
		{
			keyIndex = gInputRanks[inputKeyIndex];
			keyValue = gInputKeys[inputKeyIndex];
			sanitizeKeys(keyValue, inputKeyIndex * 4, gNumOfKeys);	// pads the tail of a partial uint4
			radix = getRadix(keyValue, startBit);
		}
		else
		{
			//pad the extra radix with a sufficiently large number (we run 8 passes of 4 bits each, so 0xff is large enough)
			radix.x = radix.y = radix.z = radix.w = 0xff;
		}

		//#pragma unroll
		// Process the 16 possible digits four at a time: 'accum' packs four 8-bit
		// counters, one per digit in [i, i+4).
		// NOTE(review): this inner 'i' shadows the tile loop's 'i'.
		for(PxU32 i=0; i<RADIX_SIZE; i+=4)
		{
			// (radix - i) << 3 selects the byte lane; digits outside [i, i+4) produce
			// shift amounts >= 32, which contribute nothing on the GPU.
			PxU32 accum = (1u << ((radix.x - i) << 3));
			accum += (1u << ((radix.y - i) << 3));
			accum += (1u << ((radix.z - i) << 3));
			accum += (1u << ((radix.w - i) << 3));

			// Byte-wise scan across the warp; lane 31's value is used as the warp's digit totals.
			PxU32 val = warpScanAdd<WARP_SIZE, PxU32>(FULL_MASK, idx, threadIndexInWarp, sBuckets, accum, 0);
			const PxU32 val2 = __shfl_sync(FULL_MASK, (int)val, (WARP_SIZE - 1)); //getLastElementValueInAWarp(val, idx, sBuckets, WARP_SIZE);

			// Unpack the four per-warp digit totals into sRadixSum (digit-major, warp-minor layout).
			if(threadIndexInWarp < 4)
			{
				sRadixSum[(i+threadIndexInWarp)*WARP_PERBLOCK_SIZE + idx/WARP_SIZE ] = (val2 >> (8*threadIndexInWarp)) & 0xFF;
			}

			val -= accum;	// make this lane's scan value exclusive

			//radix offset inside a warp: peel each of the 4 keys off 'val',
			//bumping the matching byte counter after every key.
			PxU32 shiftBits = (radix.x - i) << 3;
			PxU32 offset = ((val >> shiftBits) & 0xFF);
			radixOffset.x |= offset;
			val += (1<<shiftBits);
			shiftBits = (radix.y - i) << 3;
			offset = ((val >> shiftBits) & 0xFF);
			radixOffset.y |= offset;
			val += (1<<shiftBits);
			shiftBits = (radix.z - i) << 3;
			offset = ((val >> shiftBits) & 0xFF);
			radixOffset.z |= offset;
			val += (1<<shiftBits);
			shiftBits = (radix.w - i) << 3;
			offset = ((val >> shiftBits) & 0xFF);
			radixOffset.w |= offset;
		}
		__syncthreads();

		// Save the last warp's counts before sRadixSum is scanned in place below.
		PxU32 lastRadixSum = 0;
		if(idx < RADIX_SIZE)
		{
			lastRadixSum = sRadixSum[idx*WARP_PERBLOCK_SIZE+(WARP_PERBLOCK_SIZE-1)];
		}
		__syncthreads();

		//scan sRadixSum for a block: scan each digit's per-warp counts
		if(warpIndexInBlock < (WARP_PERBLOCK_SIZE/2))
		{
			const PxU32 tempVal = sRadixSum[idx];
			sRadixSum[idx] = scanRadixWarps<WARP_PERBLOCK_SIZE>(threadIndexInWarp, sRadixSum, tempVal, tempVal);
		}
		__syncthreads();

		unsigned mask_idx = __ballot_sync(FULL_MASK, idx < RADIX_SIZE);	// NOTE(review): shadows the outer mask_idx
		if(idx < RADIX_SIZE)
		{
			// Per-digit total for this tile = scanned last-warp entry + the saved last-warp count.
			const PxU32 value = sRadixSum[idx*WARP_PERBLOCK_SIZE+(WARP_PERBLOCK_SIZE-1)] + lastRadixSum;
			sRadixCount[idx] = value;
			sRadixSumSum[idx] = value;
			__syncwarp(mask_idx);
			// Scan the per-digit totals -> start of each digit's run within the tile.
			warpScanAddWriteToSharedMem<RADIX_SIZE>(mask_idx, idx, threadIndexInWarp, sRadixSumSum, value, value);
		}
		__syncthreads();

		// Fold the per-tile digit starts into the per-warp offsets.
		if(idx < (WARP_PERBLOCK_SIZE * RADIX_SIZE))
		{
			sRadixSum[idx] += sRadixSumSum[idx/WARP_PERBLOCK_SIZE];
		}
		__syncthreads();

		// Global scatter base per digit = global digit cursor - local digit start.
		if(idx < RADIX_SIZE)
			sRadixSumSum[idx] = sRadixSumBetweenBlocks[idx] - sRadixSumSum[idx];

		if(inputKeyIndex < numKeys)
		{
			//radix offset between warps inside a block
			radixOffset.x += sRadixSum[(WARP_PERBLOCK_SIZE * radix.x) + warpIndexInBlock];
			radixOffset.y += sRadixSum[(WARP_PERBLOCK_SIZE * radix.y) + warpIndexInBlock];
			radixOffset.z += sRadixSum[(WARP_PERBLOCK_SIZE * radix.z) + warpIndexInBlock];
			radixOffset.w += sRadixSum[(WARP_PERBLOCK_SIZE * radix.w) + warpIndexInBlock];
			// Stage the four keys/ranks at their digit-sorted positions within the tile.
			sKeys[radixOffset.x] = keyValue.x;
			sKeys[radixOffset.y] = keyValue.y;
			sKeys[radixOffset.z] = keyValue.z;
			sKeys[radixOffset.w] = keyValue.w;
			sRanks[radixOffset.x] = keyIndex.x;
			sRanks[radixOffset.y] = keyIndex.y;
			sRanks[radixOffset.z] = keyIndex.z;
			sRanks[radixOffset.w] = keyIndex.w;
		}
		__syncthreads();

		// Flush the staged, locally sorted tile to global memory.
		const PxU32 baseInputKeyIndex = inputKeyIndex-idx;	// first uint4 index of this tile
		if(baseInputKeyIndex < numKeys)
		{
			//If there were keys to process... The if statement defends against the PxU32 becoming huge and us overflowing the arrays
			//const PxU32 keysToProcess = min(WARP_SIZE*WARP_PERBLOCK_SIZE*4, (numKeys - baseInputKeyIndex)*4);
			const PxU32 keysToProcess = min(WARP_SIZE*WARP_PERBLOCK_SIZE * 4, (gNumOfKeys - baseInputKeyIndex*4));
			for(PxU32 a = idx; a < keysToProcess; a += blockDim.x)
			{
				const PxU32 key = sKeys[a];
				const PxU32 radix = getRadix(key, startBit);	// NOTE(review): shadows the uint4 'radix'
				const PxU32 writeIndex = a + sRadixSumSum[radix];
				gOutputKeys[writeIndex] = key;
				gOutputRanks[writeIndex] = sRanks[a];
			}
		}
		__syncthreads();

		// Advance each digit's global cursor past the keys this tile consumed.
		if(idx < RADIX_SIZE)
		{
			sRadixSumBetweenBlocks[idx]+=sRadixCount[idx];
		}
	}
}
// Single-warp radix sort pass (keys + ranks): counts occurrences of the 16
// possible 4-bit digits at 'startBit', builds an exclusive per-digit runsum in
// 'radixExclusiveRunsum', then scatters keys and ranks to the output buffers in
// digit order. Uses only warp-synchronous primitives (no __syncthreads), so it
// assumes all participating threads belong to one warp (threadIdx.x in [0, 32)
// — TODO confirm against the launch configuration).
static __device__ void radixSortPassSingleWarp(const uint4* PX_RESTRICT gInputKeys, const uint4* PX_RESTRICT gInputRanks, const PxU32 gNumOfKeys, const PxU32 numUint4,
	PxU32* PX_RESTRICT gOutputKeys, PxU32* PX_RESTRICT gOutputRanks, const PxU32 startBit, PxU32* radixExclusiveRunsum)
{
	// Per-lane digit histogram; each PxU32 packs two 16-bit counters
	// (see the 0xFFFF / >>16 unpack below).
	RadixAccum radixAccum;
#pragma unroll
	for (PxU32 i = 0; i< RADIX_ACCUM_SIZE; ++i)
	{
		radixAccum.radixAccum[i] = 0;
	}
	// Count this warp's digit occurrences over the whole input (helper defined earlier).
	radixSortWarp(gInputKeys, threadIdx.x, numUint4, WARP_SIZE, startBit, 0, gNumOfKeys, radixAccum);

	// Reduce the per-lane histograms and lay down the exclusive runsum:
	// radixExclusiveRunsum[r] = number of keys with digit < r.
	PxU32 accumValue = 0;
#pragma unroll
	for (PxU32 i = 0; i<RADIX_SIZE; i += 2)
	{
		const PxU32 accum = radixAccum.radixAccum[i / 2];
		const PxU32 val = warpReduction<AddOpPxU32, PxU32>(FULL_MASK, accum);
		const PxU32 v0 = val & 0xFFFF;	// total for digit i
		const PxU32 v1 = val >> 16;		// total for digit i+1
		radixExclusiveRunsum[i] = accumValue;
		accumValue += v0;
		radixExclusiveRunsum[i+1] = accumValue;
		accumValue += v1;
	}

	//Now we loop and output the elements in order from the input buffer to the output buffer...
	__syncwarp();
	for (PxU32 i = 0; i < numUint4; i += WARP_SIZE)
	{
		const PxU32 inputKeyIndex = i + threadIdx.x;
		//All threads enter this stage because we need to do some warp synchronous stuff...
		uint4 keyValue;
		uint4 radix;
		uint4 keyIndex;
		//read 4 elements at a time
		if (inputKeyIndex < numUint4)
		{
			keyIndex = gInputRanks[inputKeyIndex];
			keyValue = gInputKeys[inputKeyIndex];
			sanitizeKeys(keyValue, inputKeyIndex * 4, gNumOfKeys);	// pads the tail of a partial uint4
			radix = getRadix(keyValue, startBit);
		}
		else
		{
			//pad the extra radix with a sufficiently large number (we run 8 passes of 4 bits each, so 0xff is large enough)
			radix.x = radix.y = radix.z = radix.w = 0xff;
		}
		//#pragma unroll
		// Handle the 16 digits four at a time; 'accum' packs four 8-bit counters,
		// one per digit in [i, i+4). NOTE(review): this inner 'i' shadows the
		// chunk loop's 'i'; the runsum update at the bottom deliberately uses the
		// inner one, matching the byte lanes packed in val2.
		for (PxU32 i = 0; i<RADIX_SIZE; i += 4)
		{
			PxU32 radixRankX = (radix.x - i);	// byte lane of key x, or >= 4 if its digit isn't in this group
			PxU32 radixRankY = (radix.y - i);
			PxU32 radixRankZ = (radix.z - i);
			PxU32 radixRankW = (radix.w - i);
			// Out-of-group digits produce shift amounts >= 32 and contribute nothing on the GPU.
			PxU32 accum0 = (1u << (radixRankX << 3));
			PxU32 accum1 = (1u << (radixRankY << 3));
			PxU32 accum2 = (1u << (radixRankZ << 3));
			PxU32 accum3 = (1u << (radixRankW << 3));
			PxU32 accum = accum0 + accum1 + accum2 + accum3;
			// Byte-wise scan across the warp; lane 31's value carries the group totals.
			PxU32 val = warpScan<AddOpPxU32, PxU32>(FULL_MASK, accum);
			const PxU32 val2 = __shfl_sync(FULL_MASK, val, (WARP_SIZE - 1));
			//Take off how many I have so I have my local offset...
			val -= accum;
			if (accum)
			{
				//We have something in this radix range...output it!
				if (radixRankX < 4)
				{
					// Final position = keys with smaller digits + my offset within this digit.
					PxU32 outputIndex = radixExclusiveRunsum[radix.x] + ((val >> (radixRankX<<3u))&0xFF);
					val += (1 << (radixRankX << 3u));	// account for key x before placing y/z/w
					gOutputKeys[outputIndex] = keyValue.x;
					gOutputRanks[outputIndex] = keyIndex.x;
				}
				if (radixRankY < 4)
				{
					PxU32 outputIndex = radixExclusiveRunsum[radix.y] + ((val >> (radixRankY << 3u)) & 0xFF);
					val += (1 << (radixRankY << 3u));
					gOutputKeys[outputIndex] = keyValue.y;
					gOutputRanks[outputIndex] = keyIndex.y;
				}
				if (radixRankZ < 4)
				{
					PxU32 outputIndex = radixExclusiveRunsum[radix.z] + ((val >> (radixRankZ << 3u)) & 0xFF);
					val += (1 << (radixRankZ << 3u));
					gOutputKeys[outputIndex] = keyValue.z;
					gOutputRanks[outputIndex] = keyIndex.z;
				}
				if (radixRankW < 4)
				{
					PxU32 outputIndex = radixExclusiveRunsum[radix.w] + ((val >> (radixRankW << 3u)) & 0xFF);
					val += (1 << (radixRankW << 3u));
					gOutputKeys[outputIndex] = keyValue.w;
					gOutputRanks[outputIndex] = keyIndex.w;
				}
			}
			__syncwarp();
			// Lane 0 advances the runsums of the four digits handled in this group
			// by the warp totals packed in val2.
			if (threadIdx.x == 0)
			{
				radixExclusiveRunsum[i] += (val2 & 0xFF);
				radixExclusiveRunsum[i + 1] += ((val2 >> 8) & 0xFF);
				radixExclusiveRunsum[i + 2] += ((val2 >> 16) & 0xFF);
				radixExclusiveRunsum[i + 3] += ((val2 >> 24) & 0xFF);
			}
			__syncwarp();
		}
	}
}
// Keys-only variant of radixSortPassSingleWarp (no rank array is carried along);
// otherwise identical: count the 16 digit occurrences for the 4-bit digit at
// 'startBit', build an exclusive per-digit runsum, then scatter the keys to
// gOutputKeys in digit order. Warp-synchronous primitives only.
static __device__ void radixSortPassSingleWarpKeysOnly(const uint4* PX_RESTRICT gInputKeys, const PxU32 gNumOfKeys, const PxU32 numUint4,
	PxU32* PX_RESTRICT gOutputKeys, const PxU32 startBit, PxU32* radixExclusiveRunsum)
{
	// Per-lane digit histogram; each PxU32 packs two 16-bit counters.
	RadixAccum radixAccum;
#pragma unroll
	for (PxU32 i = 0; i < RADIX_ACCUM_SIZE; ++i)
	{
		radixAccum.radixAccum[i] = 0;
	}
	// Count this warp's digit occurrences over the whole input (helper defined earlier).
	radixSortWarp(gInputKeys, threadIdx.x, numUint4, WARP_SIZE, startBit, 0, gNumOfKeys, radixAccum);

	// radixExclusiveRunsum[r] = number of keys with digit < r.
	PxU32 accumValue = 0;
#pragma unroll
	for (PxU32 i = 0; i < RADIX_SIZE; i += 2)
	{
		const PxU32 accum = radixAccum.radixAccum[i / 2];
		const PxU32 val = warpReduction<AddOpPxU32, PxU32>(FULL_MASK, accum);
		const PxU32 v0 = val & 0xFFFF;	// total for digit i
		const PxU32 v1 = val >> 16;		// total for digit i+1
		radixExclusiveRunsum[i] = accumValue;
		accumValue += v0;
		radixExclusiveRunsum[i + 1] = accumValue;
		accumValue += v1;
	}

	//Now we loop and output the elements in order from the input buffer to the output buffer...
	__syncwarp();
	for (PxU32 i = 0; i < numUint4; i += WARP_SIZE)
	{
		const PxU32 inputKeyIndex = i + threadIdx.x;
		//All threads enter this stage because we need to do some warp synchronous stuff...
		uint4 keyValue;
		uint4 radix;
		//read 4 elements at a time
		if (inputKeyIndex < numUint4)
		{
			keyValue = gInputKeys[inputKeyIndex];
			sanitizeKeys(keyValue, inputKeyIndex * 4, gNumOfKeys);	// pads the tail of a partial uint4
			radix = getRadix(keyValue, startBit);
		}
		else
		{
			//pad the extra radix with a sufficiently large number (we run 8 passes of 4 bits each, so 0xff is large enough)
			radix.x = radix.y = radix.z = radix.w = 0xff;
		}
		//#pragma unroll
		// Handle the 16 digits four at a time; 'accum' packs four 8-bit counters,
		// one per digit in [i, i+4). NOTE(review): this inner 'i' shadows the chunk loop's 'i'.
		for (PxU32 i = 0; i < RADIX_SIZE; i += 4)
		{
			PxU32 radixRankX = (radix.x - i);	// byte lane of key x, or >= 4 if its digit isn't in this group
			PxU32 radixRankY = (radix.y - i);
			PxU32 radixRankZ = (radix.z - i);
			PxU32 radixRankW = (radix.w - i);
			// Out-of-group digits produce shift amounts >= 32 and contribute nothing on the GPU.
			PxU32 accum0 = (1u << (radixRankX << 3));
			PxU32 accum1 = (1u << (radixRankY << 3));
			PxU32 accum2 = (1u << (radixRankZ << 3));
			PxU32 accum3 = (1u << (radixRankW << 3));
			PxU32 accum = accum0 + accum1 + accum2 + accum3;
			// Byte-wise scan across the warp; lane 31's value carries the group totals.
			PxU32 val = warpScan<AddOpPxU32, PxU32>(FULL_MASK, accum);
			const PxU32 val2 = __shfl_sync(FULL_MASK, val, (WARP_SIZE - 1));
			//Take off how many I have so I have my local offset...
			val -= accum;
			if (accum)
			{
				//We have something in this radix range...output it!
				if (radixRankX < 4)
				{
					// Final position = keys with smaller digits + my offset within this digit.
					PxU32 outputIndex = radixExclusiveRunsum[radix.x] + ((val >> (radixRankX << 3u)) & 0xFF);
					val += (1 << (radixRankX << 3u));	// account for key x before placing y/z/w
					gOutputKeys[outputIndex] = keyValue.x;
				}
				if (radixRankY < 4)
				{
					PxU32 outputIndex = radixExclusiveRunsum[radix.y] + ((val >> (radixRankY << 3u)) & 0xFF);
					val += (1 << (radixRankY << 3u));
					gOutputKeys[outputIndex] = keyValue.y;
				}
				if (radixRankZ < 4)
				{
					PxU32 outputIndex = radixExclusiveRunsum[radix.z] + ((val >> (radixRankZ << 3u)) & 0xFF);
					val += (1 << (radixRankZ << 3u));
					gOutputKeys[outputIndex] = keyValue.z;
				}
				if (radixRankW < 4)
				{
					PxU32 outputIndex = radixExclusiveRunsum[radix.w] + ((val >> (radixRankW << 3u)) & 0xFF);
					val += (1 << (radixRankW << 3u));
					gOutputKeys[outputIndex] = keyValue.w;
				}
			}
			__syncwarp();
			// Lane 0 advances the runsums of the four digits handled in this group.
			if (threadIdx.x == 0)
			{
				radixExclusiveRunsum[i] += (val2 & 0xFF);
				radixExclusiveRunsum[i + 1] += ((val2 >> 8) & 0xFF);
				radixExclusiveRunsum[i + 2] += ((val2 >> 16) & 0xFF);
				radixExclusiveRunsum[i + 3] += ((val2 >> 24) & 0xFF);
			}
			__syncwarp();
		}
	}
}
// Full single-warp radix sort of keys + ranks: runs nbPasses 4-bit passes,
// ping-ponging between the caller's buffers and the temporary buffers.
// After an odd number of passes the sorted data lives in the temp buffers,
// after an even number in the input buffers (the caller is expected to know
// the parity — the function does not copy back).
template <PxU32 nbWarps>
static __device__ void radixSortSingleWarp(uint4* PX_RESTRICT gInputKeys, uint4* PX_RESTRICT gInputRanks, const PxU32 gNumOfKeys, const PxU32 numUint4,
	uint4* PX_RESTRICT gTempKeys, uint4* PX_RESTRICT gTempRanks, const PxU32 nbPasses)
{
	// Per-warp scratch holding the exclusive per-radix prefix sums of one pass.
	__shared__ PxU32 radixExclusiveRunsum[nbWarps][RADIX_SIZE];

	// Index-toggled ping-pong instead of explicit pointer swaps.
	uint4* keyBufs[2] = { gInputKeys, gTempKeys };
	uint4* rankBufs[2] = { gInputRanks, gTempRanks };
	PxU32 src = 0;
	for (PxU32 pass = 0; pass < nbPasses; ++pass)
	{
		const PxU32 dst = src ^ 1;
		radixSortPassSingleWarp(keyBufs[src], rankBufs[src], gNumOfKeys, numUint4,
			reinterpret_cast<PxU32*>(keyBufs[dst]), reinterpret_cast<PxU32*>(rankBufs[dst]),
			pass * 4, radixExclusiveRunsum[threadIdx.y]);
		src = dst;
	}
}
// Full single-warp radix sort, keys only: runs nbPasses 4-bit passes,
// ping-ponging between the caller's buffer and the temp buffer. After an odd
// number of passes the sorted keys live in gTempKeys, after an even number in
// gInputKeys (no copy-back is performed).
template <PxU32 nbWarps>
static __device__ void radixSortSingleWarpKeysOnly(uint4* PX_RESTRICT gInputKeys, const PxU32 gNumOfKeys, const PxU32 numUint4,
	uint4* PX_RESTRICT gTempKeys, const PxU32 nbPasses)
{
	// Per-warp scratch for the exclusive per-radix prefix sums of a single pass.
	__shared__ PxU32 radixExclusiveRunsum[nbWarps][RADIX_SIZE];

	uint4* bufs[2] = { gInputKeys, gTempKeys };
	PxU32 src = 0;
	for (PxU32 pass = 0; pass < nbPasses; ++pass)
	{
		const PxU32 dst = src ^ 1;
		radixSortPassSingleWarpKeysOnly(bufs[src], gNumOfKeys, numUint4,
			reinterpret_cast<PxU32*>(bufs[dst]), pass * 4, radixExclusiveRunsum[threadIdx.y]);
		src = dst;
	}
}
/* bitonic sorting network for 32 inputs */
/* sorts in-place without extra storage */
// Sorts the 32 lanes of a warp by 'val' (carrying 'key' alongside) using a
// bitonic network of shuffle exchanges. Only runs when the whole warp
// participates (mask == UINT_MAX); otherwise key/val are left untouched.
PX_FORCE_INLINE __device__ void bitonicSortWarp(const PxU32 mask, PxU32& key, PxU32& val)
{
	const PxU32 laneId = threadIdx.x & 0x1f;
	/* only if the complete warp participates */
	if (mask == UINT_MAX)
	{
		// FIX: the partner value must stay a PxU32. The previous code declared
		// sVal as PxReal, so the shuffled integer was converted to float and
		// written back through the ternary — losing precision (and corrupting
		// the sort order) for values >= 2^24.
		PxU32 sKey; PxU32 sVal; bool swap;
		for (int k = 2; k <= 32; k <<=1)
		{
			for (PxU32 stride = k / 2; stride > 0; stride >>= 1)
			{
				// Exchange key/val with the lane 'stride' positions away.
				sKey = __shfl_xor_sync(mask, key, stride);
				sVal = __shfl_xor_sync(mask, val, stride);
				// Keep or swap depending on ascending/descending position in the network.
				swap = (((laneId & stride) != 0 ? val > sVal : val < sVal))^((laneId&k) == 0);
				key = swap ? sKey : key, val = swap ? sVal : val;
			}
		}
	}
}
#endif // !PXG_RADIX_SORT_CUH

View File

@@ -0,0 +1,105 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __CU_SPARSE_REMOVE_CUH__
#define __CU_SPARSE_REMOVE_CUH__
#include "reduction.cuh"
/**
This function initializes a keep-drop buffer. For an array of size N from which K elements will be removed, it initializes the first (N-K) elements to 0 and the remaining K elements to 1.
*/
static __device__ void initializeKeepDropBuffer(PxU32* PX_RESTRICT globalRunSumBuffer, PxU32 totalCount, PxU32 nbToRemove)
{
	// Slots that survive the removal get a 0; the tail slots being dropped get a 1.
	const PxU32 keepCount = totalCount - nbToRemove;
	const PxU32 threadStride = blockDim.x * blockDim.y * gridDim.x;
	PxU32 slot = threadIdx.x + WARP_SIZE * threadIdx.y + blockIdx.x * blockDim.x * blockDim.y;
	while (slot < totalCount)
	{
		if (slot < keepCount)
			globalRunSumBuffer[slot] = 0;
		else
			globalRunSumBuffer[slot] = 1;
		slot += threadStride;
	}
}
/**
This function marks a keep-drop buffer based on an array of indices to remove. For an array of length N with K elements being removed, it writes a 1 to any element in the first (N-K)
entries that is being removed, and a 0 to any element in the last K entries that is being removed. This assumes that "initializeKeepDropBuffer" was performed on the array first.
*/
static __device__ void markKeepDropBuff(const PxU32* PX_RESTRICT removeIndex, const PxU32 nbToRemove, PxU32* globalRunSumBuffer, const PxU32 totalCount)
{
	// Flip the flag of every removed element: removed entries inside the kept
	// range become 1, removed entries inside the dropped tail become 0.
	const PxU32 keepCount = totalCount - nbToRemove;
	const PxU32 threadStride = blockDim.x * blockDim.y * gridDim.x;
	for (PxU32 slot = threadIdx.x + WARP_SIZE * threadIdx.y + blockIdx.x * blockDim.x * blockDim.y; slot < nbToRemove; slot += threadStride)
	{
		const PxU32 target = removeIndex[slot];
		globalRunSumBuffer[target] = (target < keepCount) ? 1u : 0u;
	}
}
template<PxU32 blockSize, PxU32 gridSize>
static __device__ void processKeepDropBuff(PxU32* PX_RESTRICT globalRunSumBuffer, const PxU32 totalCount, PxU32* crossBlockTotalAccumulator)
{
	// First half of the two-kernel scan: in-place prefix sum over the keep/drop flags.
	ReadArrayFunctor<PxU32> reader(globalRunSumBuffer);
	WriteArrayFunctor<PxU32> writer(globalRunSumBuffer);
	scanKernel1of2<blockSize, gridSize, AddOpPxU32, PxU32, ReadArrayFunctor<PxU32>, WriteArrayFunctor<PxU32> >(
		reader, writer, totalCount, crossBlockTotalAccumulator);
}
template<PxU32 gridSize>
static __device__ void accumulateKeepDrop(PxU32* PX_RESTRICT globalRunSumBuffer, const PxU32 totalCount, PxU32* crossBlockTotalAccumulator)
{
	// Second half of the two-kernel scan: fold the cross-block totals back in,
	// in place; the final total itself is discarded (NOP value writer).
	ReadArrayFunctor<PxU32> reader(globalRunSumBuffer);
	WriteArrayFunctor<PxU32> arrayWriter(globalRunSumBuffer);
	WriteValueNOPFunctor<PxU32> discardTotal;
	scanKernel2of2<gridSize, AddOpPxU32, PxU32, ReadArrayFunctor<PxU32>, WriteArrayFunctor<PxU32>, WriteValueNOPFunctor<PxU32> >(
		reader, arrayWriter, discardTotal, totalCount, crossBlockTotalAccumulator);
}
static __device__ PxU32 getNbSwapsRequired(const PxU32* PX_RESTRICT globalRunSumBuffer, const PxU32 originalCount, const PxU32 nbToRemove)
{
	// The runsum value at the first dropped slot counts the kept-range entries
	// flagged for removal — i.e. how many elements must be swapped in.
	return globalRunSumBuffer[originalCount - nbToRemove];
}
static __device__ void getSwapIndices(const PxU32* PX_RESTRICT globalRunSumBuffer, const PxU32 totalSize, const PxU32 indexToFind, const PxU32 totalSwapsRequired,
	PxU32& destIndex, PxU32& srcIndex)
{
	// Locate the indexToFind-th hole (destination) and its matching survivor in
	// the dropped tail (source) via binary search over the runsum.
	const PxU32 srcTarget = indexToFind + totalSwapsRequired;
	destIndex = binarySearch(globalRunSumBuffer, totalSize, indexToFind);
	srcIndex = binarySearch(globalRunSumBuffer, totalSize, srcTarget);
}
#endif

View File

@@ -0,0 +1,160 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __CU_ATOMIC_CUH__
#define __CU_ATOMIC_CUH__
#include "cuda.h"
#include "foundation/PxVec3.h"
#include "foundation/PxSimpleTypes.h"
#include "PxgIntrinsics.h"
#include "PxgArticulation.h"
// Atomically accumulate all four components of 'src' into 'dst'
// (four independent per-component atomics, not one atomic float4 update).
static __device__ inline void AtomicAdd(float4& dst, const float4 src)
{
	atomicAdd(&dst.x, src.x);
	atomicAdd(&dst.y, src.y);
	atomicAdd(&dst.z, src.z);
	atomicAdd(&dst.w, src.w);
}
// Atomically accumulate a vector into the xyz part of 'dst' and a scalar into its w.
static __device__ inline void AtomicAdd(float4& dst, const physx::PxVec3 xyz, const physx::PxReal w)
{
	atomicAdd(&dst.x, xyz.x);
	atomicAdd(&dst.y, xyz.y);
	atomicAdd(&dst.z, xyz.z);
	atomicAdd(&dst.w, w);
}
// Atomically accumulate a vector into the xyz part of 'dst'; dst.w is untouched.
static __device__ inline void AtomicAdd(float4& dst, const physx::PxVec3 xyz)
{
	atomicAdd(&dst.x, xyz.x);
	atomicAdd(&dst.y, xyz.y);
	atomicAdd(&dst.z, xyz.z);
}
// Indexed scalar accumulate: arr[slot] += delta, atomically.
__device__ inline void AtomicAdd(float* arr, physx::PxU32 slot, const physx::PxReal delta)
{
	atomicAdd(arr + slot, delta);
}
// Indexed accumulate of a vector + scalar into arr[slot] (xyz += v, w += w).
__device__ inline void AtomicAdd(float4* arr, physx::PxU32 slot, const physx::PxVec3& v, physx::PxReal w)
{
	float4& e = arr[slot];
	atomicAdd(&e.x, v.x);
	atomicAdd(&e.y, v.y);
	atomicAdd(&e.z, v.z);
	atomicAdd(&e.w, w);
}
// Indexed accumulate of a full PxVec4 into arr[slot].
__device__ inline void AtomicAdd(float4* arr, physx::PxU32 slot, const physx::PxVec4& v)
{
	float4& e = arr[slot];
	atomicAdd(&e.x, v.x);
	atomicAdd(&e.y, v.y);
	atomicAdd(&e.z, v.z);
	atomicAdd(&e.w, v.w);
}
// Indexed accumulate of a PxVec3 into the xyz of arr[slot]; w is untouched.
__device__ inline void AtomicAdd(float4* arr, physx::PxU32 slot, const physx::PxVec3& v)
{
	float4& e = arr[slot];
	atomicAdd(&e.x, v.x);
	atomicAdd(&e.y, v.y);
	atomicAdd(&e.z, v.z);
}
// Accumulate only the xyz components of 'v' into arr[slot]; v.w and the
// destination's w component are deliberately ignored.
__device__ inline void AtomicAdd3(float4* arr, physx::PxU32 slot, const float4& v)
{
	float4& e = arr[slot];
	atomicAdd(&e.x, v.x);
	atomicAdd(&e.y, v.y);
	atomicAdd(&e.z, v.z);
}
// Component-wise atomic accumulate of one PxVec3 into another.
__device__ inline void AtomicAdd3(physx::PxVec3& dst, const physx::PxVec3& v)
{
	atomicAdd(&dst.x, v.x);
	atomicAdd(&dst.y, v.y);
	atomicAdd(&dst.z, v.z);
}
// Emulated atomic min for float: the CAS operates on the raw 32-bit word, but
// the comparison is done in float space. Loops until either the stored value is
// already <= val or our CAS lands. Returns the last value observed at 'address'
// (the pre-update value when we won the CAS).
// NOTE(review): if 'val' is NaN the comparison is always false, so NaN is never
// stored; -0.0f/+0.0f compare equal and are not exchanged.
__device__ inline float AtomicMin(float* address, float val)
{
	int *address_as_int = (int*)address;
	int old = *address_as_int, assumed;
	while (val < __int_as_float(old))
	{
		assumed = old;
		// If another thread changed the word in the meantime, 'old' picks up the
		// new value and the loop re-tests the comparison.
		old = atomicCAS(address_as_int, assumed,
			__float_as_int(val));
	}
	return __int_as_float(old);
}
// Emulated atomic max for float — mirror image of AtomicMin above: CAS on the
// raw 32-bit word, comparison in float space. Returns the last observed value
// at 'address'. A NaN 'val' never compares greater and is never stored.
inline __device__ float AtomicMax(float* address, float val)
{
	int *address_as_int = (int*)address;
	int old = *address_as_int, assumed;
	while (val > __int_as_float(old))
	{
		assumed = old;
		old = atomicCAS(address_as_int, assumed,
			__float_as_int(val));
	}
	return __int_as_float(old);
}
//Some compiler was complaining about not supporting atomicOr on 64bit integers
// Emulates a 64-bit atomic OR with two independent 32-bit ORs.
// NOTE(review): each half is atomic on its own, but the 64-bit update is not a
// single atomic operation — a concurrent reader may observe one half updated
// before the other. Callers must tolerate that (OR-only accumulation typically does).
PX_FORCE_INLINE static __device__ void AtomicOr(physx::PxU64* address, const physx::PxU64 mask)
{
	physx::PxU32* address32 = reinterpret_cast<physx::PxU32*>(address);
	const physx::PxU32* maskPtr = reinterpret_cast<const physx::PxU32*>(&mask);
	atomicOr(address32, maskPtr[0]);
	atomicOr(address32 + 1, maskPtr[1]);
}
/* use inline assembly with .global qualifier to perform the operation at the L2 cache
 * adds 20% performance in FLIP P2G compared to atomicAdd() or plain red.add.f32 */
// Fire-and-forget float add to global memory. Unlike atomicAdd, the PTX 'red'
// reduction returns no value, which is what allows the cheaper code path.
PX_FORCE_INLINE __device__ void PxRedAddGlobal(float* addr, const float val)
{
#if __CUDA_ARCH__ >= 350
	asm volatile ("red.global.add.f32 [%0], %1;" :: __STG_PTR(addr) , "f"(val));
#else
#if __CUDA_ARCH__ >= 200
	// Older architectures: fall back to a regular atomic add.
	atomicAdd(addr, val);
#else
	// Host compilation pass / unsupported arch: no-op; silence unused warnings.
	PX_UNUSED(addr);
	PX_UNUSED(val);
#endif
#endif
}
#endif

View File

@@ -0,0 +1,191 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __CU_CONTACT_REDUCTION_CUH__
#define __CU_CONTACT_REDUCTION_CUH__
#include "utils.cuh"
#include "foundation/PxSimpleTypes.h"
#include "foundation/PxVec3.h"
#include "shuffle.cuh"
#include "nputils.cuh"
namespace physx
{
//See trunk/internaldocumentation/Solver/PhysX 3 constraint solver.doc
//* If the number of points in the patch is more than 6, for each patch
//* We find the most extreme point, p0.This is the point that is farthest from the origin. ALGORITHM BELOW TAKES DEEPEST CONTACT AS p0.
//* We find the point farthest from the most extreme point : p1.
//* We find point p2, which is the point farthest from the segment p0p1.
//* We find the direction from p2 to the closest point to p2 on segment p0p1. We then find point p3: the point farthest
// from p0p1 in that direction.
//* These 4 points define the anchors for our clusters.We then assign the points to their respective clusters, i.e.the
// cluster which the contact point is closest to.In a case where a point is equidistant between 2 anchors, the earlier anchor
// in the array of anchors is arbitrarily chosen.
//* We choose the deepest point in each cluster.We slightly bias the initial points p0 - p3 by considering them to have deeper
// penetrations than they actually have; these are biased by an epsilon.This avoids oscillation between points when they have roughly
// equal depth, which can cause instability in the friction model.The deepest contact point in each cluster is selected.
//* Finally, we choose 2 remaining contacts.These contacts are the 2 deepest unselected contacts.
// Warp-cooperative contact reduction: each lane holds one candidate contact
// (pointRaw, separation); 'allMask' flags the valid lanes. Returns a lane mask
// with at most TMaxPoints contacts kept. See the algorithm outline above.
//  - clusterBias: depth bias favouring already-selected anchors during clustering
//  - distanceAdjustment: additive weight applied to the distance criteria
//  - initialPointCriteria: per-lane score used to pick the first anchor p0
template<bool TClustering, bool TKeepAnotherDeepestForPCM, int TMaxPoints, bool TDoDupeTest = true>
static __device__ int contactReduceShared(const PxVec3& pointRaw, const PxReal separation, const PxVec3& normal, int allMask,
	PxReal clusterBias, PxReal distanceAdjustment, PxReal initialPointCriteria)
{
	PxReal v, w;	// out-parameters for maxIndex/minIndex (presumably the winning values; unused here)
	const PxU32 threadIndexInWarp = threadIdx.x & 31;
	if (TDoDupeTest)
	{
		// Drop near-coincident duplicates: a lane yields to any other lane at
		// (almost) the same position that is deeper, or equally deep with a
		// higher lane index (deterministic tie-break).
		bool imADupe = false;
		for (PxU32 m = allMask; m; m = clearLowestSetBit(m))
		{
			int i = lowestSetIndex(m);
			PxReal d = (shuffle(FULL_MASK, pointRaw, i) - pointRaw).magnitudeSquared();
			PxReal sep = __shfl_sync(FULL_MASK, separation, i);
			if (d < 1e-8f && (sep < separation || (sep == separation && i > threadIndexInWarp)))
			{
				imADupe = true;
			}
		}
		allMask &= ~(__ballot_sync(FULL_MASK, imADupe));
	}
	// Nothing to reduce if the surviving set already fits.
	PxU32 newCount = __popc(allMask);
	if (newCount <= TMaxPoints)
	{
		return allMask;
	}
	// Work with contact positions projected onto the plane orthogonal to the normal.
	const PxVec3 point = pointRaw - normal * pointRaw.dot(normal);
	//Distance calculation is altered by separation value to give further away contacts less weight
	int i0 = maxIndex(initialPointCriteria + distanceAdjustment, allMask, v); // p0 - most extreme contact (furthest away from origin)
	int mask = 1 << i0;
	PxReal dist = (shuffle(FULL_MASK, point, i0) - point).magnitude();
	int i1 = maxIndex(dist + distanceAdjustment, allMask&~mask, v); // p1 - furthest from p0, when projected onto normal plane
	mask |= 1 << i1;
	//Now we have the p0-p1 edge. We try to find the point furthest from it in the normal plane.
	//For that, we look for the 2 extreme points - one to the right and one to the left
	//One maximizes [(p1 - p0) x n] * (p - p0), the other one minimizes that
	//[(p1 - p0) x n] * (p - p0) = [n x (p - p0)] * (p1 - p0) = (n x p) * p1 - (n x p) * p0 - (n x p0) * p1 + (n x p0) * p0 =
	//= k1 - k0 - k1[0], as (n x p0) * p0 = 0
	PxVec3 dir = normal.cross(shuffle(FULL_MASK, point, i1) - shuffle(FULL_MASK, point, i0));
	PxReal d = dir.dot(point - shuffle(FULL_MASK, point, i0));
	int f = maxIndex(d + distanceAdjustment, allMask&~mask, v);	// p2: extreme on one side of the p0-p1 edge
	mask |= (1 << f);
	int g = minIndex(d - distanceAdjustment, allMask&~mask, w);	// p3: extreme on the other side
	//if (__shfl_sync(FULL_MASK, d, f) * __shfl_sync(FULL_MASK, d, g) > 0.f)
	//{
	//	//We need to pick again...
	//	g = maxIndex(d, allMask&~mask, v);
	//}
	mask |= (1 << g);
	//if (TKeepAnotherDeepestForPCM && __popc(mask) == 4)
	bool predicate = (TKeepAnotherDeepestForPCM && __popc(mask) == 4);
	//unsigned mask_predicate = __ballot_sync(FULL_MASK, predicate);
	if (predicate && TMaxPoints > 4)
	{
		// Keep the deepest not-yet-selected contact as a fifth anchor for the PCM.
		int i4 = minIndex(separation, allMask&~mask, v);
		mask |= (1 << i4);
	}
	// post-cull clustering for mesh collisions
	//unsigned mask_TClustering = __ballot_sync(syncMask, TClustering);
	if (TClustering)
	{
		// Bias the anchors to appear slightly deeper than they are so the
		// selection is stable frame-to-frame (avoids oscillation that would
		// destabilise the friction model).
		PxReal sep = separation;
		if (mask & (1 << threadIndexInWarp))
			sep -= clusterBias;
		int nbClusters = 0, label = -1; // label each point with its closest cluster (distance measured orthogonal to the normal)
		for (PxReal t = FLT_MAX; mask; nbClusters++, mask &= (mask - 1))
		{
			PxReal d = (point - shuffle(FULL_MASK, point, lowestSetIndex(mask))).magnitudeSquared();	// NOTE(review): shadows the outer 'd'
			if (d < t)
				t = d, label = nbClusters;
		}
		mask = 0;
		for (int i = 0; i < nbClusters; i++) // find a point in each cluster (clusters can be empty if all input points are equal)
		{
			int cluster = __ballot_sync(FULL_MASK, label == i)&allMask;
			if (cluster)
				mask |= 1 << minIndex(sep, cluster, v);	// deepest (biased) contact of the cluster
		}
		for (int i = nbClusters; i < TMaxPoints; i++) // fill out the rest of the points
			mask |= 1 << minIndex(sep, allMask&~mask, v);
	}
	else
	{
		// No clustering: top up the selection with the deepest remaining contacts.
		PxU32 count = __popc(mask);
		for (PxU32 i = count; i < TMaxPoints; ++i)
			mask |= 1 << minIndex(separation, allMask&~mask, v);
	}
	return mask;
}
// Depth-driven reduction: seeds the anchor selection on the separation itself
// and applies no extra distance re-weighting.
template<bool TClustering, bool TKeepAnotherDeepestForPCM, int TMaxPoints, bool TDoDupeTest = true>
static __device__ int contactReduce(const PxVec3& pointRaw, const PxReal separation, const PxVec3& normal, int allMask,
	PxReal clusterBias)
{
	const PxReal zeroAdjustment = 0.0f;
	return contactReduceShared<TClustering, TKeepAnotherDeepestForPCM, TMaxPoints, TDoDupeTest>(
		pointRaw, separation, normal, allMask, clusterBias, zeroAdjustment, separation);
}
// Distance-driven reduction: seeds the anchor selection on the contact's
// in-plane distance from the origin (position projected onto the plane
// orthogonal to 'normal') and weights the distance criteria by -separation.
template<bool TClustering, bool TKeepAnotherDeepestForPCM, int TMaxPoints, bool TDoDupeTest = true>
static __device__ int contactReduce2(const PxVec3& pointRaw, const PxReal separation, const PxVec3& normal, int allMask,
	PxReal clusterBias)
{
	const PxVec3 inPlane = pointRaw - normal * pointRaw.dot(normal);
	return contactReduceShared<TClustering, TKeepAnotherDeepestForPCM, TMaxPoints, TDoDupeTest>(
		pointRaw, separation, normal, allMask, clusterBias, -separation, inPlane.magnitude());
}
}
#endif

View File

@@ -0,0 +1,83 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __CU_COPY_CUH__
#define __CU_COPY_CUH__
#include "foundation/PxSimpleTypes.h"
#include "PxgCommonDefines.h"
#include "cutil_math.h"
#include <assert.h>
// Warp-cooperative element-wise copy: the lanes of the calling warp stride through the
// buffer together. totalSize is in bytes and must be a multiple of sizeof(T); both
// pointers must be aligned for T.
template<typename T>
__device__ void warpCopy(T* dest, const T* source, const uint totalSize)
{
	assert((size_t(dest) & (alignof(T)-1)) == 0);
	assert((size_t(source) & (alignof(T)-1)) == 0);
	assert(totalSize % sizeof(T) == 0);
	const uint nbElements = totalSize / sizeof(T);
	for (uint e = threadIdx.x & (WARP_SIZE - 1); e < nbElements; e += WARP_SIZE)
		dest[e] = source[e];
}
// Warp-cooperative fill: every lane writes the same value at a WARP_SIZE stride.
// totalSize is in bytes and must be a multiple of sizeof(T); dest must be aligned for T.
template<typename T>
__device__ void warpCopy(T* dest, const T& value, const uint totalSize)
{
	assert(((size_t(dest) & (alignof(T)-1)) == 0));
	assert(totalSize % sizeof(T) == 0);
	const uint nbElements = totalSize / sizeof(T);
	for (uint e = threadIdx.x & (WARP_SIZE - 1); e < nbElements; e += WARP_SIZE)
		dest[e] = value;
}
// Block-cooperative element-wise copy: all threads of the block stride through the
// buffer together (stride blockDim.x). totalSize is in bytes and must be a multiple of
// sizeof(T); both pointers must be aligned for T.
template<typename T>
__device__ void blockCopy(T* dest, const T* source, const uint totalSize)
{
	assert((size_t(dest) & (alignof(T)-1)) == 0);
	assert((size_t(source) & (alignof(T)-1)) == 0);
	assert(totalSize % sizeof(T) == 0);
	const uint nbElements = totalSize / sizeof(T);
	for (uint e = threadIdx.x; e < nbElements; e += blockDim.x)
		dest[e] = source[e];
}
#endif

View File

@@ -0,0 +1,70 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __CU_FEM_CLOTH_MIDPHASESCRATCH_CUH__
#define __CU_FEM_CLOTH_MIDPHASESCRATCH_CUH__
#include "vector_types.h"
#define FEM_MIDPHASE_SCRATCH_SIZE 224 // 192 (WARP SIZE * 6) < 198 (sizeof(femMidphaseScratch)/sizeof(unsigned int)) < 224 (WARP SIZE * 7)
namespace physx
{
namespace Gu
{
struct BV32DataDepthInfo;
struct BV32DataPacked;
};
}
// Scratch layout used by FEM midphase traversal: cached mesh pointers plus an explicit
// stack for walking the BV32 tree. The compile-time assert below pins the size to the
// WARP_SIZE * 7 unsigned-int budget referenced by FEM_MIDPHASE_SCRATCH_SIZE
// (presumably a per-warp shared-memory allocation - see the macro's comment).
struct femMidphaseScratch
{
	const float4* PX_RESTRICT meshVerts; // either tetrahedron mesh or triangle mesh
	const uint4* PX_RESTRICT meshVertsIndices; // either tetrahedron mesh or triangle mesh
	// const physx::Gu::BV32DataDepthInfo* PX_RESTRICT bv32DepthInfo;
	// const unsigned int* PX_RESTRICT bv32RemapPackedNodeIndex;
	// bv32 tree
	const physx::Gu::BV32DataPacked* bv32PackedNodes;
	// stack for traversal: 32 entries per level for 6 levels of the BV32 tree
	int sBv32Nodes[192]; // 6 depth of the bv32 tree
};
PX_COMPILE_TIME_ASSERT(sizeof(femMidphaseScratch) <= WARP_SIZE * 7 * sizeof(unsigned int));
// Extension of femMidphaseScratch used by the cloth refit path: additionally caches the
// BV32 depth info and packed-node remap table (the members that are commented out in the
// base struct). Must still fit the same WARP_SIZE * 7 unsigned-int budget, enforced below.
class femClothRefitMidphaseScratch : public femMidphaseScratch
{
public:
	const physx::Gu::BV32DataDepthInfo* PX_RESTRICT bv32DepthInfo;
	const unsigned int* PX_RESTRICT bv32RemapPackedNodeIndex;
};
PX_COMPILE_TIME_ASSERT(sizeof(femClothRefitMidphaseScratch) <= WARP_SIZE * 7 * sizeof(unsigned int));
#endif

View File

@@ -0,0 +1,142 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __CU_GRID_CAL_CUH__
#define __CU_GRID_CAL_CUH__
#include "foundation/PxSimpleTypes.h"
// Maps a world-space position to signed integer grid coordinates. floor() (rather than
// integer truncation) keeps negative positions in the correct cell. The w component of
// particlePos is ignored.
PX_FORCE_INLINE static __device__ int3 calcGridPos(const float4& particlePos, const PxReal cellWidth)
{
	int3 gridPos;
	gridPos.x = floor((particlePos.x) / cellWidth);
	gridPos.y = floor((particlePos.y) / cellWidth);
	gridPos.z = floor((particlePos.z) / cellWidth);
	return gridPos;
}
// Calculates the cell address in the grid from a position (wrap-around). Each coordinate
// is wrapped into [0, gridSize) via a bitwise AND, so every gridSize component must be a
// power of two. Linearization is row-major: x fastest, z slowest.
PX_FORCE_INLINE static __device__ physx::PxU32 calcGridHash(int3 gridPos, uint3 gridSize)
{
	const physx::PxU32 x = gridPos.x & (gridSize.x - 1);
	const physx::PxU32 y = gridPos.y & (gridSize.y - 1);
	const physx::PxU32 z = gridPos.z & (gridSize.z - 1);
	return x + gridSize.x * (y + gridSize.y * z);
}
PX_FORCE_INLINE static __device__ PxU32 calcGridHashPeriodic(int3 gridPos, int3 gridSize, int3 periodGridSize)
{
	//With periodic boundaries, particle positions were pre-wrapped into the periodic
	//range, so a neighbor-cell coordinate can be out of range by at most one period per
	//axis; a single conditional add/subtract (no modulo) brings it back in range.
	//The hash itself is computed with the original (non-periodic) grid dimensions.
	gridPos.x += (gridPos.x < 0) ? periodGridSize.x : ((gridPos.x >= periodGridSize.x) ? -periodGridSize.x : 0);
	gridPos.y += (gridPos.y < 0) ? periodGridSize.y : ((gridPos.y >= periodGridSize.y) ? -periodGridSize.y : 0);
	gridPos.z += (gridPos.z < 0) ? periodGridSize.z : ((gridPos.z >= periodGridSize.z) ? -periodGridSize.z : 0);
	return ((gridPos.z * gridSize.y) * gridSize.x) + (gridPos.y * gridSize.x) + gridPos.x;
}
/**
 * Takes a global grid range, e.g. (-4, 2, -1) to (-1, 3, 2), and computes its size
 * clamped to the wrapped grid size. The result only feeds cell-hash computation, so if
 * a dimension's range exceeds the wrapped grid size, all wrapped cells in that dimension
 * are covered anyway. The clamp to 0 guards against degenerate/numerical inputs.
 */
PX_FORCE_INLINE static __device__ uint3 calcWrappedGridRangeSize(int3 gridRangeMin, int3 gridRangeMax, uint3 gridSize)
{
	uint3 rangeSize;
	rangeSize.x = PxClamp(gridRangeMax.x - gridRangeMin.x + 1, 0, PxI32(gridSize.x));
	rangeSize.y = PxClamp(gridRangeMax.y - gridRangeMin.y + 1, 0, PxI32(gridSize.y));
	rangeSize.z = PxClamp(gridRangeMax.z - gridRangeMin.z + 1, 0, PxI32(gridSize.z));
	return rangeSize;
}
/**
 * Maps a linear cell offset to a 3D offset within a 3D grid range (x fastest, z slowest).
 */
PX_FORCE_INLINE static __device__ uint3 calcGridOffsetInRange(uint3 gridRangeSize, PxU32 offset)
{
	const PxU32 cellsPerSlice = gridRangeSize.x * gridRangeSize.y;
	const PxU32 inSlice = offset % cellsPerSlice;
	return make_uint3(inSlice % gridRangeSize.x, inSlice / gridRangeSize.x, offset / cellsPerSlice);
}
// Converts an AABB into the inclusive grid-coordinate range covering it.
PX_FORCE_INLINE static __device__ void calcGridRange(int3& gridPosMin, int3& gridPosMax, const PxBounds3& bounds, float cellWidth)
{
	const PxVec3& lo = bounds.minimum;
	const PxVec3& hi = bounds.maximum;
	gridPosMin = calcGridPos(make_float4(lo.x, lo.y, lo.z, 0.f), cellWidth);
	gridPosMax = calcGridPos(make_float4(hi.x, hi.y, hi.z, 0.f), cellWidth);
}
#define PARTICLE_FORWARD_PROJECTION_STEP_SCALE_PGS 1.0f
#define PARTICLE_FORWARD_PROJECTION_STEP_SCALE_TGS 0.5f
#define PARTICLE_FORWARD_PROJECTION_STEP_SCALE_DIFFUSE 1.0f
/**
 * Returns position and radius of the volume that needs to be tested for particle collision.
 * Reconstructs the full forward-projection step performed in pre-integration, centers the
 * volume on the midpoint of the projected motion, and returns a radius that covers the
 * whole motion range plus the contact distance.
 */
PX_FORCE_INLINE static __device__ PxReal getParticleSpeculativeContactVolume(PxVec3& cVolumePos,
	const PxVec3& currentPos, const PxVec3& predictedPos, const PxReal contactDist, const bool isDiffuse, const bool isTGS)
{
	//Reciprocals of the per-pipeline forward-projection step scales.
	static const PxReal sScaleInvPGS = (1.0f / PARTICLE_FORWARD_PROJECTION_STEP_SCALE_PGS);
	static const PxReal sScaleInvTGS = (1.0f / PARTICLE_FORWARD_PROJECTION_STEP_SCALE_TGS);
	static const PxReal sScaleInvDiffuse = (1.0f / PARTICLE_FORWARD_PROJECTION_STEP_SCALE_DIFFUSE);
	const PxReal scaleInv = isDiffuse ? sScaleInvDiffuse : (isTGS ? sScaleInvTGS : sScaleInvPGS);
	//Half of the reconstructed motion vector; its tip is the volume center.
	const PxVec3 halfMotion = (predictedPos - currentPos) * scaleInv * 0.5f;
	cVolumePos = currentPos + halfMotion;
	return halfMotion.magnitude() + contactDist;
}
#endif

View File

@@ -0,0 +1,252 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
#include "RadixSort.cuh"
#include "PxgRadixSortDesc.h"
#include "PxgRadixSortKernelIndices.h"
#include "stdio.h"
#include "PxNodeIndex.h"
extern "C" __host__ void initCommonKernels1() {} //intentionally empty host stub; presumably referenced so this kernel module gets linked/initialized - TODO confirm
//The key count is a device pointer (read via *numKeys on the GPU).
extern "C" __global__
__launch_bounds__(PxgRadixSortKernelBlockDim::RADIX_SORT, 1)
void radixSortMultiBlockLaunch(PxgRadixSortBlockDesc* desc, const PxU32 gStartBit)
{
	//blockIdx.y selects which independent sort descriptor this block works on.
	PxgRadixSortBlockDesc& rsDesc = desc[blockIdx.y];
	const uint4* PX_RESTRICT keys = reinterpret_cast<uint4*>(rsDesc.inputKeys);
	const uint4* PX_RESTRICT ranks = reinterpret_cast<uint4*>(rsDesc.inputRanks);
	radixSortSingleBlock<PxgRadixSortKernelBlockDim::RADIX_SORT / WARP_SIZE>(keys, ranks, *rsDesc.numKeys, gStartBit, rsDesc.radixBlockCounts);
}
//The key count is a device pointer (read via *numKeys on the GPU).
extern "C" __global__
__launch_bounds__(PxgRadixSortKernelBlockDim::RADIX_SORT, 1)
void radixSortMultiCalculateRanksLaunch(PxgRadixSortBlockDesc* desc, const PxU32 gStartBit)
{
	//blockIdx.y selects which independent sort descriptor this block works on.
	PxgRadixSortBlockDesc& rsDesc = desc[blockIdx.y];
	const uint4* PX_RESTRICT keys = reinterpret_cast<uint4*>(rsDesc.inputKeys);
	const uint4* PX_RESTRICT ranks = reinterpret_cast<uint4*>(rsDesc.inputRanks);
	radixSortCalculateRanks<PxgRadixSortKernelBlockDim::RADIX_SORT / WARP_SIZE>(keys, ranks, *rsDesc.numKeys, gStartBit, rsDesc.radixBlockCounts, rsDesc.outputKeys, rsDesc.outputRanks);
}
//Variant that reads the key count from the descriptor's plain 'count' member instead of
//dereferencing a device pointer.
extern "C" __global__
__launch_bounds__(PxgRadixSortKernelBlockDim::RADIX_SORT, 1)
void radixSortMultiBlockLaunchWithoutCount(PxgRadixSortDesc* desc, const PxU32 gStartBit)
{
	PxgRadixSortDesc& rsDesc = desc[blockIdx.y];
	const uint4* PX_RESTRICT keys = reinterpret_cast<uint4*>(rsDesc.inputKeys);
	const uint4* PX_RESTRICT ranks = reinterpret_cast<uint4*>(rsDesc.inputRanks);
	radixSortSingleBlock<PxgRadixSortKernelBlockDim::RADIX_SORT / WARP_SIZE>(keys, ranks, rsDesc.count, gStartBit, rsDesc.radixBlockCounts);
}
//Variant that reads the key count from the descriptor's plain 'count' member instead of
//dereferencing a device pointer.
extern "C" __global__
__launch_bounds__(PxgRadixSortKernelBlockDim::RADIX_SORT, 1)
void radixSortMultiCalculateRanksLaunchWithoutCount(PxgRadixSortDesc* desc, const PxU32 gStartBit)
{
	PxgRadixSortDesc& rsDesc = desc[blockIdx.y];
	const uint4* PX_RESTRICT keys = reinterpret_cast<uint4*>(rsDesc.inputKeys);
	const uint4* PX_RESTRICT ranks = reinterpret_cast<uint4*>(rsDesc.inputRanks);
	radixSortCalculateRanks<PxgRadixSortKernelBlockDim::RADIX_SORT / WARP_SIZE>(keys, ranks, rsDesc.count, gStartBit, rsDesc.radixBlockCounts, rsDesc.outputKeys, rsDesc.outputRanks);
}
//Variant where the key count arrives as a kernel parameter shared by all descriptors.
extern "C" __global__
__launch_bounds__(PxgRadixSortKernelBlockDim::RADIX_SORT, 1)
void radixSortMultiBlockLaunchWithCount(PxgRadixSortDesc* desc, const PxU32 numKeys, const PxU32 gStartBit)
{
	PxgRadixSortDesc& rsDesc = desc[blockIdx.y];
	const uint4* PX_RESTRICT keys = reinterpret_cast<uint4*>(rsDesc.inputKeys);
	const uint4* PX_RESTRICT ranks = reinterpret_cast<uint4*>(rsDesc.inputRanks);
	radixSortSingleBlock<PxgRadixSortKernelBlockDim::RADIX_SORT / WARP_SIZE>(keys, ranks, numKeys, gStartBit, rsDesc.radixBlockCounts);
}
//Variant where the key count arrives as a kernel parameter shared by all descriptors.
extern "C" __global__
__launch_bounds__(PxgRadixSortKernelBlockDim::RADIX_SORT, 1)
void radixSortMultiCalculateRanksLaunchWithCount(PxgRadixSortDesc* desc, const PxU32 numKeys, const PxU32 gStartBit)
{
	PxgRadixSortDesc& rsDesc = desc[blockIdx.y];
	const uint4* PX_RESTRICT keys = reinterpret_cast<uint4*>(rsDesc.inputKeys);
	const uint4* PX_RESTRICT ranks = reinterpret_cast<uint4*>(rsDesc.inputRanks);
	radixSortCalculateRanks<PxgRadixSortKernelBlockDim::RADIX_SORT / WARP_SIZE>(keys, ranks, numKeys, gStartBit, rsDesc.radixBlockCounts, rsDesc.outputKeys, rsDesc.outputRanks);
}
//Gathers the high 32 bits of 64-bit values into a 32-bit array, following the sorted
//rank table: outValue[i] = high32(inValue[rank[i]]). numKeys is a device pointer.
extern "C" __global__ void radixSortCopyHigh32Bits(const PxU64* inValue, PxU32* outValue, PxU32* rank, const PxU32* numKeys)
{
	const PxU32 numElems = *numKeys;
	//Number of grid-sized strides needed to visit every element. Fix: the divisor must
	//be parenthesized - the previous "/ blockDim.x * gridDim.x" divided by blockDim.x
	//only and then multiplied by gridDim.x, over-counting iterations (harmless only
	//because of the early-out below, but wasteful).
	const PxU32 numIterations = (numElems + blockDim.x * gridDim.x - 1) / (blockDim.x * gridDim.x);
	for (PxU32 i = 0; i < numIterations; ++i)
	{
		const PxU32 workIndex = threadIdx.x + blockIdx.x * blockDim.x + i * blockDim.x * gridDim.x;
		if (workIndex >= numElems)
			return;
		const PxU32 index = rank[workIndex];
		outValue[workIndex] = PxU32(inValue[index] >> 32);
	}
}
//Same gather as radixSortCopyHigh32Bits, applied to two independent value/rank streams
//in one launch. numKeys (device pointer) is shared by both streams.
extern "C" __global__ void radixSortDoubleCopyHigh32Bits(const PxU64 * inValue0, PxU32 * outValue0, PxU32 * rank0, const PxU64 * inValue1, PxU32 * outValue1, PxU32 * rank1, const PxU32 * numKeys)
{
	const PxU32 numElems = *numKeys;
	//Fix: parenthesized divisor; the previous form divided by blockDim.x only and then
	//multiplied by gridDim.x, over-counting iterations (masked by the early-out below).
	const PxU32 numIterations = (numElems + blockDim.x * gridDim.x - 1) / (blockDim.x * gridDim.x);
	for (PxU32 i = 0; i < numIterations; ++i)
	{
		const PxU32 workIndex = threadIdx.x + blockIdx.x * blockDim.x + i * blockDim.x * gridDim.x;
		if (workIndex >= numElems)
			return;
		const PxU32 index0 = rank0[workIndex];
		outValue0[workIndex] = PxU32(inValue0[index0] >> 32);
		const PxU32 index1 = rank1[workIndex];
		outValue1[workIndex] = PxU32(inValue1[index1] >> 32);
	}
}
//Gathers full 64-bit values following the sorted rank table:
//outValue[i] = inValue[rank[i]]. numKeys is a device pointer.
extern "C" __global__ void radixSortCopy(const PxU64* inValue, PxU64* outValue, PxU32* rank, const PxU32* numKeys)
{
	const PxU32 numElems = *numKeys;
	//Fix: parenthesized divisor; the previous form divided by blockDim.x only and then
	//multiplied by gridDim.x, over-counting iterations (masked by the early-out below).
	const PxU32 numIterations = (numElems + blockDim.x * gridDim.x - 1) / (blockDim.x * gridDim.x);
	for (PxU32 i = 0; i < numIterations; ++i)
	{
		const PxU32 workIndex = threadIdx.x + blockIdx.x * blockDim.x + i * blockDim.x * gridDim.x;
		if (workIndex >= numElems)
			return;
		const PxU32 index = rank[workIndex];
		outValue[workIndex] = inValue[index];
	}
}
//Same 64-bit gather as radixSortCopy, applied to two independent value/rank streams in
//one launch. numKeys (device pointer) is shared by both streams.
extern "C" __global__ void radixSortDoubleCopy(
	const PxU64 * inValue0, PxU64 * outValue0, PxU32 * rank0,
	const PxU64 * inValue1, PxU64 * outValue1, PxU32 * rank1,
	const PxU32 * numKeys)
{
	const PxU32 numElems = *numKeys;
	//Fix: parenthesized divisor; the previous form divided by blockDim.x only and then
	//multiplied by gridDim.x, over-counting iterations (masked by the early-out below).
	const PxU32 numIterations = (numElems + blockDim.x * gridDim.x - 1) / (blockDim.x * gridDim.x);
	for (PxU32 i = 0; i < numIterations; ++i)
	{
		const PxU32 workIndex = threadIdx.x + blockIdx.x * blockDim.x + i * blockDim.x * gridDim.x;
		if (workIndex >= numElems)
			return;
		const PxU32 index0 = rank0[workIndex];
		outValue0[workIndex] = inValue0[index0];
		const PxU32 index1 = rank1[workIndex];
		outValue1[workIndex] = inValue1[index1];
	}
}
//One thread per element: gathers either the low or the high 32 bits of sorted 64-bit
//values. A rank of 0xffffffff marks an aggregate with no backing value; the sentinel is
//propagated to the output unchanged.
extern "C" __global__ void radixSortCopyBits2(const PxU64* inValue, PxU32* outValue, PxU32* rank, const PxU32 numKeys,
	const bool lowBit)
{
	const PxU32 globalThreadIndex = threadIdx.x + blockDim.x * blockIdx.x;
	if (globalThreadIndex >= numKeys)
		return;
	const PxU32 index = rank[globalThreadIndex];
	PxU32 result = 0xffffffff;
	if (index != 0xffffffff)
	{
		const PxU64 v = inValue[index];
		result = lowBit ? PxU32(v & 0x00000000ffffffffull) : PxU32(v >> 32ll);
	}
	outValue[globalThreadIndex] = result;
}
//One thread per element: gathers full 64-bit values through the rank table. A rank of
//0xffffffff marks an aggregate; an all-ones 64-bit sentinel is emitted for it.
extern "C" __global__ void radixSortCopy2(const PxU64* inValue, PxU64* outValue, PxU32* rank, const PxU32 numKeys)
{
	const PxU32 tid = threadIdx.x + blockDim.x * blockIdx.x;
	if (tid >= numKeys)
		return;
	const PxU32 src = rank[tid];
	outValue[tid] = (src == 0xffffffff) ? 0xffffffffffffffff : inValue[src];
}

View File

@@ -0,0 +1,754 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __CU_REDUCTION_CUH__
#define __CU_REDUCTION_CUH__
#include <float.h>
#include "foundation/PxPreprocessor.h"
#include "foundation/PxSimpleTypes.h"
#include "foundation/PxMath.h"
#include "PxgCommonDefines.h"
#include "assert.h"
#include "utils.cuh"
using namespace physx;
//Minimum operator for float reductions/scans; identity is FLT_MAX.
struct MinOpFloat
{
	PX_CUDA_CALLABLE
	static float defaultValue()
	{
		return FLT_MAX;
	}
	PX_CUDA_CALLABLE
	static float op(float a, float b)
	{
		return fminf(a, b);
	}
	//Index-tracking variant: returns the smaller value and reports which of the two
	//indices it came from. Ties keep the first argument's index.
	PX_CUDA_CALLABLE
	static float op(unsigned int& retIdx, float a, unsigned int idxA, float b, unsigned int idxB)
	{
		const bool secondWins = b < a;
		retIdx = secondWins ? idxB : idxA;
		return secondWins ? b : a;
	}
};
//Maximum operator for float reductions/scans; identity is -FLT_MAX.
struct MaxOpFloat
{
	PX_CUDA_CALLABLE
	static inline float defaultValue()
	{
		return -FLT_MAX;
	}
	PX_CUDA_CALLABLE
	static inline float op(float a, float b)
	{
		return fmaxf(a, b);
	}
	//Index-tracking variant: returns the larger value and reports which of the two
	//indices it came from. Ties keep the first argument's index.
	PX_CUDA_CALLABLE
	static float op(unsigned int& retIdx, float a, unsigned int idxA, float b, unsigned int idxB)
	{
		const bool secondWins = b > a;
		retIdx = secondWins ? idxB : idxA;
		return secondWins ? b : a;
	}
};
//Unsigned maximum operator. Note: unlike its siblings it provides no defaultValue().
struct MaxOpPxU32
{
	PX_CUDA_CALLABLE
	static inline PxU32 op(const PxU32 a, const PxU32 b)
	{
		return a > b ? a : b;
	}
};
//Bitwise-AND operator for reductions/scans; identity is the all-ones mask.
struct AndOpPxU32
{
	PX_CUDA_CALLABLE
	static PxU32 defaultValue()
	{
		return ~0u;
	}
	PX_CUDA_CALLABLE
	static PxU32 op(PxU32 a, PxU32 b)
	{
		return b & a;
	}
};
//Unsigned integer sum operator for reductions/scans; identity 0.
struct AddOpPxU32
{
	PX_CUDA_CALLABLE
	static PxU32 defaultValue()
	{
		return 0u;
	}
	PX_CUDA_CALLABLE
	static PxU32 op(PxU32 a, PxU32 b)
	{
		return b + a;
	}
};
//Floating-point sum operator for reductions/scans; identity 0.
struct AddOpPxReal
{
	PX_CUDA_CALLABLE
	static PxReal defaultValue()
	{
		//Use a float literal: the previous integer literal (0ul) relied on an implicit
		//unsigned-to-float conversion for a PxReal identity.
		return 0.0f;
	}
	PX_CUDA_CALLABLE
	static PxReal op(PxReal a, PxReal b)
	{
		return a + b;
	}
};
//Signed integer sum operator for reductions/scans; identity 0.
struct AddOpPxI32
{
	PX_CUDA_CALLABLE
	static PxI32 defaultValue()
	{
		return 0;
	}
	PX_CUDA_CALLABLE
	static PxI32 op(PxI32 a, PxI32 b)
	{
		return b + a;
	}
};
//Bitwise-OR operator for reductions/scans; identity 0.
struct OrOpPxU32
{
	PX_CUDA_CALLABLE
	static PxU32 defaultValue()
	{
		return 0u;
	}
	PX_CUDA_CALLABLE
	static PxU32 op(PxU32 a, PxU32 b)
	{
		return b | a;
	}
};
//This isn't a runsum. It will produce the sum and spat the result to all the active threads.
//XOR-shuffle (butterfly) reduction over a group of 2^log2threadGroupSize lanes using
//operator OP; every participating lane ends up holding the full reduction result.
//syncMask must cover all lanes that take part in the shuffles.
template<typename OP, typename T, PxU32 log2threadGroupSize>
__device__ static inline T warpReduction(const PxU32 syncMask, T input)
{
	const PxU32 threadGroupSize = (1U << log2threadGroupSize);
#pragma unroll
	for(PxU32 reductionRadius = threadGroupSize >> 1; reductionRadius > 0; reductionRadius >>= 1)
	{
		T val = __shfl_xor_sync(syncMask, input, reductionRadius, threadGroupSize);
		input = OP::op(input, val);
	}
	return input;
}
//This isn't a runsum. It will produce the sum and spat the result to all the active threads.
//Full-warp specialization of the group reduction above (group size == WARP_SIZE).
template<typename OP, typename T>
__device__ static inline T warpReduction(const PxU32 syncMask, T input)
{
	return warpReduction<OP, T, LOG2_WARP_SIZE>(syncMask, input);
}
//makes sense only for comparison operations that don't alter the value. Expect -1 if the op is altering inputs
//In addition to the reduced value, reports the lowest-numbered lane whose input equals
//the result (so ties resolve to the lowest lane index).
template<typename OP, typename T, PxU32 log2threadGroupSize>
__device__ static inline T warpReduction(const PxU32 syncMask, const T& input, PxU32& winnerLaneIndex)
{
	T best = warpReduction<OP, T, log2threadGroupSize>(syncMask, input);
	winnerLaneIndex = lowestSetIndex(__ballot_sync(syncMask, best == input));
	return best;
}
//Full-warp specialization of the winner-tracking reduction (group size == WARP_SIZE).
template<typename OP, typename T>
__device__ static inline T warpReduction(const PxU32 syncMask, const T& input, PxU32& winnerLaneIndex)
{
	return warpReduction<OP, T, LOG2_WARP_SIZE>(syncMask, input, winnerLaneIndex);
}
//Two-level block reduction: each warp reduces its lanes, lane 0 of every warp publishes
//its partial result to shared memory, then warp 0 reduces the partials. Only threads of
//warp 0 return the final value; every other warp returns initialValue.
//sharedMemoryOneEntryPerWarp must hold blockSize/WARP_SIZE entries. Note there is no
//trailing __syncthreads, so callers must synchronize before reusing the shared buffer.
template<typename OP, typename T>
__device__ static inline T blockReduction(const PxU32 syncMask, const T& input, const T& initialValue, const PxU32 blockSize, volatile T* sharedMemoryOneEntryPerWarp)
{
	const PxU32 numWarpsPerBlock = blockSize / WARP_SIZE;
	const PxU32 warpIndex = threadIdx.x / WARP_SIZE;
	const PxU32 threadIndexInWarp = threadIdx.x & (WARP_SIZE - 1);
	T warpResult = warpReduction<OP, T>(syncMask, input);
	if (threadIndexInWarp == 0)
	{
		sharedMemoryOneEntryPerWarp[warpIndex] = warpResult;
	}
	__syncthreads();
	//Lanes beyond the number of warps contribute the identity so the final warp
	//reduction is unaffected by them.
	T val = (threadIdx.x < numWarpsPerBlock) ? sharedMemoryOneEntryPerWarp[threadIndexInWarp] : initialValue;
	if (warpIndex == 0)
		return warpReduction<OP, T>(syncMask, val);
	else
		return initialValue;
}
//Convenience overload that statically allocates the per-warp shared scratch for a
//compile-time block size.
template<typename OP, typename T, const PxU32 blockSize>
__device__ static inline T blockReduction(const PxU32 syncMask, const T& input, const T& initialValue)
{
	const PxU32 numWarpsPerBlock = blockSize / WARP_SIZE;
	volatile __shared__ T sData[numWarpsPerBlock];
	return blockReduction<OP, T>(syncMask, input, initialValue, blockSize, sData);
}
//inclusive scan
//Shuffle-up scan within a group of 2^log2threadGroupSize lanes: each lane ends with OP
//applied over its own input and all lower-indexed lanes' inputs.
template<typename OP, typename T, PxU32 log2threadGroupSize>
__device__ static inline T warpScan(const PxU32 syncMask, T input)
{
	const PxU32 threadGroupSize = (1U << log2threadGroupSize);
	const PxU32 idxInGroup = threadIdx.x & (threadGroupSize-1);
#pragma unroll
	for(PxU32 reductionRadius = 1; reductionRadius < threadGroupSize; reductionRadius <<= 1)
	{
		T val = __shfl_up_sync(syncMask, input, reductionRadius, threadGroupSize);
		//Lanes below the shuffle distance have no source lane; they keep their value.
		if (idxInGroup >= reductionRadius)
			input = OP::op(input, val);
	}
	return input;
}
//inclusive scan
//Full-warp specialization of the group scan above (group size == WARP_SIZE).
template<typename OP, typename T>
__device__ static inline T warpScan(const PxU32 syncMask, T input)
{
	return warpScan<OP, T, LOG2_WARP_SIZE>(syncMask, input);
}
//exclusive scan
//Full-warp exclusive scan: each lane receives OP applied over lower-indexed lanes'
//inputs only; lane 0 receives OP::defaultValue(). 'input' keeps accumulating the
//inclusive value purely to feed higher lanes through the shuffles.
template<typename OP, typename T>
__device__ static inline T warpScanExclusive(T input)
{
	T output = OP::defaultValue();
	const PxU32 idxInGroup = threadIdx.x & (WARP_SIZE - 1);
#pragma unroll
	for (PxU32 reductionRadius = 1; reductionRadius < WARP_SIZE; reductionRadius <<= 1)
	{
		T val = __shfl_up_sync(FULL_MASK, input, reductionRadius);
		if (idxInGroup >= reductionRadius)
		{
			input = OP::op(input, val);
			output = OP::op(output, val);
		}
	}
	return output;
}
//Input adapter for the scan kernels below: reads element idx from a plain array.
template<typename T>
class ReadArrayFunctor
{
public:
	PX_CUDA_CALLABLE
	ReadArrayFunctor(const T* PX_RESTRICT arr) : mArr(arr)
	{
	}
	PX_CUDA_CALLABLE
	T operator()(PxU32 idx) const
	{
		return mArr[idx];
	}
protected:
	const T* mArr;
};
//Output adapter for the scan kernels below: stores a value at element idx of a plain array.
template<typename T>
class WriteArrayFunctor
{
public:
	PX_CUDA_CALLABLE
	WriteArrayFunctor(T* PX_RESTRICT arr) : mArr(arr)
	{
	}
	PX_CUDA_CALLABLE
	void operator()(PxU32 idx, const T& val)
	{
		mArr[idx] = val;
	}
protected:
	T* mArr;
};
//Adapter that writes a single value (e.g. a scan grand total) to a fixed address.
template<typename T>
class WriteValueFunctor
{
public:
	PX_CUDA_CALLABLE
	WriteValueFunctor(T* PX_RESTRICT addr) : mAddr(addr)
	{
	}
	PX_CUDA_CALLABLE
	void operator()(const T& val)
	{
		*mAddr = val;
	}
protected:
	T* mAddr;
};
//No-op sink used when the single-value output (e.g. a grand total) is not needed.
template<typename T>
class WriteValueNOPFunctor
{
public:
	PX_CUDA_CALLABLE
	WriteValueNOPFunctor()
	{
	}
	PX_CUDA_CALLABLE
	void operator()(const T&)
	{
	}
};
template<PxU32 blockSize, PxU32 gridSize, typename OP, typename T, typename GetInputFunctor, typename SetTempFunctor>
static __device__ void scanKernel1of2(
const GetInputFunctor& getInputF,
SetTempFunctor& setTempF,
const PxU32 totalCount,
T* crossBlockTotalAccumulator)
{
__shared__ T crossWarpAccumulator[blockSize >> LOG2_WARP_SIZE];
__shared__ T accum;
if(threadIdx.x == 0)
accum = OP::defaultValue();
__syncthreads();
const PxU32 nbThreads = blockSize;
const PxU32 nbBlocksRequired = (totalCount + (nbThreads-1))/nbThreads;
const PxU32 nbBlocksPerBlock = (nbBlocksRequired + gridDim.x-1)/gridDim.x;
const PxU32 blockStartIndex = blockIdx.x * nbBlocksPerBlock;
const PxU32 threadIndexInWarp = threadIdx.x;
const PxU32 warpIndexInBlock = threadIdx.y;
for (PxU32 i = 0; i < nbBlocksPerBlock; ++i)
{
const PxU32 threadIndex = threadIdx.x + WARP_SIZE * threadIdx.y + (blockStartIndex + i) * blockDim.x * blockDim.y;
T val = OP::defaultValue();
if (threadIndex < totalCount)
val = getInputF(threadIndex);
T res = warpScanExclusive<OP, T>(val);
if (threadIndexInWarp == (WARP_SIZE - 1))
crossWarpAccumulator[warpIndexInBlock] = OP::op(res, val);
T prevAccum = accum;
__syncthreads();
if (warpIndexInBlock == 0)
{
T val2 = OP::defaultValue();
if (threadIndexInWarp < (blockSize >> LOG2_WARP_SIZE))
val2 = crossWarpAccumulator[threadIndexInWarp];
T res2 = warpScanExclusive<OP, T>(val2);
if (threadIndexInWarp < (blockSize >> LOG2_WARP_SIZE))
crossWarpAccumulator[threadIndexInWarp] = res2;
if (threadIndexInWarp == ((blockSize >> LOG2_WARP_SIZE) - 1))
{
accum = OP::op(accum, OP::op(res2, val2));
}
}
__syncthreads();
if (threadIndex < totalCount)
setTempF(threadIndex, OP::op(res, OP::op(crossWarpAccumulator[warpIndexInBlock], prevAccum)));
}
if ((threadIdx.y * WARP_SIZE + threadIdx.x) == ((blockSize >> LOG2_WARP_SIZE)-1))
{
crossBlockTotalAccumulator[blockIdx.x] = accum;
}
}
//Second pass of the two-kernel exclusive scan: the first row of threads
//(threadIdx.y == 0) exclusive-scans the per-block totals from pass 1; block 0
//additionally emits the grand total through writeTotalF. Then every element's pass-1
//partial is offset by its block's base and written through setOutF. Appears to assume
//gridSize <= the warp width so the block totals fit a single warp scan - TODO confirm.
template<PxU32 gridSize, typename OP, typename T, typename GetTempFunctor, typename SetOutputFunctor,
	typename WriteGrandTotalFunctor>
static __device__ void scanKernel2of2(
	const GetTempFunctor& getTempF,
	SetOutputFunctor& setOutF,
	WriteGrandTotalFunctor& writeTotalF,
	const PxU32 totalCount,
	const T* crossBlockTotalAccumulator)
{
	const PxU32 nbThreads = blockDim.x * blockDim.y;
	const PxU32 nbBlocksRequired = (totalCount + (nbThreads-1))/nbThreads;
	const PxU32 nbBlocksPerBlock = (nbBlocksRequired + gridDim.x-1)/gridDim.x;
	const PxU32 blockStartIndex = blockIdx.x * nbBlocksPerBlock;
	__shared__ T blockAccum[gridSize];
	const PxU32 threadIndexInWarp = threadIdx.x;
	if (threadIdx.y == 0)
	{
		T val = OP::defaultValue();
		if (threadIdx.x < gridSize)
			val = crossBlockTotalAccumulator[threadIndexInWarp];
		T res = warpScanExclusive<OP, T>(val);
		if (threadIdx.x < gridSize)
			blockAccum[threadIndexInWarp] = res;
		//The last participating lane of block 0 publishes the total over all blocks.
		if (threadIdx.x == gridSize - 1 && blockIdx.x == 0)
		{
			writeTotalF(OP::op(val, res));
		}
	}
	__syncthreads();
	//Base offset of this block within the device-wide scan.
	T accumulation = blockAccum[blockIdx.x];
	for(PxU32 i = 0; i < nbBlocksPerBlock; ++i)
	{
		const PxU32 threadIndex = threadIdx.x + WARP_SIZE * threadIdx.y + (blockStartIndex + i) * blockDim.x * blockDim.y;
		if(threadIndex < totalCount)
		{
			T val = OP::op(getTempF(threadIndex), accumulation);
			setOutF(threadIndex, val);
		}
	}
}
//keeping this for the broadphase:
//This is the parallel version of sum: a warp-cooperative shuffle scan over the first
//nbElems lanes. Returns the calling lane's inclusive prefix sum of originalValue,
//minus 'value' (so passing value == originalValue yields an exclusive prefix sum).
//Lanes >= nbElems return 0. syncMask must include every lane that makes this call together.
template<PxU32 nbElems, typename T>
static __inline__ __device__ T warpScanAdd(const PxU32 syncMask, const PxU32 /*index*/, const PxU32 threadIndexInWarp, T* /*sData*/, const T originalValue, const T value)
{
	// Restrict the shuffle mask to the lanes that actually participate in the scan.
	unsigned mask_local = __ballot_sync(syncMask, threadIndexInWarp < nbElems);
	if(threadIndexInWarp < nbElems)
	{
		T val = originalValue;
#pragma unroll
		for(PxU32 i = 1; i < nbElems; i<<=1)
		{
			// Read the running sum from the lane i positions below; lanes < i discard it.
			const T temp = __shfl_sync(mask_local, val, threadIndexInWarp-i);
			if(threadIndexInWarp >= i)
				val += temp;
		}
		return val - value;
	}
	return 0;
}
// Warp-cooperative inclusive prefix-max over the first nbElems lanes: each lane
// returns the maximum of originalValue over lanes [0..threadIndexInWarp].
// Lanes >= nbElems return 0. syncMask must include every lane that makes this call.
// Fix: removed the local 'mask' variable that was computed but never used.
template<PxU32 nbElems>
static __inline__ __device__ PxU32 warpScanMax(const PxU32 syncMask, const PxU32 /*index*/, const PxU32 threadIndexInWarp, PxU32* /*sData*/, const PxU32 originalValue)
{
	// Restrict the shuffle mask to the lanes that actually participate in the scan.
	unsigned mask_local = __ballot_sync(syncMask, threadIndexInWarp < nbElems);
	if(threadIndexInWarp < nbElems)
	{
		PxU32 val = originalValue;
#pragma unroll
		for(PxU32 i = 1; i < nbElems; i<<=1)
		{
			// Read the running max from the lane i positions below; lanes < i discard it.
			const PxU32 temp = __shfl_sync(mask_local, (int)val, threadIndexInWarp-i);
			if(threadIndexInWarp >= i)
				val = PxMax(temp, val);
		}
		return val;
	}
	return 0;
}
//This is the parallel version of exclusive sum: a warp-cooperative shuffle scan over the
//first nbElems lanes (log2(nbElems) rounds of shuffle+add). The per-lane result
//(inclusive sum minus 'value'; pass value == originalValue for an exclusive sum) is
//also written to sData[index], followed by a warp-level fence so other participating
//lanes can read it afterwards. Lanes >= nbElems return 0 and write nothing.
template<PxU32 nbElems, typename T>
static __inline__ __device__ T warpScanAddWriteToSharedMem(PxU32 syncMask, PxU32 index, PxU32 threadIndexInWarp, T* sData, T originalValue, T value)
{
	// Restrict the shuffle mask to the lanes that actually participate in the scan.
	unsigned mask_local = __ballot_sync(syncMask, threadIndexInWarp < nbElems);
	if(threadIndexInWarp < nbElems)
	{
		T temp = 0;
		T val = originalValue;
#pragma unroll
		for(PxU32 i = 1; i < nbElems; i<<=1)
		{
			temp = __shfl_sync(mask_local, val, threadIndexInWarp-i);
			if(threadIndexInWarp >= i)
				val += temp;
		}
		val -= value;
		sData[index] = val;
		__syncwarp(mask_local); //Mem fence for shared data write
		return val;
	}
	return 0;
}
// Counts how many lanes of 'mask' contribute an element and returns that count to every lane.
PX_FORCE_INLINE __device__ PxU32 warpCountAndBroadcast(PxU32 mask, bool threadContributesElement)
{
	const PxU32 contributingLanes = __ballot_sync(mask, threadContributesElement);
	return __popc(contributingLanes);
}
// Counts the number of threads in the thread block that contribute an element and
// returns the total to every thread. sharedMem must provide one PxU32 slot per warp.
// NOTE(review): sharedMem[threadIndexInWarp] is read by all 32 lanes, so this appears
// to assume 32 warps per block (or pre-initialized shared memory) - confirm at call sites.
PX_FORCE_INLINE __device__ PxU32 threadBlockCountAndBroadcast(bool threadContributesElement, PxU32* sharedMem)
{
	const PxU32 threadIndexInWarp = threadIdx.x & 31;
	const PxU32 warpIndex = threadIdx.x >> 5; // threadIdx.x / 32;
	// Per-warp contributor count; lane 0 of each warp publishes it to shared memory.
	PxU32 perWarpCount = warpCountAndBroadcast(FULL_MASK, threadContributesElement);
	if (threadIndexInWarp == 0)
		sharedMem[warpIndex] = perWarpCount;
	__syncthreads();
	// Each lane reads one warp's count; the warp reduction sums them for everyone.
	return warpReduction<AddOpPxU32, PxU32>(FULL_MASK, sharedMem[threadIndexInWarp]);
}
// Performs an exclusive scan over a warp. Every thread can only contribute 0 or 1 to the scan through the mask.
// Returns the number of mask bits set for lanes strictly below threadIndexInWarp.
PX_FORCE_INLINE __device__ PxU32 warpScanExclusive(PxU32 mask, PxU32 threadIndexInWarp)
{
	// 1u instead of 1: left-shifting a signed int by 31 is undefined behavior in C++.
	return __popc(mask & ((1u << threadIndexInWarp) - 1));
}
//Performs an exclusive scan over a thread block. Every thread can only contribute 0 or 1 to the sum (via threadContributesElement).
//The return value is the exclusive cumulative sum for every thread; totalSum receives the total number of contributed elements.
//perWarpCountS must be shared memory with NbWarps entries available. Must be called by
//every thread of the block (contains __syncthreads); NbWarps may be at most 32 since a
//single warp scans the per-warp counts.
template<PxU32 NbWarps>
PX_FORCE_INLINE __device__ PxU32 threadBlockScanExclusive(bool threadContributesElement, PxU32& totalSum, PxU32* perWarpCountS)
{
	const PxU32 threadIndexInWarp = threadIdx.x & 31;
	const PxU32 warpIndex = threadIdx.x >> 5; // threadIdx.x / 32;
	PxU32 ballotMask = __ballot_sync(FULL_MASK, threadContributesElement);
	PxU32 nbInteresting = __popc(ballotMask); //The number of elements emitted per warp
	// Lane 0 of each warp publishes its warp's count.
	if (threadIndexInWarp == 0)
		perWarpCountS[warpIndex] = nbInteresting;
	__syncthreads();
	// The lanes of each warp cooperatively scan the per-warp counts.
	PxU32 warpCount = threadIndexInWarp < NbWarps ? perWarpCountS[threadIndexInWarp] : 0;
	PxU32 total = warpScan<AddOpPxU32, PxU32>(FULL_MASK, warpCount); //The inclusive cumulative sum per warp. The last warp has the total number of elements created by the full thread block
	PxU32 carryExclusiveScan = __shfl_sync(FULL_MASK, total, warpIndex) - perWarpCountS[warpIndex]; //Broadcast the exclusive cumulative base sum (inclusiveSum - perWarpCount = exclusiveSum)
	totalSum = __shfl_sync(FULL_MASK, total, 31); //Broadcast the total sum of the last warp (which is the overall total sum) to all warps
	return carryExclusiveScan + warpScanExclusive(ballotMask, threadIndexInWarp); //Combine base sum and the sum per warp
}
// Convenience overload that allocates the required per-warp shared memory itself.
template<PxU32 NbWarps>
PX_FORCE_INLINE __device__ PxU32 threadBlockScanExclusive(bool threadContributesElement, PxU32& totalSum)
{
	__shared__ PxU32 warpCountsShared[NbWarps];
	return threadBlockScanExclusive<NbWarps>(threadContributesElement, totalSum, warpCountsShared);
}
//Performs an exclusive scan over a thread block. Every thread can contribute an arbitrary value to the sum.
//The return value is the exclusive cumulative sum for every thread; totalSum receives the sum of all contributed values.
//perWarpCountS must be shared memory with NbWarps entries available. Must be called by
//every thread of the block (contains __syncthreads); NbWarps may be at most 32 since a
//single warp scans the per-warp totals.
template<PxU32 NbWarps>
PX_FORCE_INLINE __device__ PxU32 threadBlockScanExclusive(PxU32 numElementsFromCallingThread, PxU32& totalSum, PxU32* perWarpCountS)
{
	const PxU32 threadIndexInWarp = threadIdx.x & 31;
	const PxU32 warpIndex = threadIdx.x >> 5; // threadIdx.x / 32;
	// Inclusive scan inside each warp; lane 31 holds the warp's total and publishes it.
	PxU32 perWarpInclusiveScan = warpScan<AddOpPxU32, PxU32>(FULL_MASK, numElementsFromCallingThread);
	if (threadIndexInWarp == 31)
		perWarpCountS[warpIndex] = perWarpInclusiveScan;
	__syncthreads();
	// The lanes of each warp cooperatively scan the per-warp totals.
	PxU32 warpCount = threadIndexInWarp < NbWarps ? perWarpCountS[threadIndexInWarp] : 0;
	PxU32 total = warpScan<AddOpPxU32, PxU32>(FULL_MASK, warpCount); //The inclusive cumulative sum per warp. The last warp has the total number of elements created by the full thread block
	PxU32 carryExclusiveScan = __shfl_sync(FULL_MASK, total, warpIndex) - perWarpCountS[warpIndex]; //Broadcast the exclusive cumulative base sum (inclusiveSum - perWarpCount = exclusiveSum)
	totalSum = __shfl_sync(FULL_MASK, total, 31); //Broadcast the total sum of the last warp (which is the overall total sum) to all warps
	return carryExclusiveScan + perWarpInclusiveScan - numElementsFromCallingThread; //Combine base sum and the sum per warp
}
// Convenience overload that allocates the required per-warp shared memory itself.
template<PxU32 NbWarps>
PX_FORCE_INLINE __device__ PxU32 threadBlockScanExclusive(PxU32 numElementsFromCallingThread, PxU32& totalSum)
{
	__shared__ PxU32 warpCountsShared[NbWarps];
	return threadBlockScanExclusive<NbWarps>(numElementsFromCallingThread, totalSum, warpCountsShared);
}
//Allows to get indices in an output array for every thread (even across thread blocks) where every thread either emits an element or not.
//Only threads that contribute an element will get a valid index returned, which is expected since all other threads don't output anything.
//The order of the output indices is not deterministic since atomic add is used.
//The method must be called by one or preferably multiple full warps; for a single warp see the optimized version below.
template<PxU32 NbWarps>
PX_FORCE_INLINE __device__ PxU32 globalScanExclusive(bool threadContributesElement, PxU32* atomicCounter)
{
	// NOTE(review): this leading sync presumably separates earlier shared-memory use
	// from the buffers declared below - confirm if this function is refactored.
	__syncthreads();
	__shared__ PxU32 perWarpCountS[NbWarps];
	PxU32 threadBlockTotalSum;
	// Exclusive index of this thread's element within its own thread block.
	PxU32 indexInThreadBlock = threadBlockScanExclusive<NbWarps>(threadContributesElement, threadBlockTotalSum, perWarpCountS);
	__shared__ PxU32 globalOffset;
	if (threadIdx.x == 0)
	{
		//Only one thread per thread block needs to perform an atomic add
		globalOffset = atomicAdd(atomicCounter, threadBlockTotalSum);
	}
	__syncthreads();
	// Global index = the block's atomically reserved base offset + block-local index.
	return indexInThreadBlock + globalOffset;
}
//Optimized version where a thread block only consists of one warp. Does not need shared memory.
//Allows to get indices in an output array for every thread (even across thread blocks) where every thread either emits an element or not.
//Only threads that contribute an element will get a valid index returned, which is expected since all other threads don't output anything.
//The order of the output indices is not deterministic since atomic add is used.
//The method must be called by one full warp.
PX_FORCE_INLINE __device__ PxU32 globalScanExclusiveSingleWarp(bool threadContributesElement, PxU32* atomicCounter)
{
	PxU32 idxInWarp = threadIdx.x & 31;
	// Bitmask of contributing lanes; exclusive index = number of set bits below this lane.
	const PxU32 resultWarp = __ballot_sync(FULL_MASK, threadContributesElement);
	const PxU32 offset = warpScanExclusive(resultWarp, idxInWarp); // __popc(resultWarp & ((1 << idxInWarp) - 1));
	const PxU32 validCount = __popc(resultWarp);
	// Sentinel base index used when the warp contributes nothing; the returned indices
	// are then intentionally invalid (and kept below 0xffffffff so '+ offset' cannot wrap).
	PxU32 startIndex = 0xFFffFFff - 32; // -32 to prevent wrap-around
	if (idxInWarp == 0 && validCount > 0)
	{
		// One atomic per warp reserves the warp's contiguous output range.
		startIndex = atomicAdd(atomicCounter, validCount);
	}
	return __shfl_sync(FULL_MASK, startIndex, 0) + offset;
}
//Overrides "globalScanExclusiveSingleWarp" to support adding multiple elements per thread, rather than limiting to a single element.
//Returns the global start index of the calling thread's numElements-sized output range.
//Must be called by one full warp.
PX_FORCE_INLINE __device__ PxU32 globalScanExclusiveSingleWarp(PxU32 numElements, PxU32* atomicCounter)
{
	const PxU32 idxInWarp = threadIdx.x & 31;
	// Exclusive prefix sum of the per-thread element counts within the warp.
	const PxU32 offset = warpScanExclusive<AddOpPxU32, PxU32>(numElements);
	// Sentinel base index used when the warp contributes nothing; kept below
	// 0xffffffff so '+ offset' cannot wrap around.
	PxU32 startIndex = 0xFFffFFff - 32; // -32 to prevent wrap-around
	if(idxInWarp == 31)
	{
		PxU32 totalSum = offset + numElements; // When executed by the last thread in the warp, this is the total sum of all
		// numElements over the whole warp
		// One atomic per warp reserves the warp's contiguous output range.
		if(totalSum > 0)
			startIndex = atomicAdd(atomicCounter, totalSum);
	}
	// Broadcast lane 31's base index and add each lane's exclusive offset.
	return __shfl_sync(FULL_MASK, startIndex, 31) + offset;
}
// returns the largest index into the (sorted!) data array s.t. data[index] <= value
// if there is no such index (i.e., data[0] > value) returns 0
template<class T>
static __device__ PxU32 binarySearch(const T* PX_RESTRICT data, const PxU32 numElements, const T& value)
{
	// Upper-bound style search: 'lo' converges to the index of the first element > value.
	PxU32 lo = 0;
	PxU32 hi = numElements;
	while (lo < hi)
	{
		const PxU32 mid = (lo + hi) / 2;
		if (data[mid] <= value)
			lo = mid + 1;
		else
			hi = mid;
	}
	// Step back one to land on the last element <= value; clamp at 0 when none exists.
	return lo > 0 ? lo - 1 : 0;
}
#endif //CU_REDUCTION_CU

View File

@@ -0,0 +1,60 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __CU_SB_MIDPHASESCRATCH_CUH__
#define __CU_SB_MIDPHASESCRATCH_CUH__
#include "vector_types.h"
namespace physx
{
namespace Gu
{
struct BV32DataDepthInfo;
struct BV32DataPacked;
};
};
// Scratch layout used by softbody midphase kernels while traversing a tetrahedral
// mesh's BV32 tree.
struct sbMidphaseScratch
{
	const float4 * PX_RESTRICT tetmeshVerts;       // tet mesh vertex positions
	const uint4 * PX_RESTRICT tetmeshTetIndices;   // four vertex indices per tetrahedron
	const PxU8* PX_RESTRICT tetmeshSurfaceHint;    // per-tet surface hints (assumed; confirm producer)
	const Gu::BV32DataDepthInfo* PX_RESTRICT bv32DepthInfo;
	const PxU32* PX_RESTRICT bv32RemapPackedNodeIndex;
	//bv32 tree
	Gu::BV32DataPacked* bv32PackedNodes;
	//stack for traversal
	int sBv32Nodes[192]; //6 depth of the bv32 tree (6 levels x 32 entries)
};
// Ensure the scratch fits the reserved budget of WARP_SIZE * 7 PxU32s.
PX_COMPILE_TIME_ASSERT(sizeof(sbMidphaseScratch) <= WARP_SIZE * 7 * sizeof(PxU32));
#endif

View File

@@ -0,0 +1,110 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __CU_SHUFFLE_CUH__
#define __CU_SHUFFLE_CUH__
#include "cuda.h"
#include "PxgCommonDefines.h"
//#include "nputils.cuh"
// Broadcasts all three components of v from lane i (within groups of 'width' lanes).
static __device__ __forceinline__
physx::PxVec3 shuffle(const physx::PxU32 syncMask, const physx::PxVec3& v, int i, physx::PxU32 width = WARP_SIZE)
{
	const float sx = __shfl_sync(syncMask, v.x, i, width);
	const float sy = __shfl_sync(syncMask, v.y, i, width);
	const float sz = __shfl_sync(syncMask, v.z, i, width);
	return physx::PxVec3(sx, sy, sz);
}
// Broadcasts all four components of v from the given lane across the full warp.
static __device__ __forceinline__
float4 shuffle(const physx::PxU32 syncMask, const float4& v, const int lane)
{
	const float sx = __shfl_sync(syncMask, v.x, lane);
	const float sy = __shfl_sync(syncMask, v.y, lane);
	const float sz = __shfl_sync(syncMask, v.z, lane);
	const float sw = __shfl_sync(syncMask, v.w, lane);
	return make_float4(sx, sy, sz, sw);
}
// Component-wise minimum of v across all 32 lanes (xor/butterfly shuffle reduction);
// every lane receives the warp-wide result.
static __device__ __forceinline__
physx::PxVec3 warpShuffleMin(physx::PxVec3 v)
{
	for (physx::PxU32 stride = 1; stride < WARP_SIZE; stride <<= 1)
	{
		const float ox = __shfl_xor_sync(FULL_MASK, v.x, stride);
		const float oy = __shfl_xor_sync(FULL_MASK, v.y, stride);
		const float oz = __shfl_xor_sync(FULL_MASK, v.z, stride);
		v.x = fminf(v.x, ox);
		v.y = fminf(v.y, oy);
		v.z = fminf(v.z, oz);
	}
	return v;
}
// Component-wise maximum of v across all 32 lanes (xor/butterfly shuffle reduction);
// every lane receives the warp-wide result.
static __device__ __forceinline__
physx::PxVec3 warpShuffleMax(physx::PxVec3 v)
{
	for (physx::PxU32 stride = 1; stride < WARP_SIZE; stride <<= 1)
	{
		const float ox = __shfl_xor_sync(FULL_MASK, v.x, stride);
		const float oy = __shfl_xor_sync(FULL_MASK, v.y, stride);
		const float oz = __shfl_xor_sync(FULL_MASK, v.z, stride);
		v.x = fmaxf(v.x, ox);
		v.y = fmaxf(v.y, oy);
		v.z = fmaxf(v.z, oz);
	}
	return v;
}
//// experimentally, seems more register-efficient to coalesce this
//static __device__ __forceinline__
//physx::PxReal shuffleDot(const physx::PxU32 syncMask, const physx::PxVec3& v0, int shuffle0, const physx::PxVec3& v1)
//{
// return __shfl_sync(syncMask, v0.x, shuffle0)*v1.x + __shfl_sync(syncMask, v0.y, shuffle0)*v1.y + __shfl_sync(syncMask, v0.z, shuffle0)*v1.z;
//}
//
//static __device__ __forceinline__
//physx::PxU32 maxIndex(physx::PxReal v, physx::PxU32 mask, physx::PxReal& maxV)
//{
// maxV = mask & (1 << threadIdx.x) ? v : -FLT_MAX;
//
// maxV = fmaxf(maxV, __shfl_xor_sync(FULL_MASK, maxV, 16));
// maxV = fmaxf(maxV, __shfl_xor_sync(FULL_MASK, maxV, 8));
// maxV = fmaxf(maxV, __shfl_xor_sync(FULL_MASK, maxV, 4));
// maxV = fmaxf(maxV, __shfl_xor_sync(FULL_MASK, maxV, 2));
// maxV = fmaxf(maxV, __shfl_xor_sync(FULL_MASK, maxV, 1));
//
// return lowestSetIndex(__ballot_sync(FULL_MASK, maxV == v)&mask);
//}
//
//static __device__ __forceinline__
//physx::PxU32 minIndex(physx::PxReal v, physx::PxU32 mask, physx::PxReal& minV)
//{
// minV = mask & (1 << threadIdx.x) ? v : FLT_MAX;
//
// minV = fminf(minV, __shfl_xor_sync(FULL_MASK, minV, 16));
// minV = fminf(minV, __shfl_xor_sync(FULL_MASK, minV, 8));
// minV = fminf(minV, __shfl_xor_sync(FULL_MASK, minV, 4));
// minV = fminf(minV, __shfl_xor_sync(FULL_MASK, minV, 2));
// minV = fminf(minV, __shfl_xor_sync(FULL_MASK, minV, 1));
//
// return lowestSetIndex(__ballot_sync(FULL_MASK, minV == v)&mask);
//}
#endif //SHUFFLE_CUH

View File

@@ -0,0 +1,98 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __CU_SOLVER_ERROR_CUH__
#define __CU_SOLVER_ERROR_CUH__
#include "DyResidualAccumulator.h"
#include "atomic.cuh"
#include "reduction.cuh"
// Thread-local accumulator for solver residual errors (sum of squares, sample count,
// maximum absolute error), with helpers to merge the local values into a global
// Dy::ErrorAccumulator.
struct PxgErrorAccumulator
{
	PxReal sumOfSquares; // running sum of squared residuals
	PxU32 counter;       // number of accumulated residual samples
	PxReal maxError;     // largest absolute residual seen so far
	PX_FORCE_INLINE __device__ PxgErrorAccumulator() : sumOfSquares(0.0f), counter(0), maxError(0.0f)
	{ }
	// Provides a calculateResidual function using fast GPU math instructions
	static PX_FORCE_INLINE __device__ PxReal calculateResidual(PxReal deltaF, PxReal velocityMultiplier)
	{
		// Guard against division by zero; __fdividef is the fast (approximate) divide.
		return velocityMultiplier == 0.0f ? 0.0f : __fdividef(deltaF, velocityMultiplier);
	}
	// Accumulates one residual sample into this thread-local accumulator.
	PX_FORCE_INLINE __device__ void accumulateErrorLocal(PxReal deltaF, PxReal velocityMultiplier)
	{
		PxReal e = PxgErrorAccumulator::calculateResidual(deltaF, velocityMultiplier);
		sumOfSquares += e * e;
		++counter;
		maxError = PxMax(maxError, PxAbs(e));
	}
	// Convenience overload accumulating two residual samples.
	PX_FORCE_INLINE __device__ void accumulateErrorLocal(PxReal deltaF0, PxReal deltaF1,
		PxReal velocityMultiplier0, PxReal velocityMultiplier1)
	{
		accumulateErrorLocal(deltaF0, velocityMultiplier0);
		accumulateErrorLocal(deltaF1, velocityMultiplier1);
	}
	/*PX_FORCE_INLINE __device__ void accumulateErrorGlobal(Dy::ErrorAccumulator& globalAccumulator)
	{
		atomicAdd(&globalAccumulator.mErrorSumOfSquares, sumOfSquares);
		atomicAdd(&globalAccumulator.mCounter, counter);
		if (maxError > globalAccumulator.mMaxError)
			AtomicMax(&globalAccumulator.mMaxError, maxError);
	}*/
	// Merges into the global accumulator without atomics; only safe when the caller
	// guarantees exclusive access to globalAccumulator.
	PX_FORCE_INLINE __device__ void accumulateErrorGlobalNoAtomics(Dy::ErrorAccumulator& globalAccumulator)
	{
		globalAccumulator.mErrorSumOfSquares += sumOfSquares;
		globalAccumulator.mCounter += counter;
		if (maxError > globalAccumulator.mMaxError)
			globalAccumulator.mMaxError = maxError;
	}
	// Warp-cooperative merge: reduces across the full warp, then lane 0 applies the
	// combined values with atomics. Must be called by all 32 lanes of the warp.
	PX_FORCE_INLINE __device__ void accumulateErrorGlobalFullWarp(Dy::ErrorAccumulator& globalAccumulator, PxU32 threadIndexInWarp)
	{
		PxReal s = warpReduction<AddOpPxReal, PxReal>(FULL_MASK, sumOfSquares);
		PxU32 count = warpReduction<AddOpPxU32, PxU32>(FULL_MASK, counter);
		PxReal maxErr = warpReduction<MaxOpFloat, PxReal>(FULL_MASK, maxError);
		if (threadIndexInWarp == 0)
		{
			atomicAdd(&globalAccumulator.mErrorSumOfSquares, s);
			atomicAdd(&globalAccumulator.mCounter, count);
			// Cheap pre-check avoids the atomic when this warp cannot raise the max.
			if (maxErr > globalAccumulator.mMaxError)
				AtomicMax(&globalAccumulator.mMaxError, maxErr);
		}
	}
};
#endif

View File

@@ -0,0 +1,175 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __CU_UPDATE_CACHE_AND_BOUND_CUH__
#define __CU_UPDATE_CACHE_AND_BOUND_CUH__
#include "foundation/PxTransform.h"
#include "PxsTransformCache.h"
#include "PxgConvexConvexShape.h"
#include "geometry/PxGeometry.h"
namespace physx
{
// Computes the world-space pose of a shape from the body's world pose and the
// shape/body transforms relative to the actor frame.
static __device__ PxTransform getAbsPose(const PxTransform& body2World, const PxTransform& shape2Actor, const PxTransform& body2Actor)
{
	const PxTransform shape2Body = body2Actor.transformInv(shape2Actor);
	return body2World.transform(shape2Body);
}
// Stores a transform and its flags into the given slot of the transform cache.
static __device__ void setTransformCache(PxsCachedTransform* cacheArray, const PxTransform& transform, const PxU32 flags, const PxU32 index)
{
	PxsCachedTransform& entry = cacheArray[index];
	entry.transform = transform;
	entry.flags = flags;
}
// World-space half-extents of a box with the given half-extents after applying 'basis':
// per axis, the maximum reach is the sum of the absolute contributions of each
// extended basis column.
static __device__ PxVec3 basisExtent(const PxMat33& basis, const PxVec3& extent)
{
	const PxVec3 c0 = basis.column0 * extent.x;
	const PxVec3 c1 = basis.column1 * extent.y;
	const PxVec3 c2 = basis.column2 * extent.z;
	return PxVec3(
		PxAbs(c0.x) + PxAbs(c1.x) + PxAbs(c2.x),
		PxAbs(c0.y) + PxAbs(c1.y) + PxAbs(c2.y),
		PxAbs(c0.z) + PxAbs(c1.z) + PxAbs(c2.z));
}
// Recomputes the world-space AABB of a shape at the given pose and writes it to
// boundsArray[index]. Sphere/capsule/box bounds are derived analytically from the
// precomputed local bounds; GPU-compatible convex meshes are bounded by transforming
// their hull vertices; every other case falls back to transforming the local bounds.
static __device__ void updateBounds(const PxgShapeSim& shapeSim, const PxgShape* convexShapes, PxBounds3* boundsArray, const PxTransform& pose, const PxU32 index)
{
	const PxBounds3& localBound = shapeSim.mLocalBounds;
	PxBounds3& updatedBound = boundsArray[index];
	switch (shapeSim.mShapeType)
	{
	case PxGeometryType::eSPHERE:
	{
		// Rotation-invariant: just translate the local bounds.
		updatedBound.minimum = pose.p + localBound.minimum;
		updatedBound.maximum = pose.p + localBound.maximum;
	}
	break;
	case PxGeometryType::eCAPSULE:
	{
		// Recover radius and half-height from the local bounds (capsule axis along x).
		const PxF32 radius = localBound.maximum.y;
		const PxF32 halfHeight = localBound.maximum.x - radius;
		const PxVec3 d = pose.q.getBasisVector0();
		PxVec3 extents;
		for (PxU32 ax = 0; ax < 3; ax++)
			extents[ax] = PxAbs(d[ax]) * halfHeight + radius;
		updatedBound.minimum = pose.p - extents;
		updatedBound.maximum = pose.p + extents;
	}
	break;
	case PxGeometryType::eBOX:
	{
		const PxVec3 halfExtents = localBound.maximum;
		const PxVec3 extents = basisExtent(PxMat33(pose.q), halfExtents);
		updatedBound.minimum = pose.p - extents;
		updatedBound.maximum = pose.p + extents;
	}
	break;
	case PxGeometryType::eCONVEXMESH:
	{
		const PxU32 hullIndex = shapeSim.mHullDataIndex;
		if (hullIndex != 0xFFffFFff)
		{
			const PxgShape& shape = convexShapes[hullIndex];
			// Fold the mesh scale into the rotation so each vertex needs one transform.
			PxMat33 rot(pose.q);
			if (!shape.scale.isIdentity())
				rot = rot * shape.scale.toMat33();
			// NOTE(review): the offsets below assume a packed hull layout of
			// [uint4 header][float4][float4][vertex array] with the vertex count in the
			// high 16 bits of the low word of the second uint4 - confirm against the
			// convex hull serialization code if that format changes.
			const PxU8* convexPtr = (PxU8*)shape.hullOrMeshPtr;
			const uint4 tmp = *(((uint4*)convexPtr) + 1);
			const float4* pVertices = reinterpret_cast<const float4*>(convexPtr + sizeof(uint4) + sizeof(float4) + sizeof(float4));
			//const PxU32 polyData0_NbEdgesNbHullVerticesNbPolygons = tmp.x;
			const PxU32 nbHullVertices = u16High(u32Low(tmp.x));//getNbHullVertices(polyData0_NbEdgesNbHullVerticesNbPolygons);
			//PxU32 nb = shape.hullData->mNbHullVertices;
			//const PxVec3* v = shape.hullData->getHullVertices();
			// Accumulate the AABB of all rotated/scaled hull vertices.
			PxVec3 minV = PxVec3(PX_MAX_F32);
			PxVec3 maxV = PxVec3(-PX_MAX_F32);
			for (PxU32 i = 0; i < nbHullVertices; ++i)
			{
				const float4 vf = pVertices[i];
				const PxVec3 v = PxVec3(vf.x, vf.y, vf.z);
				const PxVec3 vertexV = rot.transform(v);
				minV = minV.minimum(vertexV);
				maxV = maxV.maximum(vertexV);
			}
			//const Vec4V posV = Vec4V_From_Vec3V(V3LoadU(&pose.p.x));
			maxV += pose.p;
			minV += pose.p;
			updatedBound.minimum = minV;
			updatedBound.maximum = maxV;
		}
		else
		{
			//ML: this is for GPU incompatible type, which is hull vertices >64 and each hull polygon has vertices > 31
			updatedBound = PxBounds3::transformFast(pose, localBound);
		}
	}
	break;
	default:
	{
		//This updates any dynamic meshes or HFs that may be attached to simulation shapes
		updatedBound = PxBounds3::transformFast(pose, localBound);
	}
	break;
	}
}
// Writes the shape's world pose into the transform cache and, when requested by the
// broadphase (isBP), refreshes its world-space bounds as well.
__device__ static inline void updateCacheAndBound(const PxTransform& absPos, const PxgShapeSim& shapeSim, PxU32 index,
	PxsCachedTransform* cacheArray, PxBounds3* boundsArray, const PxgShape* shapes, bool isBP)
{
	//TODO: port the transform flags
	setTransformCache(cacheArray, absPos, 0, index);
	if (!isBP)
		return;
	updateBounds(shapeSim, shapes, boundsArray, absPos, index);
}
}
#endif

View File

@@ -0,0 +1,234 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
#include "foundation/PxSimpleTypes.h"
#include "foundation/PxVec3.h"
#include "foundation/PxVec4.h"
#include "foundation/PxMat44.h"
#include "assert.h"
#include "utils.cuh"
#include "PxgInterpolation.h"
#include <stdio.h>
#include "PxDeformableSkinning.h"
#include "atomic.cuh"
using namespace physx;
// Empty host-side entry point; presumably referenced from host code to pull this
// module's kernels into the link - confirm against the kernel registration code.
extern "C" __host__ void initCommonKernels2() {}
// Packs separate position and normal buffers into one interleaved [position, normal]
// PxVec3 stream (the w components are dropped). One thread per element; 'length' is
// the number of input elements, so the output buffer must hold 2 * length entries.
extern "C" __global__ void interleaveBuffers(const float4* PX_RESTRICT vertices, const float4* PX_RESTRICT normals, PxU32 length, PxVec3* interleavedResultBuffer)
{
	// PxU32 instead of int: avoids the signed/unsigned comparison against 'length'.
	const PxU32 threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
	if (threadIndex >= length)
		return;
	const float4 v = vertices[threadIndex];
	const float4 n = normals[threadIndex];
	interleavedResultBuffer[2 * threadIndex] = PxVec3(v.x, v.y, v.z);
	interleavedResultBuffer[2 * threadIndex + 1] = PxVec3(n.x, n.y, n.z);
}
//A bit experimental. Can help to get smoother transition between triangles.
//Applies the cubic basis 3t^2 - 2t^3 to each barycentric coordinate, then renormalizes
//so the coordinates sum to one again.
__device__ static PxVec3 modifyBarycentrics(PxVec3 bary)
{
	const PxReal bx = 3.0f * bary.x * bary.x - 2.0f * bary.x * bary.x * bary.x;
	const PxReal by = 3.0f * bary.y * bary.y - 2.0f * bary.y * bary.y * bary.y;
	const PxReal bz = 3.0f * bary.z * bary.z - 2.0f * bary.z * bary.z * bary.z;
	const PxReal invSum = 1.0f / (bx + by + bz);
	return PxVec3(bx * invSum, by * invSum, bz * invSum);
}
//The paper https://perso.telecom-paristech.fr/boubek/papers/PhongTessellation/PhongTessellation.pdf uses alpha = 0.75 but a slightly lower value
//seems to be better. 0.5 is too low, so 0.625 (middle of 0.5 and 0.75) was chosen.
//Evaluates Phong tessellation at barycentric coordinates uvw_ on triangle (a, b, c)
//with vertex normals (nA, nB, nC): the flat point is blended towards the barycentric
//combination of its projections onto the three vertex tangent planes, scaled by alpha.
//The displacement is softly limited (tanh) to halfSurfaceThickness; when
//halfSurfaceThickness <= 0 the displacement is zero and the flat point is returned.
//If normalsAreNormalized is false, the projections divide by the normals' squared magnitudes.
template<bool normalsAreNormalized = true>
__device__ static PxVec3 evaluatePointPhongInterpolation(const PxVec3& a, const PxVec3& b, const PxVec3& c, const PxVec3& uvw_,
	const PxVec3& nA, const PxVec3& nB, const PxVec3& nC, PxReal halfSurfaceThickness, PxReal alpha = 0.625f)
{
	//Cleaned up dead code "false ? modifyBarycentrics(uvw_) : uvw_"; the experimental
	//barycentric smoothing (modifyBarycentrics) remains disabled.
	const PxVec3 uvw = uvw_;
	//Flat (linear) interpolation of the triangle point.
	PxVec3 q = uvw.x * a + uvw.y * b + uvw.z * c;
	//Project q onto the tangent plane of each vertex.
	PxReal scale1 = (q - a).dot(nA);
	if (!normalsAreNormalized)
		scale1 /= nA.magnitudeSquared();
	PxVec3 projA = q - scale1 * nA;
	PxReal scale2 = (q - b).dot(nB);
	if (!normalsAreNormalized)
		scale2 /= nB.magnitudeSquared();
	PxVec3 projB = q - scale2 * nB;
	PxReal scale3 = (q - c).dot(nC);
	if (!normalsAreNormalized)
		scale3 /= nC.magnitudeSquared();
	PxVec3 projC = q - scale3 * nC;
	//uvw = Pow(uvw, 1.5); //Experimental
	//Blend the three plane projections with the same barycentric weights.
	PxVec3 qStar = uvw.x * projA + uvw.y * projB + uvw.z * projC;
	PxVec3 dir = qStar - q;
	PxReal offset = dir.normalizeSafe() * alpha;
	//Asymptotic function applied to offset such that the magnitude of offset cannot exceed halfSurfaceThickness
	PxReal ratio = 0.0f;
	if (halfSurfaceThickness > 0.0f)
	{
		ratio = offset / halfSurfaceThickness;
		ratio = tanhf(ratio); //Derivative at zero of tanh is one and tanh asymptotically reaches 1 - this is kind of a softMin(val, 1)
	}
	offset = ratio * halfSurfaceThickness;
	return q + offset * dir;
}
//Accumulates area-weighted face normals onto the guide mesh vertices (the cross
//product's length is twice the triangle area, giving the weighting). Intended to run
//after zeroNormals and before normalizeNormals. Uses atomics, so the floating-point
//summation order is nondeterministic.
extern "C" __global__
void normalVectorsAreaWeighted(
	PxTrimeshSkinningGpuData* data)
{
	PxTrimeshSkinningGpuData& d = data[blockIdx.y]; //TODO: Copy into shared memory
	const PxU32 stride = gridDim.x * blockDim.x;
	const PxU32 nbTriangles = d.nbGuideTriangles;
	for (PxU32 triIdx = blockIdx.x * blockDim.x + threadIdx.x; triIdx < nbTriangles; triIdx += stride)
	{
		const PxU32* tri = &d.guideTrianglesD[3 * triIdx];
		const PxVec3 v0 = d.guideVerticesD.at(tri[0]);
		const PxVec3 v1 = d.guideVerticesD.at(tri[1]);
		const PxVec3 v2 = d.guideVerticesD.at(tri[2]);
		//Unnormalized face normal; its magnitude provides the area weighting.
		const PxVec3 faceNormal = (v1 - v0).cross(v2 - v0);
		AtomicAdd3(d.guideNormalsD.atRef(tri[0]), faceNormal);
		AtomicAdd3(d.guideNormalsD.atRef(tri[1]), faceNormal);
		AtomicAdd3(d.guideNormalsD.atRef(tri[2]), faceNormal);
	}
}
//Resets all guide-vertex normals of this mesh to zero so they can be
//re-accumulated by normalVectorsAreaWeighted.
extern "C" __global__
void zeroNormals(
	PxTrimeshSkinningGpuData* data)
{
	//blockIdx.y selects which mesh of the batch this block works on.
	PxTrimeshSkinningGpuData& meshData = data[blockIdx.y]; //TODO: Copy into shared memory
	const PxU32 stride = gridDim.x * blockDim.x;
	const PxU32 vertexCount = meshData.guideVerticesD.count;
	//Grid-stride loop over all guide vertices.
	for (PxU32 i = blockIdx.x * blockDim.x + threadIdx.x; i < vertexCount; i += stride)
		meshData.guideNormalsD.atRef(i) = PxVec3(0.0f);
}
//Normalizes the accumulated guide-vertex normals in place.
extern "C" __global__
void normalizeNormals(
	PxTrimeshSkinningGpuData* data)
{
	//blockIdx.y selects which mesh of the batch this block works on.
	PxTrimeshSkinningGpuData& meshData = data[blockIdx.y]; //TODO: Copy into shared memory
	const PxU32 stride = gridDim.x * blockDim.x;
	const PxU32 vertexCount = meshData.guideVerticesD.count;
	//Grid-stride loop over all guide vertices.
	for (PxU32 i = blockIdx.x * blockDim.x + threadIdx.x; i < vertexCount; i += stride)
		meshData.guideNormalsD.atRef(i).normalizeSafe();
}
//Skins each embedded vertex to the guide triangle mesh: a Phong-interpolated surface
//point plus the stored offset along the interpolated normal. Barycentric coordinates
//may lie outside the triangle; the clamped and renormalized coordinates (uvwProj)
//drive the curved-surface evaluation, and the difference between the unclamped and
//clamped linear points is added back so out-of-triangle vertices still follow the mesh.
extern "C" __global__
void interpolateSkinnedClothVertices(
	PxTrimeshSkinningGpuData* data)
{
	//blockIdx.y selects which mesh of the batch this block works on.
	PxTrimeshSkinningGpuData& d = data[blockIdx.y]; //TODO: Copy into shared memory
	const PxU32 xDim = gridDim.x * blockDim.x;
	const PxU32 loopEnd = d.skinnedVerticesD.count;
	//Grid-stride loop over all skinned vertices.
	for (PxU32 threadIndex = blockIdx.x * blockDim.x + threadIdx.x; threadIndex < loopEnd; threadIndex += xDim)
	{
		PxTriangleMeshEmbeddingInfo info = d.skinningInfoPerVertexD[threadIndex];
		const PxU32* tri = &d.guideTrianglesD[3 * info.guideTriangleId];
		//The third barycentric coordinate is implicit.
		PxReal w = 1.0f - info.uv.x - info.uv.y;
		PxVec3 uvw(info.uv.x, info.uv.y, w);
		//Clamp negative coordinates and renormalize so uvwProj lies inside the triangle.
		PxVec3 uvwProj = uvw.maximum(PxVec3(0.0));
		PxReal sumProj = uvwProj.x + uvwProj.y + uvwProj.z;
		if(sumProj > 0.0f)
		{
			uvwProj *= 1.0f / sumProj;
		}
		PxVec3 nA = d.guideNormalsD.at(tri[0]);
		PxVec3 nB = d.guideNormalsD.at(tri[1]);
		PxVec3 nC = d.guideNormalsD.at(tri[2]);
		//Interpolated normal used for the stored per-vertex offset.
		PxVec3 normal = uvwProj.x * nA + uvwProj.y * nB + uvwProj.z * nC;
		normal.normalizeSafe();
		PxVec3 pointPhong = evaluatePointPhongInterpolation(
			d.guideVerticesD.at(tri[0]),
			d.guideVerticesD.at(tri[1]),
			d.guideVerticesD.at(tri[2]),
			uvwProj, nA, nB, nC, d.halfSurfaceThickness);
		//Linear interpolations with the raw and the clamped barycentric coordinates.
		PxVec3 pointUVW = uvw.x * d.guideVerticesD.at(tri[0]) + uvw.y * d.guideVerticesD.at(tri[1]) + uvw.z * d.guideVerticesD.at(tri[2]);
		PxVec3 pointUVWProj = uvwProj.x * d.guideVerticesD.at(tri[0]) + uvwProj.y * d.guideVerticesD.at(tri[1]) + uvwProj.z * d.guideVerticesD.at(tri[2]);
		//The offset could also be used to modify the alpha factor of the method EvaluatePoint. Or one could introduce an offset to EvaluatePoint that offsets along the same direction as the alpha factor
		PxVec3 offsetPoint = pointPhong + info.offsetAlongInterpolatedNormal * normal + pointUVW - pointUVWProj;
		d.skinnedVerticesD.atRef(threadIndex) = offsetPoint;
	}
}
//Skins embedded vertices using linear barycentric interpolation inside the guide
//tetrahedron each vertex is embedded in.
extern "C" __global__
void interpolateSkinnedSoftBodyVertices(
	PxTetmeshSkinningGpuData* data)
{
	//Uses linear barycentric interpolation - plenty of room for improvements
	//blockIdx.y selects which mesh of the batch this block works on.
	PxTetmeshSkinningGpuData& meshData = data[blockIdx.y]; //TODO: Copy into shared memory
	const PxU32 stride = gridDim.x * blockDim.x;
	const PxU32 numSkinned = meshData.skinnedVerticesD.count;
	//Grid-stride loop over all skinned vertices.
	for (PxU32 i = blockIdx.x * blockDim.x + threadIdx.x; i < numSkinned; i += stride)
	{
		const PxTetrahedronMeshEmbeddingInfo& info = meshData.skinningInfoPerVertexD[i];
		const PxU32* tet = &meshData.guideTetrahedraD[4 * info.guideTetrahedronId];
		//The fourth barycentric weight is implicit.
		const PxReal lastWeight = 1.0f - info.uvw.x - info.uvw.y - info.uvw.z;
		//Accumulate in the same order as a single left-associative sum expression.
		PxVec3 skinned = info.uvw.x * meshData.guideVerticesD.at(tet[0]);
		skinned += info.uvw.y * meshData.guideVerticesD.at(tet[1]);
		skinned += info.uvw.z * meshData.guideVerticesD.at(tet[2]);
		skinned += lastWeight * meshData.guideVerticesD.at(tet[3]);
		meshData.skinnedVerticesD.atRef(i) = skinned;
	}
}

View File

@@ -0,0 +1,55 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef __CU_UTILS_CUH__
#define __CU_UTILS_CUH__
#include "foundation/PxVec3.h"
#include "foundation/PxVec4.h"
namespace physx
{
	//Conversion helpers between CUDA's float4 and PhysX vector types.
	__device__ PX_FORCE_INLINE PxVec3 PxLoad3(const float4& v) { float4 tmp = v; return PxVec3(tmp.x, tmp.y, tmp.z); }
	//Loads the xyz components into a PxVec3 and writes the w component to the out parameter.
	__device__ PX_FORCE_INLINE PxVec3 PxLoad3(const float4& v, float& w) { float4 tmp = v; w = tmp.w; return PxVec3(tmp.x, tmp.y, tmp.z); }
	__device__ PX_FORCE_INLINE PxVec4 PxLoad4(const float4& v) { float4 tmp = v; return PxVec4(tmp.x, tmp.y, tmp.z, tmp.w); }
	//Stores a PxVec3 as float4 with the w component zeroed.
	__device__ PX_FORCE_INLINE float4 PxSave3(const PxVec3& v) { return float4({ v.x, v.y, v.z, 0 }); }
	__device__ PX_FORCE_INLINE float4 PxSave4(const PxVec4& v) { return float4({ v.x, v.y, v.z, v.w }); }
	//Bit helpers. Only works if val > 0
	__device__ PX_FORCE_INLINE int lowestSetIndex(int val) { return __ffs(val) - 1; }
	__device__ PX_FORCE_INLINE int highestSetIndex(int val) { return 31 - __clz(val); }
	//Isolates the lowest set bit of val.
	__device__ PX_FORCE_INLINE int lowestSetBit(int val) { return val & -val; }
	__device__ PX_FORCE_INLINE bool testBit(int map, int index) { return (map & 1 << index) != 0; }
	//Returns the index of the lowest set bit. Returns 0xFFffFFff if no bit is set
	__device__ PX_FORCE_INLINE PxU32 lowestSetIndex(PxU32 val) { return __ffs(val) - 1; }
	//Clears the lowest set bit of val.
	__device__ PX_FORCE_INLINE PxU32 clearLowestSetBit(PxU32 val) { return val & (val - 1); }
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,48 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "PxgBroadPhase.h"
namespace physx
{
	//Kernel module registration entry points defined in the CUDA-compiled translation units.
	extern "C" void initCommonKernels0();
	extern "C" void initCommonKernels1();
	extern "C" void initCommonKernels2();
	//Forces linkage of the PhysXCommonGpu kernel modules when building as a static
	//library; compiles to a no-op in the GPU-exports (dynamic) configuration.
	void createPxgCommon()
	{
#if !PX_PHYSX_GPU_EXPORTS
		//this call is needed to force PhysXCommonGpu linkage as Static Library!
		initCommonKernels0();
		initCommonKernels1();
		initCommonKernels2();
#endif
	}
}

View File

@@ -0,0 +1,143 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include <stdio.h>
#include "CudaKernelWrangler.h"
#include "PxgCopyManager.h"
#include "cudamanager/PxCudaContextManager.h"
#include "PxgKernelIndices.h"
#include "PxgHeapMemAllocator.h"
#include "PxgCudaUtils.h"
#include "PxgCommonDefines.h"
#include "cudamanager/PxCudaContext.h"
#define DEBUG_COPY_MANAGER 0
using namespace physx;
//The descriptor queue is backed by the mapped-memory allocator, so the copy kernel
//can read it through a device alias without a separate upload (see dispatchCopy).
PxgCopyManager::PxgCopyManager(PxgHeapMemoryAllocatorManager* heapMemoryManager) :
	mDescriptorsQueue(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
	mNumDescriptors(0),
	mFinishedEvent(0),
	mEventRecorded(false),
	mHeapMemoryManager(heapMemoryManager)
{
}
//Creates the CUDA event used to track completion of the dispatched copy kernel.
void PxgCopyManager::createFinishedEvent(PxCudaContext* cudaContext)
{
	cudaContext->eventCreate(&mFinishedEvent, CU_EVENT_DEFAULT);
}
//Destroys the completion event created by createFinishedEvent.
void PxgCopyManager::destroyFinishedEvent(PxCudaContext* cudaContext)
{
	cudaContext->eventDestroy(mFinishedEvent);
}
//Queues a copy descriptor for the next dispatchCopy() call.
//Queue buffer layout: descriptor array, padded up to a 256-byte boundary, followed
//by a run-sum (prefix sum) array of PxU32 - one entry per descriptor.
//NOTE(review): no synchronization here - appears to assume a single producer thread;
//confirm against callers.
void PxgCopyManager::pushDeferredHtoD(const CopyDesc& desc)
{
	PxU32 newSize = (mNumDescriptors + 1) * sizeof(CopyDesc);
	newSize = (newSize + 255) & ~255; //round up to ensure 256-bytes alignment of the following array
	newSize += (mNumDescriptors + 1) * sizeof(PxU32); //run-sum array
	if (newSize > mDescriptorsQueue.size())
	{
		//Grow geometrically to amortize reallocation cost.
		mDescriptorsQueue.resize(newSize * 2);
	}
	CopyDesc* descsCPU = reinterpret_cast<CopyDesc*>(mDescriptorsQueue.begin());
	descsCPU[mNumDescriptors++] = desc;
}
//Non-blocking query: returns true once the copy recorded on mFinishedEvent has
//completed (also true when nothing was ever recorded on the event).
bool PxgCopyManager::hasFinishedCopying(PxCudaContext* cudaContext) const
{
	CUresult result = cudaContext->eventQuery(mFinishedEvent);
	//Any result other than CUDA_SUCCESS / CUDA_ERROR_NOT_READY indicates a real error.
	PX_ASSERT(result == CUDA_SUCCESS || result == CUDA_ERROR_NOT_READY);
	return result != CUDA_ERROR_NOT_READY;
}
//Blocks until the last dispatched copy has finished (only if an event was actually
//recorded), then clears the descriptor queue for reuse.
void PxgCopyManager::waitAndReset(PxCudaContext* cudaContext)
{
	if(mEventRecorded)
	{
		CUresult result = cudaContext->eventSynchronize(mFinishedEvent);
		PX_UNUSED(result);
		PX_ASSERT(result == CUDA_SUCCESS);
	}
	resetUnsafe();
}
//Launches the balanced memcopy kernel over all queued descriptors on 'stream' and
//records mFinishedEvent so completion can be polled (hasFinishedCopying) or awaited
//(waitAndReset). Must not be called while a previous dispatch is still in flight.
void PxgCopyManager::dispatchCopy(CUstream stream, PxCudaContextManager* cudaContextManager, KernelWrangler* kernelWrangler)
{
	PxCudaContext* cudaContext = cudaContextManager->getCudaContext();
	PX_ASSERT(hasFinishedCopying(cudaContext));
	PxU32 numDescs = mNumDescriptors;
	mEventRecorded = false;
	//Nothing queued: skip the launch and leave the event unrecorded.
	if (!numDescs)
		return;
	PxU32 numWarpsPerBlock = COPY_KERNEL_WARPS_PER_BLOCK;
	//One block per descriptor.
	PxU32 numBlocks = numDescs;
	//Pre-SM30 devices get extra dynamic shared memory (presumably a fallback for
	//missing warp intrinsics - confirm against the kernel implementation).
	PxU32 numExtraShared = cudaContextManager->supportsArchSM30() ? 0 : numWarpsPerBlock * WARP_SIZE * sizeof(PxU32);
	CUfunction kernelFunction = kernelWrangler->getCuFunction(PxgKernelIds::MEM_COPY_BALANCED_KERNEL);
	{
		//The queue lives in mapped memory; translate the host pointer to its device alias.
		CopyDesc* descsGPU = reinterpret_cast<CopyDesc*>(getMappedDevicePtr(cudaContext, mDescriptorsQueue.begin()));
		PxCudaKernelParam kernelParams[] =
		{
			PX_CUDA_KERNEL_PARAM(descsGPU),
			PX_CUDA_KERNEL_PARAM(numDescs)
		};
		CUresult result = cudaContext->launchKernel(kernelFunction, numBlocks, 1, 1, WARP_SIZE, numWarpsPerBlock, 1, numExtraShared, stream, kernelParams, sizeof(kernelParams), 0, PX_FL);
		if(result != CUDA_SUCCESS)
			printf("GPU MemCopyBalanced fail to launch kernel!!\n");
#if DEBUG_COPY_MANAGER
		result = cudaContext->streamSynchronize(stream);
		if (result != CUDA_SUCCESS)
			printf("GPU MemCopyBalanced died!!\n");
#endif
	}
	CUresult result = cudaContext->eventRecord(mFinishedEvent, stream);
	mEventRecorded = true;
	PX_UNUSED(result);
	PX_ASSERT(result == CUDA_SUCCESS);
}

View File

@@ -0,0 +1,164 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "PxgCudaBuffer.h"
#include "foundation/PxMath.h"
#include "foundation/PxAssert.h"
#include "cudamanager/PxCudaContext.h"
#include "PxPhysXGpu.h"
#include "PxsKernelWrangler.h"
#include "common/PxPhysXCommonConfig.h"
#include "PxgMemoryManager.h"
#define MEMCHECK_SUPPORT 0
using namespace physx;
//Grow-only allocation: when current capacity is below 'size', frees the old buffer
//(contents are discarded - use allocateCopyOldDataAsync to preserve data) and
//allocates max(size, 2 * old capacity). No-op when the buffer is already big enough.
void PxgCudaBuffer::allocate(const PxU64 size, const char* filename, PxI32 line)
{
	PX_ASSERT(mHeapMemoryAllocator);
	if (mSize < size)
	{
		if (mSize > 0 && mPtr)
		{
#if MEMCHECK_SUPPORT
			//MEMCHECK path bypasses the heap allocator and frees directly via the context manager.
			PX_UNUSED(filename);
			PX_UNUSED(line);
			PxgCudaAllocatorCallbackBase* alloc = reinterpret_cast<PxgCudaAllocatorCallbackBase*>(mHeapMemoryAllocator->getAllocator());
			PxU8* ptr = reinterpret_cast<PxU8*>(mPtr);
			alloc->mContextManager->freeDeviceBuffer(ptr);
			mPtr = NULL;
#else
			mHeapMemoryAllocator->deallocate(reinterpret_cast<void*>(mPtr));
#endif
		}
		//Allocate either double current size or the requested size, depending on which is larger
		mSize = PxMax(size, mSize * 2);
#if MEMCHECK_SUPPORT
		PxgCudaAllocatorCallbackBase* alloc = reinterpret_cast<PxgCudaAllocatorCallbackBase*>(mHeapMemoryAllocator->getAllocator());
		mPtr = CUdeviceptr(alloc->mContextManager->allocDeviceBuffer<PxU8>(PxU32(mSize)));
		PX_ASSERT(mPtr);
#else
		mPtr = reinterpret_cast<CUdeviceptr>(mHeapMemoryAllocator->allocate(mSize, mStatGroup, filename, line));
#endif
#if PX_STOMP_ALLOCATED_MEMORY
		//Fill the new buffer with a 0xcd marker pattern to surface reads of uninitialized memory.
		if (mPtr)
		{
			PxgCudaAllocatorCallbackBase* alloc = reinterpret_cast<PxgCudaAllocatorCallbackBase*>(mHeapMemoryAllocator->getAllocator());
			PxCudaContextManager* ccm = alloc->mContextManager;
			if (ccm)
			{
				PxScopedCudaLock scl(*ccm);
				CUresult result = ccm->getCudaContext()->memsetD8(mPtr, static_cast<unsigned char>(0xcd), mSize);
				if (result != 0)
					PX_ASSERT(result == 0);
			}
			else
			{
				PxGetFoundation().error(physx::PxErrorCode::eDEBUG_WARNING, PX_FL,
					"Not possible to stomp PxgCudaBufferMemory because not cuda context manager is available.");
			}
		}
#endif
	}
}
//Grow-only allocation that preserves existing contents: when capacity is
//insufficient, allocates max(size, 2 * old capacity), schedules an async
//device-to-device copy of the old data on 'stream', and defer-frees the old
//buffer so it stays valid until in-flight work (including the copy) completes.
void PxgCudaBuffer::allocateCopyOldDataAsync(const PxU64 size, PxCudaContext* cudaContext, CUstream stream, const char* filename, PxI32 line)
{
	PX_ASSERT(mHeapMemoryAllocator);
	PxU64 oldSize = mSize;
	//Allocate either double current size or the requested size, depending on which is larger
	mSize = (oldSize < size) ? PxMax(size, mSize * 2) : mSize;
	if (oldSize < size)
	{
		CUdeviceptr oldPtr = mPtr;
#if MEMCHECK_SUPPORT
		//MEMCHECK path allocates directly via the context manager.
		PX_UNUSED(filename);
		PX_UNUSED(line);
		PxgCudaAllocatorCallbackBase* alloc = reinterpret_cast<PxgCudaAllocatorCallbackBase*>(mHeapMemoryAllocator->getAllocator());
		mPtr = CUdeviceptr(alloc->mContextManager->allocDeviceBuffer<PxU8>(PxU32(mSize)));
		PX_ASSERT(mPtr);
#else
		mPtr = reinterpret_cast<CUdeviceptr>(mHeapMemoryAllocator->allocate(mSize, mStatGroup, filename, line));
#endif
		if (oldSize > 0 && oldPtr)
		{
			cudaContext->memcpyDtoDAsync(mPtr, oldPtr, oldSize, stream);
			//Defer deletion. This makes sure nothing else gets this memory until after the memcopy has completed
#if MEMCHECK_SUPPORT
			//Since MEMCHECK_SUPPORT is only active for invalid memory access debugging, let it leak for now
#else
			mHeapMemoryAllocator->deallocateDeferred(reinterpret_cast<void*>(oldPtr));
#endif
		}
	}
}
//Releases the device buffer (if any) and resets pointer and size to zero.
void PxgCudaBuffer::deallocate()
{
	PX_ASSERT(mHeapMemoryAllocator);
	if (mSize && mPtr)
	{
#if MEMCHECK_SUPPORT
		//MEMCHECK path frees directly via the context manager.
		PxgCudaAllocatorCallbackBase* alloc = reinterpret_cast<PxgCudaAllocatorCallbackBase*>(mHeapMemoryAllocator->getAllocator());
		PxU8* ptr = reinterpret_cast<PxU8*>(mPtr);
		alloc->mContextManager->freeDeviceBuffer(ptr);
#else
		mHeapMemoryAllocator->deallocate(reinterpret_cast<void*>(mPtr));
#endif
		mPtr = 0;
		mSize = 0;
	}
}
//Queues the buffer for deferred release via the heap allocator.
//NOTE: unlike deallocate(), mPtr and mSize are not reset here.
void PxgCudaBuffer::deallocateDeferred()
{
#if MEMCHECK_SUPPORT
	//Since MEMCHECK_SUPPORT is only active for invalid memory access debugging, let it leak for now
#else
	PX_ASSERT(mHeapMemoryAllocator);
	if (mSize && mPtr)
		mHeapMemoryAllocator->deallocateDeferred(reinterpret_cast<void*>(mPtr));
#endif
}
//Releases any remaining device memory on destruction.
PxgCudaBuffer::~PxgCudaBuffer()
{
	deallocate();
}

View File

@@ -0,0 +1,244 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "PxgCudaMemoryAllocator.h"
#include "foundation/PxErrors.h"
#include "foundation/PxMath.h"
#include "foundation/PxPreprocessor.h"
#if PX_LINUX && PX_CLANG
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdocumentation"
#pragma clang diagnostic ignored "-Wdisabled-macro-expansion"
#endif
#include <cuda.h>
#if PX_LINUX && PX_CLANG
#pragma clang diagnostic pop
#endif
#include "foundation/PxAllocator.h"
#include "foundation/PxAtomic.h"
#include "foundation/PxAssert.h"
#include "cudamanager/PxCudaContextManager.h"
#include "cudamanager/PxCudaContext.h"
#include "common/PxPhysXCommonConfig.h"
using namespace physx;
// memory tracking.
#if PX_DEBUG
#include "PxgMemoryTracker.h"
static MemTracker deviceMemTracker;
static MemTracker hostMemTracker;
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//Allocates page-locked (pinned) host memory that is device-mapped and portable
//across CUDA contexts. Returns NULL (after reporting a warning) on failure.
void* physx::PxgPinnedMemoryAllocate(PxCudaContext& cudaContext, size_t size, const char* filename, PxI32 line)
{
	PxU8* ptr = NULL;
	CUresult result = cudaContext.memHostAlloc((void**)&ptr, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_PORTABLE);
	if (result != CUDA_SUCCESS || !ptr)
	{
		PxGetFoundation().error(PX_WARN, PX_FL, "Failed to allocate pinned memory.");
		return NULL;
	}
	PX_ASSERT((size_t)(ptr) % 256 == 0); //alignment check. I believe it should be guaranteed
#if PX_STOMP_ALLOCATED_MEMORY
	// fill pinned memory with markers to catch uninitialized memory earlier.
	// use alternating pattern to avoid pairs of start, end values to cancel each other out.
	// Note: only size/4 whole 32-bit words are filled; up to 3 trailing bytes stay untouched.
	PxU32 pat[2] = { 0xcdcdcdcd, 0xdcdcdcdc };
	for (size_t i = 0; i < (size/4); ++i)
		reinterpret_cast<PxU32*>(ptr)[i] = pat[i % 2];
#endif
#if PX_DEBUG
	hostMemTracker.registerMemory(ptr, false, size, filename, line);
#else
	PX_UNUSED(filename);
	PX_UNUSED(line);
#endif
	return ptr;
}
//Frees pinned host memory allocated by PxgPinnedMemoryAllocate. NULL is a no-op.
void physx::PxgPinnedMemoryDeallocate(PxCudaContext& cudaContext, void* ptr)
{
	if (ptr == NULL)
		return;
	CUresult result = cudaContext.memFreeHost(ptr);
	PX_UNUSED(result);
	PX_ASSERT(result == CUDA_SUCCESS);
#if PX_DEBUG
	hostMemTracker.unregisterMemory(ptr, false);
#endif
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//Allocates device memory, preferring a user-supplied PxDeviceAllocatorCallback when
//one is installed, otherwise calling the driver's memAlloc directly. On failure the
//context is put into abort mode and NULL is returned; returns NULL immediately when
//the context is already aborting.
void* physx::PxgCudaDeviceMemoryAllocate(PxCudaContext& cudaContext, size_t size, const char* filename, PxI32 line)
{
	if (cudaContext.isInAbortMode())
		return NULL;
	PxDeviceAllocatorCallback* callback = cudaContext.getAllocatorCallback();
	if (callback)
	{
		void* ptr = NULL;
		bool result = callback->memAlloc(&ptr, size);
		if (!result)
		{
			cudaContext.setAbortMode(true);
			PxGetFoundation().error(PxErrorCode::eOUT_OF_MEMORY, PX_FL, "PxDeviceAllocatorCallback failed to allocate memory %zu bytes!", size);
			return NULL;
		}
#if PX_DEBUG
		//result is always true at this point (failure returned early above).
		if (result)
			deviceMemTracker.registerMemory(ptr, true, size, filename, line);
#else
		PX_UNUSED(filename);
		PX_UNUSED(line);
#endif
		return ptr;
	}
	else
	{
		CUdeviceptr ptr = 0;
		CUresult result = cudaContext.memAlloc(&ptr, size);
		//Driver allocations are expected to be at least 128-byte aligned.
		PX_ASSERT((ptr & 127) == 0);
		if (result != CUDA_SUCCESS)
		{
			cudaContext.setAbortMode(true);
			PxGetFoundation().error(PxErrorCode::eOUT_OF_MEMORY, PX_FL, "PxgCudaDeviceMemoryAllocator failed to allocate memory %zu bytes! Result = %i", size, result);
			return NULL;
		}
#if PX_DEBUG
		//result is always CUDA_SUCCESS at this point (failure returned early above).
		if (result == CUDA_SUCCESS)
			deviceMemTracker.registerMemory(reinterpret_cast<void*>(ptr), true, size, filename, line);
#else
		PX_UNUSED(filename);
		PX_UNUSED(line);
#endif
		return reinterpret_cast<void*>(ptr);
	}
}
//Frees device memory through the installed allocator callback when present,
//otherwise via the driver's memFree. Failures are reported as warnings, not fatal.
void physx::PxgCudaDeviceMemoryDeallocate(PxCudaContext& cudaContext, void* ptr)
{
	PxDeviceAllocatorCallback* callback = cudaContext.getAllocatorCallback();
	if (callback)
	{
		bool result = callback->memFree(ptr);
		if (!result)
			PxGetFoundation().error(PX_WARN, PX_FL, "PxDeviceAllocatorCallback fail to deallocate memory!!\n");
	}
	else
	{
		CUresult result = cudaContext.memFree(reinterpret_cast<CUdeviceptr>(ptr));
		if (result != CUDA_SUCCESS)
			PxGetFoundation().error(PX_WARN, PX_FL, "PxgCudaDeviceMemoryDeallocate fail to deallocate memory!! Result = %i\n", result);
	}
#if PX_DEBUG
	if (ptr)
		deviceMemTracker.unregisterMemory(ptr, true);
#endif
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//Reserves an initial pinned-memory arena of 'size' bytes.
PxgPinnedHostLinearMemoryAllocator::PxgPinnedHostLinearMemoryAllocator(PxCudaContextManager* contextManager, const PxU64 size) :
	mCudaContext(contextManager->getCudaContext())
{
	reserve(size);
}
//Frees the pinned arena on destruction.
PxgPinnedHostLinearMemoryAllocator::~PxgPinnedHostLinearMemoryAllocator()
{
	deallocate();
}
//Grows the arena to at least 'size' bytes (growing by at least 1.5x the old
//capacity). Existing contents are NOT preserved; all previously returned
//allocations are invalidated.
void PxgPinnedHostLinearMemoryAllocator::reserveAndGrow(const PxU64 size)
{
	// only reallocate when the new size is larger than what we had before.
	if (size > mTotalSize)
	{
		deallocate();
		const PxU64 newSize = PxMax(size, PxU64(PxCeil(mTotalSize * 1.5f)));
		mStart = reinterpret_cast<PxU8*>(PxgPinnedMemoryAllocate(*mCudaContext, newSize, PX_FL));
		mTotalSize = newSize;
		mCurrentSize = 0;
	}
}
//Allocates the arena at exactly 'size' bytes and resets the bump offset.
//NOTE(review): does not free a previously reserved buffer - looks intended for
//one-time use at construction; confirm before calling twice.
void PxgPinnedHostLinearMemoryAllocator::reserve(const PxU64 size)
{
	//allocate() reinterprets mCurrentSize as a 64-bit atomic counter.
	PX_COMPILE_TIME_ASSERT(sizeof(size_t) == sizeof(PxU64));
	mStart = reinterpret_cast<PxU8*>(PxgPinnedMemoryAllocate(*mCudaContext, size, PX_FL));
	mTotalSize = size;
	mCurrentSize = 0;
}
//Rewinds the bump offset; all previously returned allocations become invalid.
void PxgPinnedHostLinearMemoryAllocator::reset()
{
	mCurrentSize = 0;
}
void* PxgPinnedHostLinearMemoryAllocator::allocate(const PxU64 size, const PxU64 alignment)
{
if(size > 0)
{
const PxI64 alignedSize = PxI64(size + alignment);
PxU64 baseOffset = PxU64(physx::PxAtomicAdd(reinterpret_cast<PxI64*>(&mCurrentSize), alignedSize));
if (baseOffset > mTotalSize)
{
PxGetFoundation().error(PxErrorCode::eOUT_OF_MEMORY, PX_FL, "PxgPinnedHostLinearMemoryAllocator: overflowing initial allocation size, increase capacity to at least %u\n", baseOffset);
return NULL;
}
// this takes baseOffset again because of the atomic.
uintptr_t startAddress = (uintptr_t(mStart)) + (baseOffset - alignedSize);
startAddress = (startAddress + alignment-1) & (~(alignment - 1));
return (void*)startAddress;
}
return NULL;
}
//Frees the pinned arena buffer if one was successfully reserved.
void PxgPinnedHostLinearMemoryAllocator::deallocate()
{
	if(mTotalSize && mStart)
		PxgPinnedMemoryDeallocate(*mCudaContext, mStart);
}

View File

@@ -0,0 +1,525 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "PxgHeapMemAllocator.h"
#include "foundation/PxAllocator.h"
#include "foundation/PxMath.h"
#include "common/PxProfileZone.h"
#include "PxsMemoryManager.h"
using namespace physx;
#define EXCEPTIONAL_ALLOC_FACTOR 2
bool Block::isValid()
{
BlockHeader* current = mStartHeader;
while (current)
{
BlockHeader* next = current->mNext;
if (next)
{
if ((current->mRootIndex > next->mRootIndex) || ((current->mRootIndex == next->mRootIndex) && (current->mOffset >= next->mOffset)))
{
return false;
}
else
{
current = next;
next = current->mNext;
}
}
else
{
current = NULL;
}
}
return true;
}
//Inserts a free-block header into this size-class's doubly linked list, keeping
//the list sorted by (rootIndex, offset).
void Block::insertBlockHeader(const PxU32 rootIndex, const PxU32 offset, PxPool<BlockHeader>& pool)
{
	PX_PROFILE_ZONE("Block::insertBlockHeader", 0);
	BlockHeader* newHeader = pool.allocate();
	newHeader->initialize(rootIndex, offset);
	if (mStartHeader)
	{
		//Find the first header that sorts after (rootIndex, offset).
		BlockHeader* header = mStartHeader;
		while (header && ((header->mRootIndex < rootIndex) || (header->mRootIndex == rootIndex && header->mOffset < offset)))
		{
			header = header->mNext;
		}
		//if we found a header, we need to insert a new header in front of the found header
		if (header)
		{
			BlockHeader* prevHeader = header->mPrev;
			newHeader->mNext = header;
			newHeader->mPrev = prevHeader;
			if (prevHeader)
			{
				prevHeader->mNext = newHeader;
			}
			else
			{
				//Inserting at the front of the list.
				mStartHeader = newHeader;
			}
			header->mPrev = newHeader;
		}
		else
		{
			//if we didn't found an appropriated header, we need to insert this new header at the end of the linked list
			mEndHeader->mNext = newHeader;
			newHeader->mPrev = mEndHeader;
			mEndHeader = newHeader;
		}
	}
	else
	{
		//Empty list: the new header becomes both head and tail.
		mStartHeader = newHeader;
		mEndHeader = newHeader;
	}
	PX_ASSERT(isValid());
	mHeaderSizes++;
}
//Unlinks 'header' from the doubly linked free-list, patching the list head/tail
//pointers when the node sits at either end, then returns the node to the pool.
//(Locals renamed: the original called them mPrev/mNext, which reads like members.)
void Block::removeBlockHeader(BlockHeader* header, PxPool<BlockHeader>& pool)
{
	BlockHeader* prev = header->mPrev;
	BlockHeader* next = header->mNext;
	if (prev)
		prev->mNext = next;
	else
		mStartHeader = next;
	if (next)
		next->mPrev = prev;
	else
		mEndHeader = prev;
	pool.deallocate(header);
	mHeaderSizes--;
}
//Linear scan of the free-list for the header matching both the given offset and
//root index. Returns NULL when no such header exists.
BlockHeader* Block::findBuddy(const PxU32 offsetToFind, const PxU32 rootIndex)
{
	for (BlockHeader* candidate = mStartHeader; candidate; candidate = candidate->mNext)
	{
		if (candidate->mOffset == offsetToFind && candidate->mRootIndex == rootIndex)
			return candidate;
	}
	return NULL;
}
//Heap built on top of 'allocator'. 'byteSize' is the size of each root allocation
//and must be a power of two >= 128 (128 bytes is the smallest block size).
PxgHeapMemoryAllocator::PxgHeapMemoryAllocator(const PxU32 byteSize, PxVirtualAllocatorCallback* allocator) : mBlockHeaderPool(PxAllocatorTraits<BlockHeader>::Type(), 128)
{
	PX_ASSERT(PxIsPowerOfTwo(byteSize));
	PX_ASSERT(byteSize >= 128);
	mAllocationSize = byteSize;
	mAllocator = allocator;
	PX_PROFILE_ZONE("PxgHeapMemoryAllocator::initialization", 0);
	void* memory = mAllocator->allocate(mAllocationSize, 0, PX_FL);
	// AD: the allocation above can fail.
	if (memory)
	{
		mRoots.pushBack(memory);
		mTotalMem = mAllocationSize;
		initializeBlocks(0);
	}
	else
	{
		//Leave the heap empty; no free blocks are registered.
		mTotalMem = 0;
		mBitfield = 0;
	}
}
PxgHeapMemoryAllocator::~PxgHeapMemoryAllocator()
{
	if (!mAllocator)
		return;

	// Release every page owned by the buddy heap...
	for (PxU32 r = 0; r < mRoots.size(); ++r)
		mAllocator->deallocate(mRoots[r]);

	// ...and every oversized allocation that bypassed it (freed entries have
	// a NULL address).
	for (PxU32 e = 0; e < mExceptionalAllocs.size(); ++e)
	{
		void* addr = mExceptionalAllocs[e].address;
		if (addr)
			mAllocator->deallocate(addr);
	}

	mRoots.clear();
	mExceptionalAllocs.clear();
	mAllocator = NULL;
}
void PxgHeapMemoryAllocator::initializeBlocks(const PxU32 rootIndex)
{
	// Calculate how many block slots we need. The smallest block size in a
	// slot is 128 bytes (128 = pow(2, 7)), so slot i manages blocks of size
	// 2^(i + 7) and the largest slot covers the whole page (mAllocationSize).
	const PxU32 highestBit = PxHighestSetBit(mAllocationSize) + 1 - 7;
	mBlocks.resize(highestBit);
	// Initialize every slot with its block size and its own index.
	for (PxU32 i = 0; i < highestBit; ++i)
	{
		mBlocks[i].mBlockSize = 1u << (i + 7u);
		mBlocks[i].mBlockIndex = i;
	}
	// Initially only the largest slot has a free block: the entire page,
	// starting at offset 0 of the given root.
	mBitfield = (1u << (highestBit - 1u));
	mBlocks[highestBit - 1].insertBlockHeader(rootIndex, 0, mBlockHeaderPool);
}
/**
Finds the smallest non-empty block slot whose blocks are at least as large as the
requested slot, growing the heap with a fresh page when none exists.

\param blockIndex		slot index of the requested block size (block size == 1 << (blockIndex + 7)).
\param allocationSize	requested size in bytes; must be a power of two.
\param file				origin file of the allocation, forwarded to the backing allocator.
\param line				origin line of the allocation, forwarded to the backing allocator.
\return the index of a slot holding at least one free block, or PXG_INVALID_BLOCK
		when a new page was needed and the backing allocator failed.
*/
PxU32 PxgHeapMemoryAllocator::getNextFreeBlock(const PxU32 blockIndex, const PxU32 allocationSize, const char* file, const int line)
{
	PX_ASSERT(PxIsPowerOfTwo(allocationSize));
	// Keep only the bits of slots >= blockIndex; those slots can service this
	// request. Use unsigned shifts: a signed "1 << blockIndex" is undefined
	// behavior for blockIndex == 31, and the rest of this file uses 1u.
	const PxU32 bits = mBitfield & (~((1u << blockIndex) - 1u));
	//no bigger slot available
	if (bits == 0)
	{
		PX_PROFILE_ZONE("PxgHeapMemoryAllocator::getNextFreeBlock", 0);
		//we can't find any free blocks, we allocate more memory
		const PxU32 maxAllocationSize = PxMax(allocationSize, mAllocationSize);
		void* memorys = mAllocator->allocate(maxAllocationSize, 0, file, line);
		if (!memorys)
			return PXG_INVALID_BLOCK;	// propagate backing-allocator failure
		mRoots.pushBack(memorys);
		mTotalMem += maxAllocationSize;
		//if the allocationSize is bigger than the default allocation size(mAllocationSize), we need to increase
		//the block slots
		if (blockIndex >= mBlocks.size())
		{
			PxU32 oldSize = mBlocks.size();
			mBlocks.resize(blockIndex + 1);
			for (PxU32 i = oldSize; i <= blockIndex; ++i)
			{
				//blockSize is power of two
				mBlocks[i].mBlockSize = 1u << (i + 7u);
				mBlocks[i].mBlockIndex = i;
			}
		}
		// Register the whole new page as a single free block in the slot
		// matching its size.
		const PxU32 newBlockIndex = PxU32(PxMax(PxI32(PxHighestSetBit(maxAllocationSize)) - 7, 0));
		const PxU32 rootIndex = mRoots.size() - 1;
		Block* block = &mBlocks[newBlockIndex];
		block->insertBlockHeader(rootIndex, 0, mBlockHeaderPool);
		mBitfield = mBitfield | (1u << newBlockIndex);
		return newBlockIndex;
	}
	else
	{
		// The lowest set bit is the smallest suitable slot.
		return PxLowestSetBit(bits);
	}
}
/**
Allocates byteSize bytes from the buddy heap.

Oversized requests (where byteSize * EXCEPTIONAL_ALLOC_FACTOR exceeds the page
size) bypass the buddy system and go straight to the backing allocator.
Otherwise the request is rounded up to a power of two and served from the
matching block slot, splitting a larger free block when necessary.

\param byteSize	requested size in bytes; 0 returns NULL.
\param group	statistics group; must be < PxsHeapStats::eHEAPSTATS_COUNT.
\param file		origin file of the allocation, used for tracking.
\param line		origin line of the allocation, used for tracking.
\return the allocated address, or NULL when the backing allocator failed.
*/
void* PxgHeapMemoryAllocator::allocate(const size_t byteSize, const int group, const char* file, const int line)
{
	if (byteSize == 0)
		return NULL;
	PX_PROFILE_ZONE("PxgHeapMemoryAllocator::allocate", 0);
	PxMutex::ScopedLock myLock(mMutex);
	PX_ASSERT(group >= 0 && group < PxsHeapStats::eHEAPSTATS_COUNT);
	// Account the requested size against the caller's statistics group.
	mHeapStats.stats[group] += byteSize;
	if ((byteSize * EXCEPTIONAL_ALLOC_FACTOR) > mAllocationSize)
	{
		PX_PROFILE_ZONE("PxgHeapMemoryAllocator::exceptionalAlloc", 0);
		//We are allocating over half the size of a page. In this case, we'll use a whole page so we might
		//as well just allocate an exceptional block for this using the built-in allocator...
		void* memorys = mAllocator->allocate(byteSize, 0, file, line);
		if (!memorys)
			return NULL;
		mTotalMem += byteSize;
		// Exceptional allocations are keyed in the hash map with the invalid
		// block index; the AllocationValue's root-index field then stores the
		// index into mExceptionalAllocs instead.
		PxU32 index = mExceptionalAllocs.size();
		ExceptionalAlloc alloc;
		alloc.address = memorys;
		alloc.size = byteSize;
		mExceptionalAllocs.pushBack(alloc);
		mHashMap.insert(memorys, AllocationValue(PXG_INVALID_BLOCK, index, byteSize, group));
#if PX_DEBUG
		mMemTracker.registerMemory(reinterpret_cast<void*>(memorys), true, byteSize, file, line);
#endif
		return memorys;
	}
	// Round the request up to a power of two so it maps onto a block slot.
	const PxU32 maxSize = PxIsPowerOfTwo(PxU32(byteSize)) ? PxU32(byteSize) : PxNextPowerOfTwo(PxU32(byteSize));
	//get the slot index (block size == 1 << (blockIndex + 7))
	const PxU32 blockIndex = PxU32(PxMax(PxI32(PxHighestSetBit(maxSize)) - 7, 0));
	//Reserve enough memory for this block if it is needed
	const PxU32 freeBlockIndex = getNextFreeBlock(blockIndex, maxSize, file, line);
	// if the allocation of the free block failed, make sure we pass the error along.
	if (freeBlockIndex == PXG_INVALID_BLOCK)
		return NULL;
	if (mBlocks[blockIndex].isEmpty())
	{
		//We don't have a slot of this size, so recursively split higher blocks until we get to the desired size.
		//The above getNextFreeBlock(...) call will ensure that there is a suitable block to use.
		Block& tBlock = mBlocks[blockIndex];
		Block* freeBlock = &mBlocks[freeBlockIndex];
		PxU32 cBlockSize = freeBlock->mBlockSize;
		//remove the last free header from the larger source slot
		BlockHeader* newBlockHeader = freeBlock->getFreeBlocks();
		const PxU32 rootIndex = newBlockHeader->mRootIndex;
		const PxU32 offset = newBlockHeader->mOffset;
		freeBlock->removeBlockHeader(newBlockHeader, mBlockHeaderPool);
		if (freeBlock->isEmpty())
		{
			// No free blocks left in that slot: clear its availability bit.
			mBitfield = mBitfield & (~(1u << freeBlockIndex));
		}
		// The start of the source block becomes the returned allocation...
		void* freeAddress = reinterpret_cast<void*>(reinterpret_cast<PxU8*>(mRoots[rootIndex]) + offset);
		PX_ASSERT(!mHashMap.find(freeAddress));
		// ...and its buddy of the requested size is registered as free.
		mBlocks[blockIndex].insertBlockHeader(rootIndex, tBlock.mBlockSize + offset, mBlockHeaderPool);
		mBitfield = mBitfield | (1u << blockIndex);
		mHashMap.insert(freeAddress, AllocationValue(blockIndex, rootIndex, byteSize, group));
		//recursively split blocks: each remaining half between twice the
		//requested size and the source block size is pushed as a free block
		//into its matching slot.
		PxU32 cOffset = offset;
		PxU32 cBlockIndex = freeBlock->mBlockIndex;
		const PxU32 tBlockSize = tBlock.mBlockSize << 1;
		while (cBlockSize > tBlockSize)
		{
			cBlockSize = cBlockSize >> 1;
			cOffset = cBlockSize + offset;
			cBlockIndex = cBlockIndex - 1;
			mBlocks[cBlockIndex].insertBlockHeader(rootIndex, cOffset, mBlockHeaderPool);
			mBitfield = mBitfield | (1u << cBlockIndex);
		}
#if PX_DEBUG
		mMemTracker.registerMemory(reinterpret_cast<void*>(freeAddress), true, byteSize, file, line);
#endif
		return freeAddress;
	}
	else
	{
		// Fast path: a free block of exactly the right size already exists.
		Block& tBlock = mBlocks[blockIndex];
		BlockHeader* newHeader = tBlock.getFreeBlocks();
		const PxU32 rootIndex = newHeader->mRootIndex;
		const PxU32 offset = newHeader->mOffset;
		tBlock.removeBlockHeader(newHeader, mBlockHeaderPool);
		if (tBlock.isEmpty())
		{
			mBitfield = mBitfield & (~(1u << blockIndex));
		}
		void* address = reinterpret_cast<void*>(reinterpret_cast<PxU8*>(mRoots[rootIndex]) + offset);
		PX_ASSERT(!mHashMap.find(address));
		mHashMap.insert(address, AllocationValue(blockIndex, rootIndex, byteSize, group));
#if PX_DEBUG
		mMemTracker.registerMemory(reinterpret_cast<void*>(address), true, byteSize, file, line);
#endif
		return address;
	}
}
// Queues ptr for a later bulk release via flushDeferredDeallocs().
// NOTE(review): the queue is not protected by mMutex here — presumably callers
// serialize deferred deallocation externally; confirm against call sites.
void PxgHeapMemoryAllocator::deallocateDeferred(void* ptr)
{
	deferredDeallocs.pushBack(ptr);
}
void PxgHeapMemoryAllocator::flushDeferredDeallocs()
{
for (PxU32 i = 0; i < deferredDeallocs.size(); ++i)
deallocate(deferredDeallocs[i]);
deferredDeallocs.forceSize_Unsafe(0);
}
/**
Returns ptr to the heap. NULL is a no-op.

Exceptional allocations are handed straight back to the backing allocator.
Buddy allocations walk up the block slots, merging the freed block with its
buddy whenever that buddy is also free.
*/
void PxgHeapMemoryAllocator::deallocate(void* ptr)
{
	PX_PROFILE_ZONE("PxgHeapMemoryAllocator::deallocate", 0);
	if (ptr == NULL)
		return;
	PxMutex::ScopedLock myLock(mMutex);
	PX_ASSERT(mHashMap.find(ptr));
	//found the block index
	AllocationValue value = mHashMap.find(ptr)->second;
	mHeapStats.stats[value.mGroup] -= value.mByteSize;
	mHashMap.erase(ptr);
	if (value.mBlockIndex == PXG_INVALID_BLOCK)
	{
		//Exceptional allocation, we just release it back to the CUDA allocator...
		//(for exceptional allocs, mRootIndex holds the index into mExceptionalAllocs)
		mTotalMem -= mExceptionalAllocs[value.mRootIndex].size;
		mExceptionalAllocs[value.mRootIndex].address = NULL;
		mExceptionalAllocs[value.mRootIndex].size = 0;
		mAllocator->deallocate(ptr);
#if PX_DEBUG
		mMemTracker.unregisterMemory(ptr, true);
#endif
		return;
	}
	const PxU32 rootIndex = value.mRootIndex;
	PxU32 blockIndex = value.mBlockIndex;
	PxU32 offset = PxU32(reinterpret_cast<PxU8*>(ptr)-reinterpret_cast<PxU8*>(mRoots[rootIndex]));
	Block* block = &mBlocks[blockIndex];
	do
	{
		//A block at an even multiple of its size has its buddy directly after
		//it; at an odd multiple, directly before it.
		const PxU32 offsetToFind = (((offset / block->mBlockSize) & 1) == 0) ? offset + block->mBlockSize : offset - block->mBlockSize;
		BlockHeader* buddyHeader = block->findBuddy(offsetToFind, rootIndex);
		if (buddyHeader)
		{
			//current block need to remove the merged free header
			block->removeBlockHeader(buddyHeader, mBlockHeaderPool);
			if (block->isEmpty())
			{
				mBitfield = mBitfield & (~(1u << blockIndex));
			}
			//the merged block starts at the lower of the two offsets and lives
			//one slot up; continue trying to merge there
			offset = PxMin(offsetToFind, offset);
			blockIndex = blockIndex + 1;
			if (blockIndex < mBlocks.size())
			{
				block = &mBlocks[blockIndex];
			}
			else
			{
				// NOTE(review): defensive path — 'block' still points at the
				// previous slot here while the bit for the out-of-range
				// 'blockIndex' is set. Presumably unreachable because the
				// top-level block has no buddy to merge with — confirm.
				block->insertBlockHeader(rootIndex, offset, mBlockHeaderPool);
				mBitfield = mBitfield | (1u << blockIndex);
				break;
			}
		}
		else
		{
			PX_ASSERT(buddyHeader == NULL);
			//buddy is still in use: just put the freed block back into its slot
			block->insertBlockHeader(rootIndex, offset, mBlockHeaderPool);
			mBitfield = mBitfield | (1u << blockIndex);
			break;
		}
	} while (1);
#if PX_DEBUG
	mMemTracker.unregisterMemory(ptr, true);
#endif
}
// Total bytes obtained from the backing allocator (pages plus live exceptional
// allocations) — not the number of bytes currently handed out to callers.
PxU64 PxgHeapMemoryAllocator::getTotalSize()
{
	return mTotalMem;
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Creates one buddy heap over device memory and one over mapped (pinned host)
// memory, both using the same page capacity.
PxgHeapMemoryAllocatorManager::PxgHeapMemoryAllocatorManager(PxU32 heapCapacity, PxsMemoryManager* memoryManager)
{
	mDeviceMemoryAllocators = PX_NEW(PxgHeapMemoryAllocator)(heapCapacity, memoryManager->getDeviceMemoryAllocator());
	mMappedMemoryAllocators = PX_NEW(PxgHeapMemoryAllocator)(heapCapacity, memoryManager->getHostMemoryAllocator());
}
// Destroys both heaps; each heap's destructor returns its pages to the
// backing allocator.
PxgHeapMemoryAllocatorManager::~PxgHeapMemoryAllocatorManager()
{
	PX_DELETE(mDeviceMemoryAllocators);
	PX_DELETE(mMappedMemoryAllocators);
}
PxU64 PxgHeapMemoryAllocatorManager::getDeviceMemorySize() const
{
	// Total bytes held by the device heap; 0 when the allocator was never created.
	if (!mDeviceMemoryAllocators)
		return 0;
	return mDeviceMemoryAllocators->getTotalSize();
}
PxsHeapStats PxgHeapMemoryAllocatorManager::getDeviceHeapStats() const
{
	// Per-group usage statistics of the device heap; a default-constructed
	// PxsHeapStats when the allocator does not exist.
	return mDeviceMemoryAllocators ? mDeviceMemoryAllocators->getHeapStats() : PxsHeapStats();
}
// Forwards to the device heap's deferred-deallocation flush.
void PxgHeapMemoryAllocatorManager::flushDeferredDeallocs()
{
	if (mDeviceMemoryAllocators) // this should actually never be null...
		mDeviceMemoryAllocators->flushDeferredDeallocs();
}

View File

@@ -0,0 +1,52 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "PxgKernelWrangler.h"
#include "CudaKernelWrangler.h"
#include "foundation/PxAllocator.h"
#include "cudamanager/PxCudaContext.h"
using namespace physx;
// Table of CUDA kernel name strings, generated by expanding each KERNEL_DEF
// entry of PxgKernelNames.h into its name. Presumably the table order matches
// the kernel id enumeration used by getCuFunction — confirm against
// PxgKernelNames.h / PxgKernelIndices.h.
static const char* kernelNames[]
{
#define KERNEL_DEF(id, name) name,
#include "PxgKernelNames.h"
#undef KERNEL_DEF
};
PxgCudaKernelWranglerManager::PxgCudaKernelWranglerManager(PxCudaContextManager& cudaContextManager, PxErrorCallback& errorCallback)
{
	// Cache the context manager and build the wrangler over the full table of
	// kernel names generated from PxgKernelNames.h.
	const PxU32 kernelCount = PxU32(sizeof(kernelNames) / sizeof(char*));
	mCudaContextManager = &cudaContextManager;
	mKernelWrangler = PX_NEW(KernelWrangler)(cudaContextManager, errorCallback, kernelNames, kernelCount);
}
// Releases the kernel wrangler created in the constructor.
PxgCudaKernelWranglerManager::~PxgCudaKernelWranglerManager()
{
	PX_DELETE(mKernelWrangler);
}

View File

@@ -0,0 +1,67 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
#include "PxgMemCopyDispatcher.h"
#include "cudamanager/PxCudaContext.h"
#include "CudaKernelWrangler.h"
#include "PxgKernelIndices.h"
namespace physx
{
/**
Uploads all queued copy commands to the device and launches the COPY_USER_DATA
kernel over them, then resets the queue.

\param stream			the stream to enqueue the upload and the kernel launch on.
\param cudaContext		CUDA context used for the async H2D copy and the launch.
\param kernelWrangler	provides the CUfunction for COPY_USER_DATA.
*/
void PxgMemCopyDispatcher::flushCommands(CUstream stream, PxCudaContext* cudaContext, KernelWrangler* kernelWrangler)
{
	// AD - this assumes the context lock is already held?
	if (mPinnedCopyBuffer.size())
	{
		// Upload the queued command pairs to device memory.
		mDeviceCopyCommands.allocate(mPinnedCopyBuffer.size() * sizeof(PxgPtrPair), PX_FL);
		cudaContext->memcpyHtoDAsync(mDeviceCopyCommands.getDevicePtr(), mPinnedCopyBuffer.begin(), mPinnedCopyBuffer.size() * sizeof(PxgPtrPair), stream);

		CUfunction function = kernelWrangler->getCuFunction(PxgKernelIds::COPY_USER_DATA);

		PX_ASSERT(mMaxSize <= PX_MAX_U32);
		const PxU32 maxS = PxU32(mMaxSize);
		// Grid x covers the largest queued copy (one thread per 4 bytes);
		// grid y iterates the individual commands.
		const PxU32 blockSize = 256;
		const PxU32 numBlocks = ((maxS/4) + blockSize-1)/ blockSize;

		CUdeviceptr ptr = mDeviceCopyCommands.getDevicePtr();
		PxU32 count = mPinnedCopyBuffer.size();
		PxCudaKernelParam kernelParams[] =
		{
			PX_CUDA_KERNEL_PARAM(ptr),
			PX_CUDA_KERNEL_PARAM(count)
		};
		// Pass blockSize instead of a duplicated magic 256 so the launch
		// dimension can never drift apart from the numBlocks computation above.
		CUresult launchResult = cudaContext->launchKernel(function, numBlocks, count, 1, blockSize, 1, 1, 0, stream, kernelParams, sizeof(kernelParams), 0, PX_FL);
		PX_ASSERT(launchResult == CUDA_SUCCESS);
		PX_UNUSED(launchResult);
	}

	// Reset the queue (capacity is kept) and the running max copy size.
	mPinnedCopyBuffer.forceSize_Unsafe(0);
	mMaxSize = 0;
}
}

View File

@@ -0,0 +1,112 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "PxgMemoryManager.h"
#include "PxgCudaMemoryAllocator.h"
using namespace physx;
namespace physx
{
	// Base for the CUDA allocator callbacks below: caches the context manager
	// and its CUDA context for use by the derived allocate/deallocate overrides.
	PxgCudaAllocatorCallbackBase::PxgCudaAllocatorCallbackBase(PxCudaContextManager* contextManager) : mContextManager(contextManager), mCudaContext(contextManager->getCudaContext()) {}
}
namespace
{
// PT: this one calls PxgPinnedMemoryAllocate/PxgPinnedMemoryDeallocate, i.e. cuMemHostAlloc/cuMemFreeHost
class PxgCudaHostMemoryAllocatorCallback : public PxgCudaAllocatorCallbackBase
{
public:
	PxgCudaHostMemoryAllocatorCallback(PxCudaContextManager* contextManager) : PxgCudaAllocatorCallbackBase(contextManager) {}

	// PxVirtualAllocatorCallback
	virtual void* allocate(size_t size, int, const char* file, int line) PX_OVERRIDE PX_FINAL
	{
		// Pinned host allocations are made with the CUDA context acquired.
		PxScopedCudaLock lock(*mContextManager);
		return PxgPinnedMemoryAllocate(*mCudaContext, size, file, line);
	}

	virtual void deallocate(void* ptr) PX_OVERRIDE PX_FINAL
	{
		if(!ptr)
			return;	// freeing NULL is a no-op
		PxScopedCudaLock lock(*mContextManager);
		PxgPinnedMemoryDeallocate(*mCudaContext, ptr);
	}
	//~PxVirtualAllocatorCallback
};
// PT: this one calls PxgCudaDeviceMemoryAllocate/PxgCudaDeviceMemoryDeallocate, i.e. cuMemAlloc/cuMemFree
class PxgCudaDeviceMemoryAllocatorCallback : public PxgCudaAllocatorCallbackBase
{
public:
	PxgCudaDeviceMemoryAllocatorCallback(PxCudaContextManager* contextManager) : PxgCudaAllocatorCallbackBase(contextManager) {}

	// PxVirtualAllocatorCallback
	virtual void* allocate(size_t size, int, const char* file, int line) PX_OVERRIDE PX_FINAL
	{
		// Device allocations are made with the CUDA context acquired.
		PxScopedCudaLock lock(*mContextManager);
		return PxgCudaDeviceMemoryAllocate(*mCudaContext, size, file, line);
	}

	virtual void deallocate(void* ptr) PX_OVERRIDE PX_FINAL
	{
		if(!ptr)
			return;	// freeing NULL is a no-op
		PxScopedCudaLock lock(*mContextManager);
		PxgCudaDeviceMemoryDeallocate(*mCudaContext, ptr);
	}
	//~PxVirtualAllocatorCallback
};
// Bundles the pinned-host and device CUDA allocator callbacks for a single
// context manager and exposes them through the PxsMemoryManager interface.
class PxgMemoryManager : public PxsMemoryManager
{
public:
	PxgMemoryManager(PxCudaContextManager* cudaContextManager) : mCudaContextManager(cudaContextManager), mHostMemoryAllocator(cudaContextManager), mDeviceMemoryAllocator(cudaContextManager) {}
	virtual ~PxgMemoryManager() {}
	// PxsMemoryManager
	virtual PxVirtualAllocatorCallback* getHostMemoryAllocator() PX_OVERRIDE { return &mHostMemoryAllocator; }
	virtual PxVirtualAllocatorCallback* getDeviceMemoryAllocator() PX_OVERRIDE { return &mDeviceMemoryAllocator; }
	//~PxsMemoryManager
	// Non-owning context manager pointer; the callbacks are owned by value.
	PxCudaContextManager* mCudaContextManager;
	PxgCudaHostMemoryAllocatorCallback mHostMemoryAllocator;
	PxgCudaDeviceMemoryAllocatorCallback mDeviceMemoryAllocator;
};
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Factory for the CUDA-backed memory manager; the caller owns the result.
PxsMemoryManager* physx::createPxgMemoryManager(PxCudaContextManager* cudaContextManager)
{
	return PX_NEW(PxgMemoryManager)(cudaContextManager);
}