feat(physics): wire physx sdk into build

commit 31f40e2cbb
parent 5bf258df6d
2026-04-15 12:22:15 +08:00
2044 changed files with 752623 additions and 1 deletion


@@ -0,0 +1,570 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "PxgShapeManager.h"
#include "PxgCopyManager.h"
#include "PxgHeapMemAllocator.h"
#include "PxgCudaUtils.h"
#include "PxNodeIndex.h"
using namespace physx;
PxgShapeManager::PxgShapeManager(PxgHeapMemoryAllocatorManager* heapManager) :
mHeapManager(heapManager),
mHostShapes(PxVirtualAllocator(heapManager->mMappedMemoryAllocators, PxsHeapStats::eNARROWPHASE)),
mHostShapesRemapTable(PxVirtualAllocator(heapManager->mMappedMemoryAllocators, PxsHeapStats::eNARROWPHASE)),
mHostShapeIdTable(PxVirtualAllocator(heapManager->mMappedMemoryAllocators, PxsHeapStats::eNARROWPHASE)),
mHostTransformCacheIdToActorTable(PxVirtualAllocator(heapManager->mMappedMemoryAllocators, PxsHeapStats::eNARROWPHASE)),
mGpuShapesBuffer(heapManager, PxsHeapStats::eNARROWPHASE),
mGpuShapesRemapTableBuffer(heapManager, PxsHeapStats::eNARROWPHASE),
mGpuTransformCacheIdToActorTableBuffer(heapManager, PxsHeapStats::eNARROWPHASE),
mGpuRigidIndiceBuffer(heapManager, PxsHeapStats::eNARROWPHASE),
mGpuShapeIndiceBuffer(heapManager, PxsHeapStats::eNARROWPHASE),
mGpuUnsortedShapeIndicesBuffer(heapManager, PxsHeapStats::eNARROWPHASE),
mGpuTempRigidBitIndiceBuffer(heapManager, PxsHeapStats::eNARROWPHASE),
mGpuTempRigidIndiceBuffer(heapManager, PxsHeapStats::eNARROWPHASE)
{
//initial allocations, kept at multiples of 4 (see the radix sort note in registerShape)
const PxU32 initialSize = 128;
mHostShapes.forceSize_Unsafe(0);
mHostShapes.reserve(initialSize);
mHostShapes.forceSize_Unsafe(initialSize);
mHostShapesRemapTable.forceSize_Unsafe(0);
mHostShapesRemapTable.reserve(initialSize);
mHostShapesRemapTable.forceSize_Unsafe(initialSize);
mHostShapeIdTable.forceSize_Unsafe(0);
mHostShapeIdTable.reserve(initialSize);
mHostShapeIdTable.forceSize_Unsafe(initialSize);
mHostTransformCacheIdToActorTable.forceSize_Unsafe(0);
mHostTransformCacheIdToActorTable.reserve(initialSize);
mHostTransformCacheIdToActorTable.forceSize_Unsafe(initialSize);
mGpuShapesBuffer.allocate(sizeof(PxgShape)*initialSize, PX_FL);
mGpuShapesRemapTableBuffer.allocate(sizeof(PxNodeIndex) * initialSize, PX_FL);
mGpuTransformCacheIdToActorTableBuffer.allocate(sizeof(PxActor*) * initialSize, PX_FL);
mGpuRigidIndiceBuffer.allocate(sizeof(PxNodeIndex) * initialSize, PX_FL);
mGpuShapeIndiceBuffer.allocate(sizeof(PxU32) * initialSize, PX_FL);
mGpuUnsortedShapeIndicesBuffer.allocate(sizeof(PxU32) * initialSize, PX_FL);
mGpuTempRigidBitIndiceBuffer.allocate(sizeof(PxU32) * initialSize, PX_FL);
mGpuTempRigidIndiceBuffer.allocate(sizeof(PxNodeIndex) * initialSize, PX_FL);
mResizeRequired = false;
mTransformCacheResizeRequired = false;
mMaxShapeId = -1;
mMaxTransformCacheID = -1;
mHasShapeChanged = false;
mHasShapeInstanceChanged = false;
}
void PxgShapeManager::initialize(PxCudaContext* cudaContext, CUstream stream)
{
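// Presumably the all-ones pattern is the "invalid" marker for every one of
// these tables (e.g. PxNodeIndex(PX_INVALID_NODE) for the remap entries, as
// used in unregisterShapeInstance), so clearing the freshly allocated buffers
// this way makes unused slots read back as invalid ids.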
cudaContext->memsetD32Async(mGpuShapesRemapTableBuffer.getDevicePtr(), 0xFFFFFFFF, mGpuShapesRemapTableBuffer.getSize()/sizeof(PxU32), stream);
cudaContext->memsetD32Async(mGpuRigidIndiceBuffer.getDevicePtr(), 0xFFFFFFFF, mGpuRigidIndiceBuffer.getSize() / sizeof(PxU32), stream);
cudaContext->memsetD32Async(mGpuShapeIndiceBuffer.getDevicePtr(), 0xFFFFFFFF, mGpuShapeIndiceBuffer.getSize() / sizeof(PxU32), stream);
cudaContext->memsetD32Async(mGpuUnsortedShapeIndicesBuffer.getDevicePtr(), 0xFFFFFFFF, mGpuUnsortedShapeIndicesBuffer.getSize() / sizeof(PxU32), stream);
}
PxU32 PxgShapeManager::registerShape(PxgShape& shape)
{
const PxU32 shapeId = mIdPool.getNewID();
if (shapeId >= mHostShapes.capacity())
{
mResizeRequired = true;
const PxU32 capacity = shapeId * 2;
//keep capacity a multiple of 4: shape ids are later radix-sorted by rigid body index, which needs x4-sized buffers
const PxU32 tempCapacity = (capacity + 3)&(~3);
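// e.g. shapeId = 64 -> capacity = 128 -> tempCapacity = (128 + 3) & ~3 = 128;
// shapeId = 65 -> capacity = 130 -> tempCapacity = 132 (next multiple of 4)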
mHostShapes.resize(tempCapacity);
mDirtyShapeMap.resize(tempCapacity);
}
mHostShapes[shapeId] = shape;
mDirtyShapeMap.growAndSet(shapeId);
mMaxShapeId = PxMax(PxI32(shapeId), mMaxShapeId);
mHasShapeChanged = true;
return shapeId;
}
void PxgShapeManager::registerShapeInstance(const PxNodeIndex& nodeIndex, const PxU32 transformCacheID, PxActor* actor, bool aggregate)
{
if (transformCacheID >= mHostShapesRemapTable.capacity())
{
const PxU32 capacity = transformCacheID*2;
//keep capacity a multiple of 4: shape ids are later radix-sorted by rigid body index, which needs x4-sized buffers
const PxU32 tempCapacity = (capacity + 3)&(~3);
mTransformCacheResizeRequired = true;
mHostShapesRemapTable.resize(tempCapacity);
mHostShapeIdTable.resize(tempCapacity);
mHostTransformCacheIdToActorTable.resize(tempCapacity);
mDirtyTransformCacheMap.resize(tempCapacity);
}
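// Note (inferred from the stores below): the node index remap entry is always
// written, while aggregated instances get invalid markers (0xffffffff / NULL)
// in the shape id and actor tables so later passes can skip them.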
mHostShapesRemapTable[transformCacheID] = nodeIndex;
mHostShapeIdTable[transformCacheID] = aggregate? 0xffffffff : transformCacheID;
mHostTransformCacheIdToActorTable[transformCacheID] = aggregate ? NULL : actor;
mHasShapeInstanceChanged = true;
mDirtyTransformCacheMap.growAndSet(transformCacheID);
mMaxTransformCacheID = PxMax(PxI32(transformCacheID), mMaxTransformCacheID);
}
void PxgShapeManager::unregisterShape(const PxU32 id)
{
mDirtyShapeMap.reset(id);
mIdPool.deferredFreeID(id);
mHasShapeChanged = true;
}
void PxgShapeManager::unregisterShapeInstance(const PxU32 transformCacheID)
{
mDirtyTransformCacheMap.set(transformCacheID);
mHostShapesRemapTable[transformCacheID] = PxNodeIndex(PX_INVALID_NODE);
mHostShapeIdTable[transformCacheID] = 0xffffffff;
mHostTransformCacheIdToActorTable[transformCacheID] = NULL;
mHasShapeInstanceChanged = true;
}
void PxgShapeManager::scheduleCopyHtoD(PxgCopyManager& copyManager, PxCudaContext* cudaContext, CUstream stream)
{
PX_UNUSED(copyManager);
const PxU32 maxGrouping = 16;
if (mHasShapeChanged)
{
mHasShapeChanged = false;
if (mResizeRequired)
{
//Allocate and copy data across
mGpuShapesBuffer.allocateCopyOldDataAsync(sizeof(PxgShape)*mHostShapes.capacity(), cudaContext, stream, PX_FL);
mResizeRequired = false;
}
const PxU32* bits = mDirtyShapeMap.getWords();
if (bits)
{
const PxU32 totalNumOfShapes = mMaxShapeId + 1;
const PxU32 numShapes = (totalNumOfShapes + 3) &(~3);
//pad the dirty map up to the next multiple of 4 so the x4-sized range is uploaded in full
for (PxU32 i = totalNumOfShapes; i < numShapes; ++i)
{
mDirtyShapeMap.growAndSet(i);
}
// PT: ### bitmap iterator pattern
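// Sketch of the pattern: b = bits[w] holds 32 dirty flags, PxLowestSetBit(b)
// finds the lowest set bit, (w << 5) | bit reconstructs the global id, and
// b &= (b - 1) strips that bit. E.g. for w = 2, b = 0b0110 the loop yields
// ids 65 then 66 before b reaches zero.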
const PxU32 lastSetBit = mDirtyShapeMap.findLast();
for (PxU32 w = 0; w <= lastSetBit >> 5; ++w)
{
//b&=b-1 will clear the lowest set bit in b
for (PxU32 b = bits[w]; b; )
{
//dirtyId is the next bit that's set to 1!
const PxU32 dirtyId = PxU32(w << 5 | PxLowestSetBit(b));
void* hostPtr = mHostShapes.begin() + dirtyId;
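// The host arrays were allocated from mMappedMemoryAllocators, so
// getMappedDevicePtr presumably returns the device-visible alias of this
// pinned host pointer for the deferred copy to read from.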
PxgCopyManager::CopyDesc desc;
desc.source = reinterpret_cast<size_t>(getMappedDevicePtr(cudaContext, hostPtr));
desc.dest = reinterpret_cast<size_t>(reinterpret_cast<PxU8*>(mGpuShapesBuffer.getDevicePtr()) + dirtyId * sizeof(PxgShape));
desc.bytes = sizeof(PxgShape);
mDirtyShapeMap.reset(dirtyId);
//Now we loop to try and find adjacent bits that are set...
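// Runs of adjacent dirty shapes (up to maxGrouping = 16) are coalesced into
// one larger CopyDesc: a slightly bigger transfer in exchange for far fewer
// deferred HtoD operations.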
PxU32 currIdx = dirtyId + 1;
PxU32 groupSize = 1;
while (currIdx <= lastSetBit && mDirtyShapeMap.test(currIdx) && groupSize < maxGrouping)
{
groupSize++;
mDirtyShapeMap.reset(currIdx);
currIdx++;
desc.bytes += sizeof(PxgShape);
}
if (currIdx != (dirtyId + 1))
{
//get the word from the current bit
w = PxMin(currIdx, lastSetBit) >> 5;
//reload the word at the new position; the reset() calls above already cleared
//the bits this group consumed, so there is nothing left to strip with b &= (b - 1)
b = bits[w];
}
else
{
b &= (b - 1);
}
copyManager.pushDeferredHtoD(desc);
}
}
}
mDirtyShapeMap.clear();
}
if (mHasShapeInstanceChanged)
{
//AD: mHasShapeInstanceChanged needs to persist because computeRigidsToShapes() needs to run if we use direct-API
// we lower the flag in PxgNarrowphaseCore::prepareGpuNarrowphase.
// AD: the resize of the GPU transform cache is inside PxgNarrowphaseCore::prepareGpuNarrowphase.
if (mTransformCacheResizeRequired)
{
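// The persistent buffers below grow with their old contents copied across
// asynchronously; the two temp buffers are simply reallocated. Only the newly
// added tail [oldCapacity, newSize) is then cleared to the invalid pattern
// (memsetD32Async writes 32-bit words, hence the byte count / sizeof(PxU32)).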
PxU64 oldCapacity = mGpuShapesRemapTableBuffer.getSize();
mGpuShapesRemapTableBuffer.allocateCopyOldDataAsync(sizeof(PxNodeIndex)*mHostShapesRemapTable.capacity(), cudaContext, stream, PX_FL);
cudaContext->memsetD32Async(mGpuShapesRemapTableBuffer.getDevicePtr() + oldCapacity, 0xFFFFFFFF, (mGpuShapesRemapTableBuffer.getSize() - oldCapacity) / sizeof(PxU32), stream);
oldCapacity = mGpuRigidIndiceBuffer.getSize();
mGpuRigidIndiceBuffer.allocateCopyOldDataAsync(sizeof(PxNodeIndex) * mHostShapesRemapTable.capacity(), cudaContext, stream, PX_FL);
cudaContext->memsetD32Async(mGpuRigidIndiceBuffer.getDevicePtr() + oldCapacity, 0xFFFFFFFF, (mGpuRigidIndiceBuffer.getSize() - oldCapacity) / sizeof(PxU32), stream);
mGpuTempRigidIndiceBuffer.allocate(sizeof(PxNodeIndex) * mHostShapesRemapTable.capacity(), PX_FL);
oldCapacity = mGpuShapeIndiceBuffer.getSize();
mGpuShapeIndiceBuffer.allocateCopyOldDataAsync(sizeof(PxU32) * mHostShapeIdTable.capacity(), cudaContext, stream, PX_FL);
cudaContext->memsetD32Async(mGpuShapeIndiceBuffer.getDevicePtr() + oldCapacity, 0xFFFFFFFF, (mGpuShapeIndiceBuffer.getSize() - oldCapacity) / sizeof(PxU32), stream);
oldCapacity = mGpuUnsortedShapeIndicesBuffer.getSize();
mGpuUnsortedShapeIndicesBuffer.allocateCopyOldDataAsync(sizeof(PxU32) * mHostShapeIdTable.capacity(), cudaContext, stream, PX_FL);
cudaContext->memsetD32Async(mGpuUnsortedShapeIndicesBuffer.getDevicePtr() + oldCapacity, 0xFFFFFFFF, (mGpuUnsortedShapeIndicesBuffer.getSize() - oldCapacity) / sizeof(PxU32), stream);
mGpuTempRigidBitIndiceBuffer.allocate(sizeof(PxU32) * mHostShapeIdTable.capacity(), PX_FL);
oldCapacity = mGpuTransformCacheIdToActorTableBuffer.getSize();
mGpuTransformCacheIdToActorTableBuffer.allocateCopyOldDataAsync(sizeof(PxActor*) * mHostTransformCacheIdToActorTable.capacity(), cudaContext, stream, PX_FL);
cudaContext->memsetD32Async(mGpuTransformCacheIdToActorTableBuffer.getDevicePtr() + oldCapacity, 0, (mGpuTransformCacheIdToActorTableBuffer.getSize() - oldCapacity) / sizeof(PxActor*), stream);
mTransformCacheResizeRequired = false;
}
const PxU32 totalNumOfShapeInstances = mMaxTransformCacheID + 1;
const PxU32 numShapeInstances = (totalNumOfShapeInstances + 3) &(~3);
//pad up to the next multiple of 4, resetting the padding entries to invalid markers
for (PxU32 i = totalNumOfShapeInstances; i < numShapeInstances; ++i)
{
if (!mHostShapesRemapTable[i].isStaticBody())
{
mDirtyTransformCacheMap.growAndSet(i);
mHostShapesRemapTable[i] = PxNodeIndex(PX_INVALID_NODE);
mHostShapeIdTable[i] = 0xffffffff;
mHostTransformCacheIdToActorTable[i] = NULL;
}
}
const PxU32* bits = mDirtyTransformCacheMap.getWords();
if (bits)
{
// PT: ### bitmap iterator pattern
const PxU32 lastSetBit = mDirtyTransformCacheMap.findLast();
for (PxU32 w = 0; w <= lastSetBit >> 5; ++w)
{
//b&=b-1 will clear the lowest set bit in b
for (PxU32 b = bits[w]; b; )
{
//dirtyId is the next bit that's set to 1!
const PxU32 dirtyId = PxU32(w << 5 | PxLowestSetBit(b));
void* hostRemapPtr = mHostShapesRemapTable.begin() + dirtyId;
void* hostShapeIdPtr = mHostShapeIdTable.begin() + dirtyId;
void* hostTransformCacheIdToActorPtr = mHostTransformCacheIdToActorTable.begin() + dirtyId;
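// One dirty transform cache id updates four parallel GPU tables, so four copy
// descriptors are built over the same id range and grown together below.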
PxgCopyManager::CopyDesc desc1;
desc1.source = reinterpret_cast<size_t>(getMappedDevicePtr(cudaContext, hostRemapPtr));
desc1.dest = reinterpret_cast<size_t>(reinterpret_cast<PxU8*>(mGpuShapesRemapTableBuffer.getDevicePtr()) + dirtyId * sizeof(PxNodeIndex));
desc1.bytes = sizeof(PxNodeIndex);
PxgCopyManager::CopyDesc desc2;
desc2.source = reinterpret_cast<size_t>(getMappedDevicePtr(cudaContext, hostRemapPtr));
desc2.dest = reinterpret_cast<size_t>(reinterpret_cast<PxU8*>(mGpuRigidIndiceBuffer.getDevicePtr()) + dirtyId * sizeof(PxNodeIndex));
desc2.bytes = sizeof(PxNodeIndex);
PxgCopyManager::CopyDesc desc3;
desc3.source = reinterpret_cast<size_t>(getMappedDevicePtr(cudaContext, hostShapeIdPtr));
desc3.dest = reinterpret_cast<size_t>(reinterpret_cast<PxU8*>(mGpuUnsortedShapeIndicesBuffer.getDevicePtr()) + dirtyId * sizeof(PxU32));
desc3.bytes = sizeof(PxU32);
PxgCopyManager::CopyDesc desc4;
desc4.source = reinterpret_cast<size_t>(getMappedDevicePtr(cudaContext, hostTransformCacheIdToActorPtr));
desc4.dest = reinterpret_cast<size_t>(reinterpret_cast<PxU8*>(mGpuTransformCacheIdToActorTableBuffer.getDevicePtr()) + dirtyId * sizeof(PxActor*));
desc4.bytes = sizeof(PxActor*);
mDirtyTransformCacheMap.reset(dirtyId);
//Now we loop to try and find adjacent bits that are set...
PxU32 currIdx = dirtyId + 1;
PxU32 groupSize = 1;
while (currIdx <= lastSetBit && mDirtyTransformCacheMap.test(currIdx) && groupSize < maxGrouping)
{
groupSize++;
mDirtyTransformCacheMap.reset(currIdx);
currIdx++;
desc1.bytes += sizeof(PxNodeIndex);
desc2.bytes += sizeof(PxNodeIndex);
desc3.bytes += sizeof(PxU32);
desc4.bytes += sizeof(PxActor*);
}
if (currIdx != (dirtyId + 1))
{
//get the word from the current bit
w = PxMin(currIdx, lastSetBit) >> 5;
//reload the word at the new position; the reset() calls above already cleared
//the bits this group consumed, so there is nothing left to strip with b &= (b - 1)
b = bits[w];
}
else
{
b &= (b - 1);
}
copyManager.pushDeferredHtoD(desc1);
copyManager.pushDeferredHtoD(desc2);
copyManager.pushDeferredHtoD(desc3);
copyManager.pushDeferredHtoD(desc4);
}
}
}
mDirtyTransformCacheMap.clear();
}
}
void PxgShapeManager::updateShapeMaterial(const PxU32 materialIndex, const PxU32 id)
{
PX_ASSERT(id < mHostShapes.size());
mHostShapes[id].materialIndex = materialIndex;
mDirtyShapeMap.growAndSet(id);
mHasShapeChanged = true;
}
////////////////////////////////////////////////////////////////////////////////////////////
PxgMaterialManager::PxgMaterialManager(PxgHeapMemoryAllocatorManager* heapManager, const PxU32 elemSize) :
mGpuMaterialBuffer(heapManager, PxsHeapStats::eNARROWPHASE),
mHeapManager(heapManager),
mHostMaterial(PxVirtualAllocator(heapManager->mMappedMemoryAllocators, PxsHeapStats::eNARROWPHASE))
{
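// Room for 128 materials of elemSize bytes each, mirrored in mapped host
// memory and in the device buffer.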
const PxU32 originalSize = elemSize * 128;
mHostMaterial.forceSize_Unsafe(0);
mHostMaterial.reserve(originalSize);
mHostMaterial.forceSize_Unsafe(originalSize);
mGpuMaterialBuffer.allocate(originalSize, PX_FL);
mResizeRequired = false;
}
PxU32 PxgMaterialManager::registerMaterial(const PxU8* materialData, const PxU32 elemSize)
{
const PxU32 shapeId = mIdPool.getNewID();
PxU32 capacity = mHostMaterial.capacity() / elemSize;
if (shapeId >= capacity)
{
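// Geometric growth: at least double the current capacity (+1 covers the
// capacity == 0 case) and never less than the new id requires.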
capacity = PxMax(capacity * 2 + 1, shapeId + 1);
mHostMaterial.resize(capacity * elemSize);
mResizeRequired = true;
}
PxU8* destPtr = mHostMaterial.begin() + shapeId * elemSize;
PxMemCopy(destPtr, materialData, elemSize);
mDirtyMaterialMap.growAndSet(shapeId);
return shapeId;
}
void PxgMaterialManager::unregisterMaterial(const PxU32 id)
{
mDirtyMaterialMap.reset(id);
mIdPool.deferredFreeID(id);
}
void PxgMaterialManager::scheduleCopyHtoD(PxgCopyManager& copyManager, PxCudaContext* cudaContext,
CUstream stream, const PxU32 elemSize)
{
if (mResizeRequired)
{
mGpuMaterialBuffer.allocateCopyOldDataAsync(mHostMaterial.capacity(), cudaContext, stream, PX_FL);
mResizeRequired = false;
}
const PxU32* bits = mDirtyMaterialMap.getWords();
const PxU32 maxGrouping = 16;
if (bits)
{
// PT: ### bitmap iterator pattern
const PxU32 lastSetBit = mDirtyMaterialMap.findLast();
for (PxU32 w = 0; w <= lastSetBit >> 5; ++w)
{
//b&=b-1 will clear the lowest set bit in b
for (PxU32 b = bits[w]; b; )
{
//dirtyId is the next bit that's set to 1!
const PxU32 dirtyId = PxU32(w << 5 | PxLowestSetBit(b));
void* hostPtr = mHostMaterial.begin() + dirtyId * elemSize;
PxgCopyManager::CopyDesc desc;
desc.source = reinterpret_cast<size_t>(getMappedDevicePtr(cudaContext, hostPtr));
desc.dest = reinterpret_cast<size_t>(reinterpret_cast<PxU8*>(mGpuMaterialBuffer.getDevicePtr()) + dirtyId * elemSize);
desc.bytes = elemSize;
mDirtyMaterialMap.reset(dirtyId);
//Now we loop to try and find adjacent bits that are set...
PxU32 currIdx = dirtyId + 1;
PxU32 groupSize = 1;
while (currIdx <= lastSetBit && mDirtyMaterialMap.test(currIdx) && (groupSize < maxGrouping))
{
groupSize++;
mDirtyMaterialMap.reset(currIdx);
currIdx++;
desc.bytes += elemSize;
}
if (currIdx != (dirtyId + 1))
{
//get the word from the current bit
w = PxMin(currIdx, lastSetBit) >> 5;
//reload the word at the new position; the reset() calls above already cleared
//the bits this group consumed, so there is nothing left to strip with b &= (b - 1)
b = bits[w];
}
else
{
b &= (b - 1);
}
copyManager.pushDeferredHtoD(desc);
}
}
}
mDirtyMaterialMap.clear();
}
void PxgMaterialManager::updateMaterial(const PxU8* materialCore, const PxU32 elemSize, const PxU32 id)
{
PX_ASSERT(id < mHostMaterial.size());
PxU8* destptr = reinterpret_cast<PxU8*>(mHostMaterial.begin() + id * elemSize);
PxMemCopy(destptr, materialCore, elemSize);
//mHostMaterial[id] = materialCore;
mDirtyMaterialMap.growAndSet(id);
}
//////////////////////////////////////////////////////////////////////////////////////////////
PxgFEMMaterialManager::PxgFEMMaterialManager(PxgHeapMemoryAllocatorManager* heapManager, const PxU32 elemSize) :
PxgMaterialManager(heapManager, elemSize)
{
}
void PxgFEMMaterialManager::scheduleCopyHtoD(PxgCopyManager& copyManager, PxCudaContext* cudaContext,
CUstream stream, const PxU32 elemSize)
{
if (mResizeRequired)
{
mGpuMaterialBuffer.allocateCopyOldDataAsync(mHostMaterial.capacity(), cudaContext, stream, PX_FL);
mResizeRequired = false;
}
const PxU32* bits = mDirtyMaterialMap.getWords();
const PxU32 maxGrouping = 16;
if (bits)
{
// PT: ### bitmap iterator pattern
const PxU32 lastSetBit = mDirtyMaterialMap.findLast();
for (PxU32 w = 0; w <= lastSetBit >> 5; ++w)
{
//b&=b-1 will clear the lowest set bit in b
for (PxU32 b = bits[w]; b; )
{
//dirtyId is the next bit that's set to 1!
const PxU32 dirtyId = PxU32(w << 5 | PxLowestSetBit(b));
void* hostPtr = mHostMaterial.begin() + dirtyId * elemSize;
PxgCopyManager::CopyDesc desc;
desc.source = reinterpret_cast<size_t>(getMappedDevicePtr(cudaContext, hostPtr));
desc.dest = reinterpret_cast<size_t>(reinterpret_cast<PxU8*>(mGpuMaterialBuffer.getDevicePtr()) + dirtyId * elemSize);
desc.bytes = elemSize;
mDirtyMaterialMap.reset(dirtyId);
//Now we loop to try and find adjacent bits that are set...
PxU32 currIdx = dirtyId + 1;
PxU32 groupSize = 1;
while (currIdx <= lastSetBit && mDirtyMaterialMap.test(currIdx) && (groupSize < maxGrouping))
{
groupSize++;
mDirtyMaterialMap.reset(currIdx);
currIdx++;
desc.bytes += elemSize;
}
if (currIdx != (dirtyId + 1))
{
//get the word from the current bit
w = PxMin(currIdx, lastSetBit) >> 5;
//reload the word at the new position; the reset() calls above already cleared
//the bits this group consumed, so there is nothing left to strip with b &= (b - 1)
b = bits[w];
}
else
{
b &= (b - 1);
}
copyManager.pushDeferredHtoD(desc);
}
}
}
mDirtyMaterialMap.clear();
}
////////////////////////////////////////////////////////////////////////////////////////////
PxgFEMSoftBodyMaterialManager::PxgFEMSoftBodyMaterialManager(PxgHeapMemoryAllocatorManager* heapManager) :
PxgFEMMaterialManager(heapManager, sizeof(PxsDeformableVolumeMaterialData))
{
}