feat(physics): wire physx sdk into build

commit 31f40e2cbb
parent 5bf258df6d
2026-04-15 12:22:15 +08:00
2044 changed files with 752623 additions and 1 deletion


@@ -0,0 +1,570 @@
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "PxgShapeManager.h"
#include "PxgCopyManager.h"
#include "PxgHeapMemAllocator.h"
#include "PxgCudaUtils.h"
#include "PxNodeIndex.h"
using namespace physx;
PxgShapeManager::PxgShapeManager(PxgHeapMemoryAllocatorManager* heapManager) :
mHeapManager(heapManager),
mHostShapes(PxVirtualAllocator(heapManager->mMappedMemoryAllocators, PxsHeapStats::eNARROWPHASE)),
mHostShapesRemapTable(PxVirtualAllocator(heapManager->mMappedMemoryAllocators, PxsHeapStats::eNARROWPHASE)),
mHostShapeIdTable(PxVirtualAllocator(heapManager->mMappedMemoryAllocators, PxsHeapStats::eNARROWPHASE)),
mHostTransformCacheIdToActorTable(PxVirtualAllocator(heapManager->mMappedMemoryAllocators, PxsHeapStats::eNARROWPHASE)),
mGpuShapesBuffer(heapManager, PxsHeapStats::eNARROWPHASE),
mGpuShapesRemapTableBuffer(heapManager, PxsHeapStats::eNARROWPHASE),
mGpuTransformCacheIdToActorTableBuffer(heapManager, PxsHeapStats::eNARROWPHASE),
mGpuRigidIndiceBuffer(heapManager, PxsHeapStats::eNARROWPHASE),
mGpuShapeIndiceBuffer(heapManager, PxsHeapStats::eNARROWPHASE),
mGpuUnsortedShapeIndicesBuffer(heapManager, PxsHeapStats::eNARROWPHASE),
mGpuTempRigidBitIndiceBuffer(heapManager, PxsHeapStats::eNARROWPHASE),
mGpuTempRigidIndiceBuffer(heapManager, PxsHeapStats::eNARROWPHASE)
{
//initial allocations, kept at multiples of 4 (see the radix sort note in registerShape)
const PxU32 initialSize = 128;
mHostShapes.forceSize_Unsafe(0);
mHostShapes.reserve(initialSize);
mHostShapes.forceSize_Unsafe(initialSize);
mHostShapesRemapTable.forceSize_Unsafe(0);
mHostShapesRemapTable.reserve(initialSize);
mHostShapesRemapTable.forceSize_Unsafe(initialSize);
mHostShapeIdTable.forceSize_Unsafe(0);
mHostShapeIdTable.reserve(initialSize);
mHostShapeIdTable.forceSize_Unsafe(initialSize);
mHostTransformCacheIdToActorTable.forceSize_Unsafe(0);
mHostTransformCacheIdToActorTable.reserve(initialSize);
mHostTransformCacheIdToActorTable.forceSize_Unsafe(initialSize);
mGpuShapesBuffer.allocate(sizeof(PxgShape)*initialSize, PX_FL);
mGpuShapesRemapTableBuffer.allocate(sizeof(PxNodeIndex) * initialSize, PX_FL);
mGpuTransformCacheIdToActorTableBuffer.allocate(sizeof(PxActor*) * initialSize, PX_FL);
mGpuRigidIndiceBuffer.allocate(sizeof(PxNodeIndex) * initialSize, PX_FL);
mGpuShapeIndiceBuffer.allocate(sizeof(PxU32) * initialSize, PX_FL);
mGpuUnsortedShapeIndicesBuffer.allocate(sizeof(PxU32) * initialSize, PX_FL);
mGpuTempRigidBitIndiceBuffer.allocate(sizeof(PxU32) * initialSize, PX_FL);
mGpuTempRigidIndiceBuffer.allocate(sizeof(PxNodeIndex) * initialSize, PX_FL);
mResizeRequired = false;
mTransformCacheResizeRequired = false;
mMaxShapeId = -1;
mMaxTransformCacheID = -1;
mHasShapeChanged = false;
mHasShapeInstanceChanged = false;
}
void PxgShapeManager::initialize(PxCudaContext* cudaContext, CUstream stream)
{
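// Presumably the all-ones pattern is the "invalid" marker for every one of
// these tables (e.g. PxNodeIndex(PX_INVALID_NODE) for the remap entries, as
// used in unregisterShapeInstance), so clearing the freshly allocated buffers
// this way makes unused slots read back as invalid ids.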
cudaContext->memsetD32Async(mGpuShapesRemapTableBuffer.getDevicePtr(), 0xFFFFFFFF, mGpuShapesRemapTableBuffer.getSize()/sizeof(PxU32), stream);
cudaContext->memsetD32Async(mGpuRigidIndiceBuffer.getDevicePtr(), 0xFFFFFFFF, mGpuRigidIndiceBuffer.getSize() / sizeof(PxU32), stream);
cudaContext->memsetD32Async(mGpuShapeIndiceBuffer.getDevicePtr(), 0xFFFFFFFF, mGpuShapeIndiceBuffer.getSize() / sizeof(PxU32), stream);
cudaContext->memsetD32Async(mGpuUnsortedShapeIndicesBuffer.getDevicePtr(), 0xFFFFFFFF, mGpuUnsortedShapeIndicesBuffer.getSize() / sizeof(PxU32), stream);
}
PxU32 PxgShapeManager::registerShape(PxgShape& shape)
{
const PxU32 shapeId = mIdPool.getNewID();
if (shapeId >= mHostShapes.capacity())
{
mResizeRequired = true;
const PxU32 capacity = shapeId * 2;
//keep capacity a multiple of 4: shape ids are later radix-sorted by rigid body index, which needs x4-sized buffers
const PxU32 tempCapacity = (capacity + 3)&(~3);
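// e.g. shapeId = 64 -> capacity = 128 -> tempCapacity = (128 + 3) & ~3 = 128;
// shapeId = 65 -> capacity = 130 -> tempCapacity = 132 (next multiple of 4)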
mHostShapes.resize(tempCapacity);
mDirtyShapeMap.resize(tempCapacity);
}
mHostShapes[shapeId] = shape;
mDirtyShapeMap.growAndSet(shapeId);
mMaxShapeId = PxMax(PxI32(shapeId), mMaxShapeId);
mHasShapeChanged = true;
return shapeId;
}
void PxgShapeManager::registerShapeInstance(const PxNodeIndex& nodeIndex, const PxU32 transformCacheID, PxActor* actor, bool aggregate)
{
if (transformCacheID >= mHostShapesRemapTable.capacity())
{
const PxU32 capacity = transformCacheID*2;
//keep capacity a multiple of 4: shape ids are later radix-sorted by rigid body index, which needs x4-sized buffers
const PxU32 tempCapacity = (capacity + 3)&(~3);
mTransformCacheResizeRequired = true;
mHostShapesRemapTable.resize(tempCapacity);
mHostShapeIdTable.resize(tempCapacity);
mHostTransformCacheIdToActorTable.resize(tempCapacity);
mDirtyTransformCacheMap.resize(tempCapacity);
}
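// Note (inferred from the stores below): the node index remap entry is always
// written, while aggregated instances get invalid markers (0xffffffff / NULL)
// in the shape id and actor tables so later passes can skip them.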
mHostShapesRemapTable[transformCacheID] = nodeIndex;
mHostShapeIdTable[transformCacheID] = aggregate? 0xffffffff : transformCacheID;
mHostTransformCacheIdToActorTable[transformCacheID] = aggregate ? NULL : actor;
mHasShapeInstanceChanged = true;
mDirtyTransformCacheMap.growAndSet(transformCacheID);
mMaxTransformCacheID = PxMax(PxI32(transformCacheID), mMaxTransformCacheID);
}
void PxgShapeManager::unregisterShape(const PxU32 id)
{
mDirtyShapeMap.reset(id);
mIdPool.deferredFreeID(id);
mHasShapeChanged = true;
}
void PxgShapeManager::unregisterShapeInstance(const PxU32 transformCacheID)
{
mDirtyTransformCacheMap.set(transformCacheID);
mHostShapesRemapTable[transformCacheID] = PxNodeIndex(PX_INVALID_NODE);
mHostShapeIdTable[transformCacheID] = 0xffffffff;
mHostTransformCacheIdToActorTable[transformCacheID] = NULL;
mHasShapeInstanceChanged = true;
}
void PxgShapeManager::scheduleCopyHtoD(PxgCopyManager& copyManager, PxCudaContext* cudaContext, CUstream stream)
{
PX_UNUSED(copyManager);
const PxU32 maxGrouping = 16;
if (mHasShapeChanged)
{
mHasShapeChanged = false;
if (mResizeRequired)
{
//Allocate and copy data across
mGpuShapesBuffer.allocateCopyOldDataAsync(sizeof(PxgShape)*mHostShapes.capacity(), cudaContext, stream, PX_FL);
mResizeRequired = false;
}
const PxU32* bits = mDirtyShapeMap.getWords();
if (bits)
{
const PxU32 totalNumOfShapes = mMaxShapeId + 1;
const PxU32 numShapes = (totalNumOfShapes + 3) &(~3);
//pad the dirty map up to the next multiple of 4 so the x4-sized range is uploaded in full
for (PxU32 i = totalNumOfShapes; i < numShapes; ++i)
{
mDirtyShapeMap.growAndSet(i);
}
// PT: ### bitmap iterator pattern
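// Sketch of the pattern: b = bits[w] holds 32 dirty flags, PxLowestSetBit(b)
// finds the lowest set bit, (w << 5) | bit reconstructs the global id, and
// b &= (b - 1) strips that bit. E.g. for w = 2, b = 0b0110 the loop yields
// ids 65 then 66 before b reaches zero.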
const PxU32 lastSetBit = mDirtyShapeMap.findLast();
for (PxU32 w = 0; w <= lastSetBit >> 5; ++w)
{
//b&=b-1 will clear the lowest set bit in b
for (PxU32 b = bits[w]; b; )
{
//dirtyId is the next bit that's set to 1!
const PxU32 dirtyId = PxU32(w << 5 | PxLowestSetBit(b));
void* hostPtr = mHostShapes.begin() + dirtyId;
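// The host arrays were allocated from mMappedMemoryAllocators, so
// getMappedDevicePtr presumably returns the device-visible alias of this
// pinned host pointer for the deferred copy to read from.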
PxgCopyManager::CopyDesc desc;
desc.source = reinterpret_cast<size_t>(getMappedDevicePtr(cudaContext, hostPtr));
desc.dest = reinterpret_cast<size_t>(reinterpret_cast<PxU8*>(mGpuShapesBuffer.getDevicePtr()) + dirtyId * sizeof(PxgShape));
desc.bytes = sizeof(PxgShape);
mDirtyShapeMap.reset(dirtyId);
//Now we loop to try and find adjacent bits that are set...
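// Runs of adjacent dirty shapes (up to maxGrouping = 16) are coalesced into
// one larger CopyDesc: a slightly bigger transfer in exchange for far fewer
// deferred HtoD operations.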
PxU32 currIdx = dirtyId + 1;
PxU32 groupSize = 1;
while (currIdx <= lastSetBit && mDirtyShapeMap.test(currIdx) && groupSize < maxGrouping)
{
groupSize++;
mDirtyShapeMap.reset(currIdx);
currIdx++;
desc.bytes += sizeof(PxgShape);
}
if (currIdx != (dirtyId + 1))
{
//get the word from the current bit
w = PxMin(currIdx, lastSetBit) >> 5;
//reload the word at the new position; the reset() calls above already cleared
//the bits this group consumed, so there is nothing left to strip with b &= (b - 1)
b = bits[w];
}
else
{
b &= (b - 1);
}
copyManager.pushDeferredHtoD(desc);
}
}
}
mDirtyShapeMap.clear();
}
if (mHasShapeInstanceChanged)
{
//AD: mHasShapeInstanceChanged needs to persist because computeRigidsToShapes() needs to run if we use direct-API
// we lower the flag in PxgNarrowphaseCore::prepareGpuNarrowphase.
// AD: the resize of the GPU transform cache is inside PxgNarrowphaseCore::prepareGpuNarrowphase.
if (mTransformCacheResizeRequired)
{
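// The persistent buffers below grow with their old contents copied across
// asynchronously; the two temp buffers are simply reallocated. Only the newly
// added tail [oldCapacity, newSize) is then cleared to the invalid pattern
// (memsetD32Async writes 32-bit words, hence the byte count / sizeof(PxU32)).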
PxU64 oldCapacity = mGpuShapesRemapTableBuffer.getSize();
mGpuShapesRemapTableBuffer.allocateCopyOldDataAsync(sizeof(PxNodeIndex)*mHostShapesRemapTable.capacity(), cudaContext, stream, PX_FL);
cudaContext->memsetD32Async(mGpuShapesRemapTableBuffer.getDevicePtr() + oldCapacity, 0xFFFFFFFF, (mGpuShapesRemapTableBuffer.getSize() - oldCapacity) / sizeof(PxU32), stream);
oldCapacity = mGpuRigidIndiceBuffer.getSize();
mGpuRigidIndiceBuffer.allocateCopyOldDataAsync(sizeof(PxNodeIndex) * mHostShapesRemapTable.capacity(), cudaContext, stream, PX_FL);
cudaContext->memsetD32Async(mGpuRigidIndiceBuffer.getDevicePtr() + oldCapacity, 0xFFFFFFFF, (mGpuRigidIndiceBuffer.getSize() - oldCapacity) / sizeof(PxU32), stream);
mGpuTempRigidIndiceBuffer.allocate(sizeof(PxNodeIndex) * mHostShapesRemapTable.capacity(), PX_FL);
oldCapacity = mGpuShapeIndiceBuffer.getSize();
mGpuShapeIndiceBuffer.allocateCopyOldDataAsync(sizeof(PxU32) * mHostShapeIdTable.capacity(), cudaContext, stream, PX_FL);
cudaContext->memsetD32Async(mGpuShapeIndiceBuffer.getDevicePtr() + oldCapacity, 0xFFFFFFFF, (mGpuShapeIndiceBuffer.getSize() - oldCapacity) / sizeof(PxU32), stream);
oldCapacity = mGpuUnsortedShapeIndicesBuffer.getSize();
mGpuUnsortedShapeIndicesBuffer.allocateCopyOldDataAsync(sizeof(PxU32) * mHostShapeIdTable.capacity(), cudaContext, stream, PX_FL);
cudaContext->memsetD32Async(mGpuUnsortedShapeIndicesBuffer.getDevicePtr() + oldCapacity, 0xFFFFFFFF, (mGpuUnsortedShapeIndicesBuffer.getSize() - oldCapacity) / sizeof(PxU32), stream);
mGpuTempRigidBitIndiceBuffer.allocate(sizeof(PxU32) * mHostShapeIdTable.capacity(), PX_FL);
oldCapacity = mGpuTransformCacheIdToActorTableBuffer.getSize();
mGpuTransformCacheIdToActorTableBuffer.allocateCopyOldDataAsync(sizeof(PxActor*) * mHostTransformCacheIdToActorTable.capacity(), cudaContext, stream, PX_FL);
cudaContext->memsetD32Async(mGpuTransformCacheIdToActorTableBuffer.getDevicePtr() + oldCapacity, 0, (mGpuTransformCacheIdToActorTableBuffer.getSize() - oldCapacity) / sizeof(PxActor*), stream);
mTransformCacheResizeRequired = false;
}
const PxU32 totalNumOfShapeInstances = mMaxTransformCacheID + 1;
const PxU32 numShapeInstances = (totalNumOfShapeInstances + 3) &(~3);
//pad up to the next multiple of 4, resetting the padding entries to invalid markers
for (PxU32 i = totalNumOfShapeInstances; i < numShapeInstances; ++i)
{
if (!mHostShapesRemapTable[i].isStaticBody())
{
mDirtyTransformCacheMap.growAndSet(i);
mHostShapesRemapTable[i] = PxNodeIndex(PX_INVALID_NODE);
mHostShapeIdTable[i] = 0xffffffff;
mHostTransformCacheIdToActorTable[i] = NULL;
}
}
const PxU32* bits = mDirtyTransformCacheMap.getWords();
if (bits)
{
// PT: ### bitmap iterator pattern
const PxU32 lastSetBit = mDirtyTransformCacheMap.findLast();
for (PxU32 w = 0; w <= lastSetBit >> 5; ++w)
{
//b&=b-1 will clear the lowest set bit in b
for (PxU32 b = bits[w]; b; )
{
//dirtyId is the next bit that's set to 1!
const PxU32 dirtyId = PxU32(w << 5 | PxLowestSetBit(b));
void* hostRemapPtr = mHostShapesRemapTable.begin() + dirtyId;
void* hostShapeIdPtr = mHostShapeIdTable.begin() + dirtyId;
void* hostTransformCacheIdToActorPtr = mHostTransformCacheIdToActorTable.begin() + dirtyId;
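// One dirty transform cache id updates four parallel GPU tables, so four copy
// descriptors are built over the same id range and grown together below.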
PxgCopyManager::CopyDesc desc1;
desc1.source = reinterpret_cast<size_t>(getMappedDevicePtr(cudaContext, hostRemapPtr));
desc1.dest = reinterpret_cast<size_t>(reinterpret_cast<PxU8*>(mGpuShapesRemapTableBuffer.getDevicePtr()) + dirtyId * sizeof(PxNodeIndex));
desc1.bytes = sizeof(PxNodeIndex);
PxgCopyManager::CopyDesc desc2;
desc2.source = reinterpret_cast<size_t>(getMappedDevicePtr(cudaContext, hostRemapPtr));
desc2.dest = reinterpret_cast<size_t>(reinterpret_cast<PxU8*>(mGpuRigidIndiceBuffer.getDevicePtr()) + dirtyId * sizeof(PxNodeIndex));
desc2.bytes = sizeof(PxNodeIndex);
PxgCopyManager::CopyDesc desc3;
desc3.source = reinterpret_cast<size_t>(getMappedDevicePtr(cudaContext, hostShapeIdPtr));
desc3.dest = reinterpret_cast<size_t>(reinterpret_cast<PxU8*>(mGpuUnsortedShapeIndicesBuffer.getDevicePtr()) + dirtyId * sizeof(PxU32));
desc3.bytes = sizeof(PxU32);
PxgCopyManager::CopyDesc desc4;
desc4.source = reinterpret_cast<size_t>(getMappedDevicePtr(cudaContext, hostTransformCacheIdToActorPtr));
desc4.dest = reinterpret_cast<size_t>(reinterpret_cast<PxU8*>(mGpuTransformCacheIdToActorTableBuffer.getDevicePtr()) + dirtyId * sizeof(PxActor*));
desc4.bytes = sizeof(PxActor*);
mDirtyTransformCacheMap.reset(dirtyId);
//Now we loop to try and find adjacent bits that are set...
PxU32 currIdx = dirtyId + 1;
PxU32 groupSize = 1;
while (currIdx <= lastSetBit && mDirtyTransformCacheMap.test(currIdx) && groupSize < maxGrouping)
{
groupSize++;
mDirtyTransformCacheMap.reset(currIdx);
currIdx++;
desc1.bytes += sizeof(PxNodeIndex);
desc2.bytes += sizeof(PxNodeIndex);
desc3.bytes += sizeof(PxU32);
desc4.bytes += sizeof(PxActor*);
}
if (currIdx != (dirtyId + 1))
{
//get the word from the current bit
w = PxMin(currIdx, lastSetBit) >> 5;
//reload the word at the new position; the reset() calls above already cleared
//the bits this group consumed, so there is nothing left to strip with b &= (b - 1)
b = bits[w];
}
else
{
b &= (b - 1);
}
copyManager.pushDeferredHtoD(desc1);
copyManager.pushDeferredHtoD(desc2);
copyManager.pushDeferredHtoD(desc3);
copyManager.pushDeferredHtoD(desc4);
}
}
}
mDirtyTransformCacheMap.clear();
}
}
void PxgShapeManager::updateShapeMaterial(const PxU32 materialIndex, const PxU32 id)
{
PX_ASSERT(id < mHostShapes.size());
mHostShapes[id].materialIndex = materialIndex;
mDirtyShapeMap.growAndSet(id);
mHasShapeChanged = true;
}
////////////////////////////////////////////////////////////////////////////////////////////
PxgMaterialManager::PxgMaterialManager(PxgHeapMemoryAllocatorManager* heapManager, const PxU32 elemSize) :
mGpuMaterialBuffer(heapManager, PxsHeapStats::eNARROWPHASE),
mHeapManager(heapManager),
mHostMaterial(PxVirtualAllocator(heapManager->mMappedMemoryAllocators, PxsHeapStats::eNARROWPHASE))
{
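// Room for 128 materials of elemSize bytes each, mirrored in mapped host
// memory and in the device buffer.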
const PxU32 originalSize = elemSize * 128;
mHostMaterial.forceSize_Unsafe(0);
mHostMaterial.reserve(originalSize);
mHostMaterial.forceSize_Unsafe(originalSize);
mGpuMaterialBuffer.allocate(originalSize, PX_FL);
mResizeRequired = false;
}
PxU32 PxgMaterialManager::registerMaterial(const PxU8* materialData, const PxU32 elemSize)
{
const PxU32 shapeId = mIdPool.getNewID();
PxU32 capacity = mHostMaterial.capacity() / elemSize;
if (shapeId >= capacity)
{
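// Geometric growth: at least double the current capacity (+1 covers the
// capacity == 0 case) and never less than the new id requires.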
capacity = PxMax(capacity * 2 + 1, shapeId + 1);
mHostMaterial.resize(capacity * elemSize);
mResizeRequired = true;
}
PxU8* destPtr = mHostMaterial.begin() + shapeId * elemSize;
PxMemCopy(destPtr, materialData, elemSize);
mDirtyMaterialMap.growAndSet(shapeId);
return shapeId;
}
void PxgMaterialManager::unregisterMaterial(const PxU32 id)
{
mDirtyMaterialMap.reset(id);
mIdPool.deferredFreeID(id);
}
void PxgMaterialManager::scheduleCopyHtoD(PxgCopyManager& copyManager, PxCudaContext* cudaContext,
CUstream stream, const PxU32 elemSize)
{
if (mResizeRequired)
{
mGpuMaterialBuffer.allocateCopyOldDataAsync(mHostMaterial.capacity(), cudaContext, stream, PX_FL);
mResizeRequired = false;
}
const PxU32* bits = mDirtyMaterialMap.getWords();
const PxU32 maxGrouping = 16;
if (bits)
{
// PT: ### bitmap iterator pattern
const PxU32 lastSetBit = mDirtyMaterialMap.findLast();
for (PxU32 w = 0; w <= lastSetBit >> 5; ++w)
{
//b&=b-1 will clear the lowest set bit in b
for (PxU32 b = bits[w]; b; )
{
//dirtyId is the next bit that's set to 1!
const PxU32 dirtyId = PxU32(w << 5 | PxLowestSetBit(b));
void* hostPtr = mHostMaterial.begin() + dirtyId * elemSize;
PxgCopyManager::CopyDesc desc;
desc.source = reinterpret_cast<size_t>(getMappedDevicePtr(cudaContext, hostPtr));
desc.dest = reinterpret_cast<size_t>(reinterpret_cast<PxU8*>(mGpuMaterialBuffer.getDevicePtr()) + dirtyId * elemSize);
desc.bytes = elemSize;
mDirtyMaterialMap.reset(dirtyId);
//Now we loop to try and find adjacent bits that are set...
PxU32 currIdx = dirtyId + 1;
PxU32 groupSize = 1;
while (currIdx <= lastSetBit && mDirtyMaterialMap.test(currIdx) && (groupSize < maxGrouping))
{
groupSize++;
mDirtyMaterialMap.reset(currIdx);
currIdx++;
desc.bytes += elemSize;
}
if (currIdx != (dirtyId + 1))
{
//get the word from the current bit
w = PxMin(currIdx, lastSetBit) >> 5;
//reload the word at the new position; the reset() calls above already cleared
//the bits this group consumed, so there is nothing left to strip with b &= (b - 1)
b = bits[w];
}
else
{
b &= (b - 1);
}
copyManager.pushDeferredHtoD(desc);
}
}
}
mDirtyMaterialMap.clear();
}
void PxgMaterialManager::updateMaterial(const PxU8* materialCore, const PxU32 elemSize, const PxU32 id)
{
PX_ASSERT(id < mHostMaterial.size());
PxU8* destptr = reinterpret_cast<PxU8*>(mHostMaterial.begin() + id * elemSize);
PxMemCopy(destptr, materialCore, elemSize);
//mHostMaterial[id] = materialCore;
mDirtyMaterialMap.growAndSet(id);
}
//////////////////////////////////////////////////////////////////////////////////////////////
PxgFEMMaterialManager::PxgFEMMaterialManager(PxgHeapMemoryAllocatorManager* heapManager, const PxU32 elemSize) :
PxgMaterialManager(heapManager, elemSize)
{
}
void PxgFEMMaterialManager::scheduleCopyHtoD(PxgCopyManager& copyManager, PxCudaContext* cudaContext,
CUstream stream, const PxU32 elemSize)
{
if (mResizeRequired)
{
mGpuMaterialBuffer.allocateCopyOldDataAsync(mHostMaterial.capacity(), cudaContext, stream, PX_FL);
mResizeRequired = false;
}
const PxU32* bits = mDirtyMaterialMap.getWords();
const PxU32 maxGrouping = 16;
if (bits)
{
// PT: ### bitmap iterator pattern
const PxU32 lastSetBit = mDirtyMaterialMap.findLast();
for (PxU32 w = 0; w <= lastSetBit >> 5; ++w)
{
//b&=b-1 will clear the lowest set bit in b
for (PxU32 b = bits[w]; b; )
{
//dirtyId is the next bit that's set to 1!
const PxU32 dirtyId = PxU32(w << 5 | PxLowestSetBit(b));
void* hostPtr = mHostMaterial.begin() + dirtyId * elemSize;
PxgCopyManager::CopyDesc desc;
desc.source = reinterpret_cast<size_t>(getMappedDevicePtr(cudaContext, hostPtr));
desc.dest = reinterpret_cast<size_t>(reinterpret_cast<PxU8*>(mGpuMaterialBuffer.getDevicePtr()) + dirtyId * elemSize);
desc.bytes = elemSize;
mDirtyMaterialMap.reset(dirtyId);
//Now we loop to try and find adjacent bits that are set...
PxU32 currIdx = dirtyId + 1;
PxU32 groupSize = 1;
while (currIdx <= lastSetBit && mDirtyMaterialMap.test(currIdx) && (groupSize < maxGrouping))
{
groupSize++;
mDirtyMaterialMap.reset(currIdx);
currIdx++;
desc.bytes += elemSize;
}
if (currIdx != (dirtyId + 1))
{
//get the word from the current bit
w = PxMin(currIdx, lastSetBit) >> 5;
//reload the word at the new position; the reset() calls above already cleared
//the bits this group consumed, so there is nothing left to strip with b &= (b - 1)
b = bits[w];
}
else
{
b &= (b - 1);
}
copyManager.pushDeferredHtoD(desc);
}
}
}
mDirtyMaterialMap.clear();
}
////////////////////////////////////////////////////////////////////////////////////////////
PxgFEMSoftBodyMaterialManager::PxgFEMSoftBodyMaterialManager(PxgHeapMemoryAllocatorManager* heapManager) :
PxgFEMMaterialManager(heapManager, sizeof(PxsDeformableVolumeMaterialData))
{
}