// Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved. // Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. // Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
#include "PxgContext.h"
#include "cudamanager/PxCudaContext.h"
#include "common/PxProfileZone.h"
#include "PxgIslandContext.h"
#include "PxgSolverCore.h"
#include "PxvSimStats.h"
#include "DyConstraintPrep.h"
#include "PxgArticulationCore.h"
#include "PxgSoftBodyCore.h"
#include "PxgFEMClothCore.h"
#include "DyDeformableSurface.h"
#include "DyDeformableVolume.h"
#include "PxgSimulationCore.h"
#include "PxgPBDParticleSystemCore.h"
#include "DyIslandManager.h"
#include "CmFlushPool.h"

// PT: TODO: this doesn't compile anymore these days
//#undef PXG_CONTACT_VALIDATION
//#define PXG_CONTACT_VALIDATION 1

namespace physx
{

#if PXG_CONTACT_VALIDATION
#pragma warning(push)
#pragma warning(disable:4100)
	// Debug helper: walks the contact manager outputs referenced by uniqueIds[startIndex, endIndex) and
	// asserts that patch/contact pointers lie within a fixed distance of the stream bases, that every
	// patch has a normalized normal and a valid start index, that every contact is finite, and that the
	// per-patch contact counts add up to output.nbContacts. Always returns true; failures surface through
	// PX_ASSERT only.
	static bool validateContactPairs(PxU32 startIndex, PxU32 endIndex, PxU32* uniqueIds, PxU32* npIds, PxsContactManagerOutputIterator& outputIter,
		PxU8* basePatchPointer, PxU8* baseContactPointer)
	{
		for (PxU32 a = startIndex; a < endIndex; ++a)
		{
			PxU32 uniqueId = uniqueIds[a];
			PxU32 npId = npIds[uniqueId];
			PxsContactManagerOutput& output = outputIter.getContactManagerOutput(npId);

			// The cast targets follow from the declared types of the receiving pointers.
			PxContactPatch* contactPatches = reinterpret_cast<PxContactPatch*>(output.contactPatches);
			PxContact* contacts = reinterpret_cast<PxContact*>(output.contactPoints);

			PX_ASSERT((contactPatches - reinterpret_cast<PxContactPatch*>(basePatchPointer)) < 655360);
			PX_ASSERT((contacts - reinterpret_cast<PxContact*>(baseContactPointer)) < (3145728));

			PX_ASSERT(output.nbPatches != 0);

			PxU32 totalContact = 0;
			for (PxU32 i = 0; i < output.nbPatches; ++i)
			{
				PxContactPatch& patch = contactPatches[i];
				PX_ASSERT(patch.startContactIndex < output.nbContacts);
				PX_ASSERT(patch.normal.isNormalized());
				totalContact += patch.nbContacts;
			}

			for (PxU32 i = 0; i < output.nbContacts; ++i)
			{
				PX_ASSERT(contacts[i].contact.isFinite());
				PX_ASSERT(PxIsFinite(contacts[i].separation));
			}

			PX_ASSERT(totalContact == output.nbContacts);
		}
		return true;
	}

	// Debug helper: for every constraint referenced by uniqueIds[startIndex, endIndex), asserts that both
	// node indices are either PX_INVALID_NODE or below 16000, and that the solver body index they map to
	// is below 16000 as well. Always returns true; failures surface through PX_ASSERT only.
	static bool validateConstraintPairs(PxU32 startIndex, PxU32 endIndex, PxU32* uniqueIds, PxU32* npIds, PxgConstraintPrePrep* constraintPrePrep,
		PxU32* solverBodyIndices)
	{
		for (PxU32 a = startIndex; a < endIndex; ++a)
		{
			PxU32 uniqueId = uniqueIds[a];
			PxU32 npId = npIds[uniqueId];
			PxgConstraintPrePrep& prePrep = constraintPrePrep[npId];

			PX_ASSERT(prePrep.mNodeIndexA.index() == PX_INVALID_NODE || prePrep.mNodeIndexA.index() < 16000);
			PX_ASSERT(prePrep.mNodeIndexB.index() == PX_INVALID_NODE || prePrep.mNodeIndexB.index() < 16000);

			PX_ASSERT(prePrep.mNodeIndexA.index() == PX_INVALID_NODE || solverBodyIndices[prePrep.mNodeIndexA.index()] < 16000);
			PX_ASSERT(prePrep.mNodeIndexB.index() == PX_INVALID_NODE || solverBodyIndices[prePrep.mNodeIndexB.index()] < 16000);
		}
		return true;
	}
#pragma warning(pop)
#endif

	// Task that gathers, for the articulations in [mStartIndex, mEndIndex), the unique ids of their
	// static contacts/joints and self contacts/joints from the body sim manager into flat output arrays.
	//
	// Output layout: the count for articulation i is stored at counts[i]; the a-th id of articulation i is
	// stored at indices[i + a * mNbArticulations], i.e. ids of consecutive articulations are interleaved
	// with a stride equal to the total articulation count. The caller must size the index arrays to
	// (max entries per articulation) * mNbArticulations.
	class PxgBatchArticulationStaticConstraintPrePrepTask : public Cm::Task
	{
		PX_NOCOPY(PxgBatchArticulationStaticConstraintPrePrepTask)

	private:
		PxU32* mStaticContactIndices;
		PxU32* mStaticJointIndices;
		PxU32* mStaticContactCounts;
		PxU32* mStaticJointCounts;

		PxU32* mSelfContactIndices;
		PxU32* mSelfJointIndices;
		PxU32* mSelfContactCounts;
		PxU32* mSelfJointCounts;

		const PxU32 mStartIndex;			// first articulation (inclusive) handled by this task
		const PxU32 mEndIndex;				// last articulation (exclusive) handled by this task
		PxNodeIndex* mNodeIndices;			// node index per articulation, indexed like the count arrays
		PxgBodySimManager& mBodyManager;
		const PxU32 mNbArticulations;		// total articulation count, used as the interleave stride

	public:
		// Number of articulations processed by one task instance.
		static const PxU32 NbPerTask = 512;

		PxgBatchArticulationStaticConstraintPrePrepTask(PxU64 context, PxU32* staticContactIndices, PxU32* staticJointIndices,
			PxU32* staticContactCounts, PxU32* staticJointCounts, PxU32* selfContactIndices, PxU32* selfJointIndices,
			PxU32* selfContactCounts, PxU32* selfJointCounts, PxU32 startIndex, PxU32 endIndex, PxNodeIndex* nodeIndices,
			PxgBodySimManager& bodyManager, PxU32 nbArticulations) :
			Cm::Task(context),
			mStaticContactIndices(staticContactIndices),
			mStaticJointIndices(staticJointIndices),
			mStaticContactCounts(staticContactCounts),
			mStaticJointCounts(staticJointCounts),
			mSelfContactIndices(selfContactIndices),
			mSelfJointIndices(selfJointIndices),
			mSelfContactCounts(selfContactCounts),
			mSelfJointCounts(selfJointCounts),
			mStartIndex(startIndex),
			mEndIndex(endIndex),
			mNodeIndices(nodeIndices),
			mBodyManager(bodyManager),
			mNbArticulations(nbArticulations)
		{
		}

		virtual const char* getName() const PX_OVERRIDE PX_FINAL
		{
			return "PxgBatchArticulationStaticConstraintPrePrepTask";
		}

		virtual void runInternal() PX_OVERRIDE PX_FINAL
		{
			const PxU32 stride = mNbArticulations;
			//const PxU32 blockCount = (mNbArticulations + 31)/32;

			for (PxU32 i = mStartIndex; i < mEndIndex; ++i)
			{
				const PxU32 nodeIndex = mNodeIndices[i].index();

				// Static constraints are looked up by node index.
				PxgStaticConstraints& staticConstraints = mBodyManager.mStaticConstraints[nodeIndex];

				const PxU32 staticContactCount = staticConstraints.mStaticContacts.size();
				PxgStaticConstraint* uniqueIds = staticConstraints.mStaticContacts.begin();
				mStaticContactCounts[i] = staticContactCount;
				for (PxU32 a = 0, offset = i; a < staticContactCount; ++a, offset += stride)
				{
					mStaticContactIndices[offset] = uniqueIds[a].uniqueId;
				}

				const PxU32 staticJointCount = staticConstraints.mStaticJoints.size();
				uniqueIds = staticConstraints.mStaticJoints.begin();
				mStaticJointCounts[i] = staticJointCount;
				for (PxU32 a = 0, offset = i; a < staticJointCount; ++a, offset += stride)
				{
					mStaticJointIndices[offset] = uniqueIds[a].uniqueId;
				}

				// Self constraints are looked up through the node -> remap index table.
				const PxU32 articIndex = mBodyManager.mNodeToRemapMap[nodeIndex];
				PxgArticulationSelfConstraints& selfConstraints = mBodyManager.mArticulationSelfConstraints[articIndex];

				const PxU32 selfContactCount = selfConstraints.mSelfContacts.size();
				PxgSelfConstraint* selfIds = selfConstraints.mSelfContacts.begin();
				mSelfContactCounts[i] = selfContactCount;
				for (PxU32 a = 0, offset = i; a < selfContactCount; ++a, offset += stride)
				{
					mSelfContactIndices[offset] = selfIds[a].uniqueId;
				}

				const PxU32 selfJointCount = selfConstraints.mSelfJoints.size();
				selfIds = selfConstraints.mSelfJoints.begin();
				mSelfJointCounts[i] = selfJointCount;
				for (PxU32 a = 0, offset = i; a < selfJointCount; ++a, offset += stride)
				{
					mSelfJointIndices[offset] = selfIds[a].uniqueId;
				}
			}
		}
	};

	// Task that gathers, for the rigid bodies in [mStartIndex, mEndIndex), the unique ids of their static
	// contacts and static joints from the body sim manager into flat output arrays.
	//
	// Output layout matches the articulation variant: counts[i] holds the count for body i and the a-th id
	// of body i is stored at indices[i + a * mNbBodies].
	class PxgBatchRigidStaticConstraintPrePrepTask : public Cm::Task
	{
		PX_NOCOPY(PxgBatchRigidStaticConstraintPrePrepTask)

	private:
		PxU32* mStaticContactIndices;
		PxU32* mStaticJointIndices;
		PxU32* mStaticContactCounts;
		PxU32* mStaticJointCounts;

		const PxU32 mStartIndex;			// first body (inclusive) handled by this task
		const PxU32 mEndIndex;				// last body (exclusive) handled by this task
		PxNodeIndex* mNodeIndices;			// node index per body, indexed like the count arrays
		PxgBodySimManager& mBodyManager;
		const PxU32 mNbBodies;				// total body count, used as the interleave stride

	public:
		// Number of rigid bodies processed by one task instance.
		static const PxU32 NbPerTask = 256;

		PxgBatchRigidStaticConstraintPrePrepTask(PxU64 context, PxU32* staticContactIndices, PxU32* staticJointIndices,
			PxU32* staticContactCounts, PxU32* staticJointCounts, PxU32 startIndex, PxU32 endIndex, PxNodeIndex* nodeIndices,
			PxgBodySimManager& bodyManager, PxU32 nbBodies) :
			Cm::Task(context),
			mStaticContactIndices(staticContactIndices),
			mStaticJointIndices(staticJointIndices),
			mStaticContactCounts(staticContactCounts),
			mStaticJointCounts(staticJointCounts),
			mStartIndex(startIndex),
			mEndIndex(endIndex),
			mNodeIndices(nodeIndices),
			mBodyManager(bodyManager),
			mNbBodies(nbBodies)
		{
		}

		virtual const char* getName() const PX_OVERRIDE PX_FINAL
		{
			return "PxgBatchRigidStaticConstraintPrePrepTask";
		}

		virtual void runInternal() PX_OVERRIDE PX_FINAL
		{
			const PxU32 stride = mNbBodies;

			for (PxU32 i = mStartIndex; i < mEndIndex; ++i)
			{
				const PxU32 nodeIndex = mNodeIndices[i].index();
				PxgStaticConstraints& staticConstraints = mBodyManager.mStaticConstraints[nodeIndex];

				const PxU32 staticContactCount = staticConstraints.mStaticContacts.size();
				PxgStaticConstraint* uniqueIds = staticConstraints.mStaticContacts.begin();
				mStaticContactCounts[i] = staticContactCount;
				for (PxU32 a = 0, offset = i; a < staticContactCount; ++a, offset += stride)
				{
					mStaticContactIndices[offset] = uniqueIds[a].uniqueId;
				}

				const PxU32 staticJointCount = staticConstraints.mStaticJoints.size();
				uniqueIds = staticConstraints.mStaticJoints.begin();
				mStaticJointCounts[i] = staticJointCount;
				for (PxU32 a = 0, offset = i; a < staticJointCount; ++a, offset += stride)
				{
					mStaticJointIndices[offset] = uniqueIds[a].uniqueId;
				}
			}
		}
	};

	// Fills mNumBatches batch headers for rigid 1D (joint) constraints, each covering up to PXG_BATCH_SIZE
	// of this task's mNumEdges edges (the last batch takes the remainder), then copies the task's edge ids
	// from mEdgeIds[mStartEdgeIdx...] into the pinned edge id buffer starting at mUniqueIdStartIndex.
	void PxgCpuConstraintPrePrepTask::runInternal()
	{
		PX_PROFILE_ZONE("GpuDynamics.PxgCpuJointPrePrepTask", 0);

		PxU32 currentEdgeIndex = 0;
		for (PxU32 a = 0; a < mNumBatches; ++a)
		{
			PxU32 descStride = PxMin(mNumEdges - currentEdgeIndex, PXG_BATCH_SIZE);

			PxgConstraintBatchHeader& batchHeader = mBatchHeaders[a];
			batchHeader.constraintType = PxgSolverConstraintDesc::eCONSTRAINT_1D;
			batchHeader.mDescStride = PxU16(descStride);
			batchHeader.mConstraintBatchIndex = mConstraintBlockStartIndex + a;
			batchHeader.mStartPartitionIndex = mUniqueIdStartIndex + a * PXG_BATCH_SIZE;
			batchHeader.mask = 0xFFFFFFFF; //Unused

#if PXG_CONTACT_VALIDATION
			validateConstraintPairs(a, a + descStride, mEdgeIds + a, mNpIds, mConstraintPrePrep, mSolverBodyIndices);
#endif
			currentEdgeIndex += descStride;
		}

		for (PxU32 a = 0; a < mNumEdges; ++a)
		{
			mPinnedEdgeIds[mUniqueIdStartIndex + a] = mEdgeIds[a + mStartEdgeIdx];
		}
		//PxMemCopy(mPinnedEdgeIds + mUniqueIdStartIndex, mEdgeIds, sizeof(PxU32) * mNumEdges);
	}

	// Articulation counterpart of PxgCpuConstraintPrePrepTask::runInternal(): identical batching and edge
	// id copy, except that the batch constraint type is eARTICULATION_CONTACT when mIsContact is set and
	// eARTICULATION_CONSTRAINT_1D otherwise.
	void PxgCpuArtiConstraintPrePrepTask::runInternal()
	{
		PX_PROFILE_ZONE("GpuDynamics.PxgCpuArtiJointPrePrepTask", 0);

		PxU32 currentEdgeIndex = 0;
		for (PxU32 a = 0; a < mNumBatches; ++a)
		{
			PxgConstraintBatchHeader& batchHeader = mBatchHeaders[a];

			PxU32 descStride = PxMin(mNumEdges - currentEdgeIndex, PXG_BATCH_SIZE);

			batchHeader.constraintType = PxU16(mIsContact ? PxgSolverConstraintDesc::eARTICULATION_CONTACT : PxgSolverConstraintDesc::eARTICULATION_CONSTRAINT_1D);
			batchHeader.mDescStride = PxU16(descStride);
			batchHeader.mConstraintBatchIndex = mConstraintBlockStartIndex + a;
			batchHeader.mStartPartitionIndex = mUniqueIdStartIndex + a * PXG_BATCH_SIZE;
			batchHeader.mask = 0xFFFFFFFF; //Unused

#if PXG_CONTACT_VALIDATION
			validateConstraintPairs(a, a + descStride, mEdgeIds + a, mNpIds, mConstraintPrePrep, mSolverBodyIndices);
#endif
			currentEdgeIndex += descStride;
		}

		for (PxU32 a = 0; a < mNumEdges; ++a)
		{
			mPinnedEdgeIds[mUniqueIdStartIndex + a] = mEdgeIds[a + mStartEdgeIdx];
		}
		//PxMemCopy(mPinnedEdgeIds + mUniqueIdStartIndex, mEdgeIds, sizeof(PxU32) * mNumEdges);
	}

	// Forwards to the context's common constraint pre-prep, passing this task's continuation.
	void PxgCpuPrepTask::runInternal()
	{
		mContext.doConstraintPrePrepCommon(mCont);
	}

	// Constructs the GPU dynamics context. All pinned arrays are bound to the heap manager's mapped memory
	// allocators; the GPU core pointers are left NULL here and are expected to be assigned after
	// construction. heapMemoryManager is dereferenced unconditionally and must not be NULL.
	PxgGpuContext::PxgGpuContext(Cm::FlushPool& flushPool,
		IG::SimpleIslandManager& islandManager,
		PxU32 maxNumPartitions, PxU32 maxNumStaticPartitions,
		bool enableStabilization, bool useEnhancedDeterminism,
		PxReal maxBiasCoefficient,
		PxvSimStats& simStats, PxgHeapMemoryAllocatorManager* heapMemoryManager,
		PxReal lengthScale, bool enableDirectGPUAPI, PxU64 contextID, bool isResidualReportingEnabled, bool isTGS) :
		Dy::Context(islandManager, heapMemoryManager->mMappedMemoryAllocators, simStats, enableStabilization, useEnhancedDeterminism, maxBiasCoefficient, lengthScale, contextID, isResidualReportingEnabled),
		mTotalEdges(0),
		mTotalPreviousEdges(0),
		mFlushPool(flushPool),
		mSolvedThisFrame(false),
		mIncrementalPartition(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators), maxNumPartitions, contextID),
		mActiveNodeIndex(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mSolverBodyPool(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mBody2WorldPool(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mLinkAndJointAndRootStateDataPool(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mArticulationSleepDataPool(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mInternalResidualPerArticulationVelIter(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mInternalResidualPerArticulationPosIter(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		m1dConstraintBatchIndices(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mContactConstraintBatchIndices(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mArti1dConstraintBatchIndices(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mArtiContactConstraintBatchIndices(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mConstraintsPerPartition(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mArtiConstraintsPerPartition(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mSolverBodyDataPool(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mSolverBodySleepDataPool(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mSolverTxIDataPool(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mCachedPositionIterations(0),
		mCachedVelocityIterations(0),
		mArtiStaticContactCounts(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mArtiStaticJointCounts(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mArtiStaticContactIndices(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mArtiStaticJointIndices(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mArtiSelfContactCounts(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mArtiSelfJointCounts(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mArtiSelfContactIndices(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mArtiSelfJointIndices(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mRigidStaticContactCounts(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mRigidStaticJointCounts(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mRigidStaticContactIndices(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mRigidStaticJointIndices(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mNodeIndicesStagingBuffer(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mIslandIds(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mIslandStaticTouchCounts(PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators)),
		mIsTGS(isTGS),
		mIsExternalForcesEveryTgsIterationEnabled(false),
		mEnableDirectGPUAPI(enableDirectGPUAPI),
		mRecomputeArticulationBlockFormat(false),
		mEnforceConstraintWriteBackToHostCopy(false),
		mPreIntegrationTask	(*this),
		mPrepTask			(*this),
		mGpuPrePrepTask		(*this),
		mGpuIntegrationTask	(*this),
		mGpuTask			(*this),
		mPostSolveTask		(*this)
	{
		mGpuArticulationCore = NULL;
		mGpuBp = NULL;
		mGpuNpCore = NULL;
		mGpuSoftBodyCore = NULL;
		mGpuFEMClothCore = NULL;
		mGpuSimulationCore = NULL;
		mGpuSolverCore = NULL;
		mGpuPBDParticleSystemCore = NULL;
		mMaxNumStaticPartitions = maxNumStaticPartitions;
	}

	// Releases the stream allocators and solver streams while the solver core's context is acquired, then
	// deletes the threshold streams and the articulation and solver cores.
	// NOTE(review): mGpuSolverCore is dereferenced without a NULL check although the constructor
	// initializes it to NULL; this relies on the solver core having been created before destruction.
	PxgGpuContext::~PxgGpuContext()
	{
		mGpuSolverCore->acquireContext();

		PX_DELETE(mPinnedMemoryAllocator);
		PX_DELETE(mContactStreamAllocators[0]);
		PX_DELETE(mContactStreamAllocators[1]);
		PX_DELETE(mPatchStreamAllocators[0]);
		PX_DELETE(mPatchStreamAllocators[1]);
		PX_DELETE(mForceStreamAllocator);
		PX_DELETE(mFrictionPatchStreamAllocator);

		mGpuSolverCore->releaseStreams();

		mGpuSolverCore->releaseContext();

		PX_DELETE(mThresholdStream);
		PX_DELETE(mForceChangedThresholdStream);

		PX_DELETE(mGpuArticulationCore);
		PX_DELETE(mGpuSolverCore);
	}

	// Returns the stored simulation controller downcast to the GPU implementation. The cast is unchecked:
	// the controller passed to setSimulationController() must actually be a PxgSimulationController.
	PxgSimulationController* PxgGpuContext::getSimulationController()
	{
		return static_cast<PxgSimulationController*>(mSimulationController);
	}

	// Stores the simulation controller used by this context.
	void PxgGpuContext::setSimulationController(PxsSimulationController* simulationController)
	{
		mSimulationController = simulationController;
	}

	// Returns the PBD particle system core through its PxgParticleSystemCore interface.
	PxgParticleSystemCore* PxgGpuContext::getGpuParticleSystemCore()
	{
		return mGpuPBDParticleSystemCore;
	}

	// Swaps the active contact/patch stream (double buffering between allocators 0 and 1) and resets the
	// CPU and GPU write cursors of the contact, patch, force and friction patch stream pools to zero.
	void PxgGpuContext::mergeResults()
	{
		//Flip the current contact stream
		mCurrentContactStream = 1 - mCurrentContactStream;

		mContactStreamPool.mDataStream = mContactStreamAllocators[mCurrentContactStream]->mStart;
		mPatchStreamPool.mDataStream = mPatchStreamAllocators[mCurrentContactStream]->mStart;

		mContactStreamPool.mSharedDataIndex = 0;
		mPatchStreamPool.mSharedDataIndex = 0;
		mForceStreamPool.mSharedDataIndex = 0;
		mFrictionPatchStreamPool.mSharedDataIndex = 0;

		mContactStreamPool.mSharedDataIndexGPU = 0;
		mPatchStreamPool.mSharedDataIndexGPU = 0;
		mForceStreamPool.mSharedDataIndexGPU = 0;
		mFrictionPatchStreamPool.mSharedDataIndexGPU = 0;
	}

	// Forwards to the solver core to retrieve the base pointers of the contact, patch and
	// force/index streams through the three out-parameters.
	void PxgGpuContext::getDataStreamBase(void*& contactStreamBase, void*& patchStreamBase, void*& forceAndIndiceStreamBase)
	{
		return mGpuSolverCore->getDataStreamBase(contactStreamBase, patchStreamBase, forceAndIndiceStreamBase);
	}

	//this is the pre-prepare code for block format joints loaded from the non-block format joints
	void PxgGpuContext::doConstraintJointBlockPrePrepGPU()
	{
		//DMA the joint pre-prepare data which constructs in CPU(not D6Joint) to GPU

		// AD: This is not needed for direct-GPU API but downstream things are getting really complex and I cannot
		// figure out which count I need to adjust to avoid crashing.
		//if (!mEnableDirectGPUAPI)
		{
			PxgJointManager& jointManager = getSimulationController()->getJointManager();

			// Upload CPU-prepared rigid joint data only when there is any.
			if (jointManager.getCpuNbRigidConstraints() > 0)
			{
				mGpuSolverCore->gpuMemDMAUpJointData(jointManager.getCpuRigidConstraintData(), jointManager.getCpuRigidConstraintRows(),
					jointManager.getCpuRigidConstraintData().size(), jointManager.getGpuNbRigidConstraints(), PxU32(jointManager.mNbCpuRigidConstraintRows));
			}

			// Same for CPU-prepared articulation joint data.
			if (jointManager.getCpuNbArtiConstraints() > 0)
			{
				mGpuSolverCore->gpuMemDMAUpArtiJointData(jointManager.getCpuArtiConstraintData(), jointManager.getCpuArtiConstraintRows(),
					jointManager.getCpuArtiConstraintData().size(), jointManager.getGpuNbArtiConstraints(), PxU32(jointManager.mNbCpuArtiConstraintRows));
			}
		}

		// maybe this is also not needed if we have direct-GPU?
		mGpuSolverCore->jointConstraintBlockPrePrepParallel(mNumConstraintBatches + mNumRigidStaticConstraintBatches + mNumArticConstraintBatches + mNumArtiStaticConstraintBatches
			+ mNumArtiSelfConstraintBatches);
	}

	// Sizes the per-articulation static/self constraint count and index buffers and spawns
	// PxgBatchArticulationStaticConstraintPrePrepTask instances (NbPerTask articulations each) to fill
	// them. Articulation node indices are read from mActiveNodeIndex, starting right after the bodies of
	// island context 0.
	//
	// continuation                       task the spawned tasks continue into
	// articulationConstraintBatchIndex   stored as mArtiStaticConstraintBatchOffset
	// articulationContactBatchIndex      stored as mArtiStaticContactBatchOffset
	void PxgGpuContext::doStaticArticulationConstraintPrePrep(physx::PxBaseTask* continuation, const PxU32 articulationConstraintBatchIndex, const PxU32 articulationContactBatchIndex)
	{
		PxgBodySimManager& bodyManager = getSimulationController()->getBodySimManager();

		PxgIslandContext& island = mIslandContextPool[0];
		const PxU32 articulationStartIndex = island.mBodyStartIndex + island.mBodyCount;
		PxNodeIndex* nodeIndices = mActiveNodeIndex.begin() + articulationStartIndex;

		//KS - TODO - revisit this and make it work with batching. Currently, it is disabled!
		mArtiStaticConstraintBatchOffset = articulationConstraintBatchIndex;
		mArtiStaticContactBatchOffset = articulationContactBatchIndex;

		PX_PROFILE_ZONE("Articulation Static constraint", 0);

		mArtiStaticContactCounts.resize(mArticulationCount);
		mArtiStaticJointCounts.resize(mArticulationCount);
		mArtiSelfContactCounts.resize(mArticulationCount);
		mArtiSelfJointCounts.resize(mArticulationCount);

		PxU32 maxArtiStaticContacts = bodyManager.mMaxStaticArticContacts;
		PxU32 maxArtiStaticJoints = bodyManager.mMaxStaticArticJoints;
		PxU32 maxArtiSelfContacts = bodyManager.mMaxSelfArticContacts;
		PxU32 maxArtiSelfJoints = bodyManager.mMaxSelfArticJoints;

		// Index buffers are interleaved with a stride of mArticulationCount, hence max * count entries.
		mArtiStaticContactIndices.resize(maxArtiStaticContacts * mArticulationCount);
		mArtiStaticJointIndices.resize(maxArtiStaticJoints * mArticulationCount);
		mArtiSelfContactIndices.resize(maxArtiSelfContacts * mArticulationCount);
		mArtiSelfJointIndices.resize(maxArtiSelfJoints * mArticulationCount);

		for (PxU32 i = 0; i < mArticulationCount; i += PxgBatchArticulationStaticConstraintPrePrepTask::NbPerTask)
		{
			PxU32 endIndex = PxMin(i + PxgBatchArticulationStaticConstraintPrePrepTask::NbPerTask, mArticulationCount);

			PxgBatchArticulationStaticConstraintPrePrepTask* task = PX_PLACEMENT_NEW(mFlushPool.allocate(sizeof(PxgBatchArticulationStaticConstraintPrePrepTask)), PxgBatchArticulationStaticConstraintPrePrepTask)
				(0, mArtiStaticContactIndices.begin(), mArtiStaticJointIndices.begin(), mArtiStaticContactCounts.begin(), mArtiStaticJointCounts.begin(),
				mArtiSelfContactIndices.begin(), mArtiSelfJointIndices.begin(), mArtiSelfContactCounts.begin(), mArtiSelfJointCounts.begin(),
				i, endIndex, nodeIndices, bodyManager, mArticulationCount);

			task->setContinuation(continuation);
			task->removeReference();
		}
	}

	// Sizes the per-body static contact/joint count and index buffers and spawns
	// PxgBatchRigidStaticConstraintPrePrepTask instances to fill them. Body node indices are read from
	// mActiveNodeIndex, starting at the body start index of island context 0.
	//
	// continuation   task the spawned tasks continue into
	void PxgGpuContext::doStaticRigidConstraintPrePrep(physx::PxBaseTask* continuation)
	{
		PX_PROFILE_ZONE("Rigid Static constraint", 0);

		PxgBodySimManager& bodyManager = getSimulationController()->getBodySimManager();

		PxgIslandContext& island = mIslandContextPool[0];
		const PxU32 bodyStartIndex = island.mBodyStartIndex;
		PxNodeIndex* nodeIndices = mActiveNodeIndex.begin() + bodyStartIndex;

		mRigidStaticContactCounts.resize(mBodyCount);
		mRigidStaticJointCounts.resize(mBodyCount);

		PxU32 maxRigidStaticContacts = bodyManager.mMaxStaticRBContacts;
		PxU32 maxRigidStaticJoints = bodyManager.mMaxStaticRBJoints;

		// Index buffers are interleaved with a stride of mBodyCount, hence max * count entries.
		mRigidStaticContactIndices.resize(maxRigidStaticContacts * mBodyCount);
		mRigidStaticJointIndices.resize(maxRigidStaticJoints * mBodyCount);

		// Batch by the rigid task's own granularity. Each task writes a disjoint index range, so the batch
		// size only affects how the work is split, not the produced data.
		const PxU32 nbPerTask = PxgBatchRigidStaticConstraintPrePrepTask::NbPerTask;

		for (PxU32 i = 0; i < mBodyCount; i += nbPerTask)
		{
			PxU32 endIndex = PxMin(i + nbPerTask, mBodyCount);

			PxgBatchRigidStaticConstraintPrePrepTask* task = PX_PLACEMENT_NEW(mFlushPool.allocate(sizeof(PxgBatchRigidStaticConstraintPrePrepTask)), PxgBatchRigidStaticConstraintPrePrepTask)
				(0, mRigidStaticContactIndices.begin(), mRigidStaticJointIndices.begin(), mRigidStaticContactCounts.begin(), mRigidStaticJointCounts.begin(),
				i, endIndex, nodeIndices, bodyManager, mBodyCount);

			task->setContinuation(continuation);
			task->removeReference();
		}
	}

	// Issues the GPU constraint solve, the solver data read-back, integration and the body/articulation
	// read-backs.
	//
	// maxNodes           forwarded (+1) to the force threshold accumulation when force thresholds are in use
	// changedHandleMap   forwarded to the simulation controller's update()
	void PxgGpuContext::doConstraintSolveGPU(PxU32 maxNodes, PxBitMapPinned& changedHandleMap)
	{
		/**
		* Things to do in here:
		* (1) Solve on GPU
		* (2) Write-back on GPU
		* (3) Integration on GPU (transforms are now on GPU solver body data so might as well use them)
		*/

		mGpuArticulationCore->syncStream();

		// One position-iteration residual slot per constraint write-back entry.
		mConstraintPositionIterResidualPoolGpu.resize(mConstraintWriteBackPool.size());

		mGpuSolverCore->solveContactMultiBlockParallel(mIslandContextPool, mNumIslandContextPool, mIncrementalPartition.getCombinedSlabMaxNbPartitions(),
			mConstraintsPerPartition, mArtiConstraintsPerPartition, mGravity, mConstraintPositionIterResidualPoolGpu.begin(), mConstraintPositionIterResidualPoolGpu.size(),
			&mTotalContactError.mPositionIterationErrorAccumulator, mArticulationContactErrorPosIter, mInternalResidualPerArticulationPosIter);

		mContactErrorPosIter = &mTotalContactError.mPositionIterationErrorAccumulator;

		if (mHasForceThresholds)
			mGpuSolverCore->accumulatedForceThresholdStream(maxNodes + 1);

		// Offset passed to integration: 1 + the kinematic count.
		const PxU32 offset = 1 + mKinematicCount;

		//KS - todo - use separate streams. In addition, read number of threshold streams before DMAing back data
		mGpuSolverCore->gpuMemDMAbackSolverData(mForceStreamPool.mDataStream, mForceStreamPool.mDataStreamSize - mForceStreamPool.mSharedDataIndex, (PxU32)mForceStreamPool.mSharedDataIndex,
			(PxU32)mForceStreamPool.mSharedDataIndexGPU, mForceChangedThresholdStream->begin(), mIncrementalPartition.hasForceThresholds(), mConstraintWriteBackPool.begin(),
			mConstraintWriteBackPool.size(), (!mEnableDirectGPUAPI || getSimulationController()->getEnableOVDCollisionReadback()), mContactErrorVelIter);

		mGpuSolverCore->integrateCoreParallel(offset, mSolverBodyPool.size());

		mGpuArticulationCore->updateBodies(mDt, !mIsTGS, mEnableDirectGPUAPI);

		mSimulationController->update(changedHandleMap);

		if (isResidualReportingEnabled())
			mArticulationContactErrorVelIter.resize(1);

		// Articulation state is only copied back to the host when the direct-GPU API is off or OVD
		// read-back is requested.
		if (!mEnableDirectGPUAPI || getSimulationController()->getEnableOVDReadback())
		{
			mGpuArticulationCore->gpuMemDMAbackArticulation(mLinkAndJointAndRootStateDataPool, mArticulationSleepDataPool, mInternalResidualPerArticulationVelIter, mArticulationContactErrorVelIter);
		}

		// NOTE(review): float4* matches how doPostSolveTask() reinterprets this same pool; confirm against
		// the declaration of PxgSolverCore::gpuMemDMAbackSolverBodies().
		mGpuSolverCore->gpuMemDMAbackSolverBodies(reinterpret_cast<float4*>(mSolverBodyPool.begin()), mSolverBodyPool.size(), mBody2WorldPool, mSolverBodySleepDataPool,
			mEnableDirectGPUAPI && (!getSimulationController()->getEnableOVDReadback()));
	}

	// Task that writes solver results for a contiguous batch of rigid bodies back into their PxsBodyCore /
	// PxsRigidBody: previous and new body2World transform, linear/angular velocity, wake counter and
	// internal flags. All pointer arguments are expected to already point at the first body of the batch.
	// Angular velocities are read mTotalBodies entries after the linear velocity of the same body.
	class PxgPostSolveWorkerTask : public Cm::Task
	{
		PxNodeIndex* mNodeIndices;
		PxAlignedTransform* mBodyToWorldPool;
		PxgSolverBodySleepData* mSolverBodySleepDataPool;
		float4* mBodyVelocities;
		PxU32 mNbBodies;			// number of bodies in this batch
		PxU32 mTotalBodies;			// total number of solver bodies (linear -> angular velocity offset)
		IG::IslandSim* mIslandSim;

	public:
		PxgPostSolveWorkerTask(PxNodeIndex* nodeIndices, PxAlignedTransform* bodyToWorldPool, PxgSolverBodySleepData* solverBodySleepDataPool,
			float4* bodyVelocities, PxU32 nbBodies, PxU32 totalBodies, IG::IslandSim* islandSim) :
			Cm::Task(0),
			mNodeIndices(nodeIndices),
			mBodyToWorldPool(bodyToWorldPool),
			mSolverBodySleepDataPool(solverBodySleepDataPool),
			mBodyVelocities(bodyVelocities),
			mNbBodies(nbBodies),
			mTotalBodies(totalBodies),
			mIslandSim(islandSim)
		{
		}

		virtual void runInternal() PX_OVERRIDE PX_FINAL
		{
			PX_PROFILE_ZONE("GpuDynamics.PxgPostSolveWorkerTask", 0);

			// AD: skip this if we had GPU errors, will lead to asserts down below
			// for signalling reasons we skip outside.

			//copy data from PxgSolverBodyData to PxsBodyCore
			for (PxU32 i = 0; i < mNbBodies; i++)
			{
				const PxU32 index = mNodeIndices[i].index();

				//copy integration data
				const PxgSolverBodySleepData& sleepData = mSolverBodySleepDataPool[i];
				PxsRigidBody& originalBody = *getRigidBodyFromIG(*mIslandSim, PxNodeIndex(index));
				PxsBodyCore& bodyCore = originalBody.getCore();

				// Keep the pre-solve pose before overwriting it with the solver result.
				originalBody.mLastTransform = bodyCore.body2World;

				const PxAlignedTransform& body2World = mBodyToWorldPool[i];
				bodyCore.body2World = body2World.getTransform();

				const float4& linVel = mBodyVelocities[i];
				const float4& angVel = mBodyVelocities[i + mTotalBodies];
				bodyCore.linearVelocity = PxVec3(linVel.x, linVel.y, linVel.z);
				bodyCore.angularVelocity = PxVec3(angVel.x, angVel.y, angVel.z);

				//copy sleep check data
				bodyCore.solverWakeCounter = sleepData.wakeCounter;
				originalBody.mInternalFlags = PxU8(sleepData.internalFlags);

				PX_ASSERT(bodyCore.linearVelocity.isFinite());
				PX_ASSERT(bodyCore.angularVelocity.isFinite());
			}
		}

		virtual const char* getName() const PX_OVERRIDE PX_FINAL
		{
			return "PxgPostSolveWorkerTask";
		}

	private:
		PX_NOCOPY(PxgPostSolveWorkerTask)
	};

	// Task that writes solver results for articulations [mBatchStartIndex, mBatchStartIndex + mNbArticulations)
	// back into the CPU-side Dy::FeatherstoneArticulation objects: residual accumulators, dt, wake counter,
	// activation/deactivation requests, per-link transforms/velocities/accelerations/incoming joint forces,
	// per-dof joint positions/velocities/accelerations and the root pre-motion velocity.
	// The residual arrays hold internal residuals at [a] and contact residuals at [a + mArticulationCount].
	class PxgPostSolveArticulationTask : public Cm::Task
	{
		PxNodeIndex* mNodeIndices;
		//see PxgArticulationLinkJointRootStateData
		PxU8* mLinkAndJointAndRootStates;
		Dy::ErrorAccumulator* mInternalResidualPerArticulationVelIter;
		Dy::ErrorAccumulator* mInternalResidualPerArticulationPosIter;
		PxgSolverBodySleepData* mSleepData;
		PxU32 mNbArticulations;			// number of articulations in this batch
		PxU32 mArticulationStartIndex;	//articulation offset in the nodeIndex
		PxU32 mBatchStartIndex;			// index of the first articulation of this batch
		IG::SimpleIslandManager* mIslandManager;
		PxU32 mMaxLinks;
		PxU32 mMaxDofs;
		PxReal mDt;
		PxU32 mArticulationCount;		// total articulation count (internal -> contact residual offset)

	public:
		PxgPostSolveArticulationTask(PxNodeIndex* nodeIndices, PxU8* linkAndJointAndRootStates,
			Dy::ErrorAccumulator* internalResidualPerArticulationPosIter, Dy::ErrorAccumulator* internalResidualPerArticulationVelIter,
			PxgSolverBodySleepData* sleepData, PxU32 nbArticulation, PxU32 articulationStartIndex,
			IG::SimpleIslandManager* islandManager, const PxU32 batchStartIndex, const PxU32 maxLinks, const PxU32 maxDofs,
			const PxReal dt, const PxU32 totalArticulationCount) :
			Cm::Task(0),
			mNodeIndices(nodeIndices),
			mLinkAndJointAndRootStates(linkAndJointAndRootStates),
			mInternalResidualPerArticulationVelIter(internalResidualPerArticulationVelIter),
			mInternalResidualPerArticulationPosIter(internalResidualPerArticulationPosIter),
			mSleepData(sleepData),
			mNbArticulations(nbArticulation),
			mArticulationStartIndex(articulationStartIndex),
			mBatchStartIndex(batchStartIndex),
			mIslandManager(islandManager),
			mMaxLinks(maxLinks),
			mMaxDofs(maxDofs),
			mDt(dt),
			mArticulationCount(totalArticulationCount)
		{
		}

		virtual void runInternal() PX_OVERRIDE PX_FINAL
		{
			PX_PROFILE_ZONE("GpuDynamics.PxgPostSolveArticulationTask", 0);

			const PxU32 maxLinks = mMaxLinks;
			const PxU32 maxDofs = mMaxDofs;

			//copy data from PxgSolverBodyData to PxsBodyCore
			const PxU32 endIndex = mBatchStartIndex + mNbArticulations;

			IG::IslandSim& sim = mIslandManager->getAccurateIslandSim();

			for (PxU32 a = mBatchStartIndex; a < endIndex; a++)
			{
				// Node indices are addressed in the global node array, everything else per articulation.
				const PxU32 ind = a + mArticulationStartIndex;
				PxNodeIndex nodeIndex = mNodeIndices[ind];
				//const PxU32 nodeIndex = mNodeIndices[ind].index();

				//copy integration data
				Dy::FeatherstoneArticulation& articulation = *getArticulationFromIG(sim, nodeIndex);
				Dy::ArticulationData& artiData = articulation.getArticulationData();

				articulation.mInternalErrorAccumulatorPosIter = mInternalResidualPerArticulationPosIter[a];
				articulation.mInternalErrorAccumulatorVelIter = mInternalResidualPerArticulationVelIter[a];
				articulation.mContactErrorAccumulatorPosIter = mInternalResidualPerArticulationPosIter[a + mArticulationCount];
				articulation.mContactErrorAccumulatorVelIter = mInternalResidualPerArticulationVelIter[a + mArticulationCount];

				artiData.setDt(mDt);

				const PxU32 numLinks = artiData.getLinkCount();
				const PxU32 numDofs = artiData.getDofs();

				//Get the address of the buffer holding the state data for the current articulation.
				PxU8* singleArticulationStateBuffer =
					PxgArticulationLinkJointRootStateData::getArticulationStateDataBuffer(
						mLinkAndJointAndRootStates, maxLinks, maxDofs, a);

				//Decompose the buffer into its sub-arrays.
				PxTransform* sBody2Worlds = NULL;
				Cm::UnAlignedSpatialVector* sLinkVelocities = NULL;
				Cm::UnAlignedSpatialVector* sLinkAccelerations = NULL;
				Cm::UnAlignedSpatialVector* sLinkIncomingJointForces = NULL;
				PxReal* sJointPositions = NULL;
				PxReal* sJointVelocities = NULL;
				PxReal* sJointAccels = NULL;
				Cm::UnAlignedSpatialVector* sRootPreVel = NULL;
				PxgArticulationLinkJointRootStateData::decomposeArticulationStateDataBuffer(
					singleArticulationStateBuffer, numLinks, numDofs,
					sBody2Worlds, sLinkVelocities, sLinkAccelerations, sLinkIncomingJointForces,
					sJointPositions, sJointVelocities, sJointAccels,
					sRootPreVel);

				Dy::ArticulationCore* core = articulation.getCore();
				core->wakeCounter = mSleepData[a].wakeCounter;

				// Mirror the solver's activation decision into both island sims.
				if (mSleepData[a].internalFlags & PxsRigidBody::eACTIVATE_THIS_FRAME)
				{
					mIslandManager->getAccurateIslandSim().activateNode_ForGPUSolver(nodeIndex);
					mIslandManager->getSpeculativeIslandSim().activateNode_ForGPUSolver(nodeIndex);
				}
				else if (mSleepData[a].internalFlags & PxsRigidBody::eDEACTIVATE_THIS_FRAME)
				{
					mIslandManager->getAccurateIslandSim().deactivateNode_ForGPUSolver(nodeIndex);
					mIslandManager->getSpeculativeIslandSim().deactivateNode_ForGPUSolver(nodeIndex);
				}

				Dy::ArticulationLink* links = artiData.getLinks();
				Cm::SpatialVectorF* linkVelocities = artiData.getMotionVelocities();
				Cm::SpatialVectorF* linkAccelerations = artiData.getMotionAccelerations();
				Cm::SpatialVectorF* linkIncomingJointForces = artiData.getLinkIncomingJointForces();

				for (PxU32 i = 0; i < numLinks; ++i)
				{
					Dy::ArticulationLink& link = links[i];

					PX_ASSERT(sBody2Worlds[i].isValid());

					link.bodyCore->body2World = sBody2Worlds[i];
					// Spatial vector layout as used here: top = angular part, bottom = linear part.
					link.bodyCore->angularVelocity = sLinkVelocities[i].top;
					link.bodyCore->linearVelocity = sLinkVelocities[i].bottom;
					linkVelocities[i].top = sLinkVelocities[i].top;
					linkVelocities[i].bottom = sLinkVelocities[i].bottom;
					linkAccelerations[i].top = sLinkAccelerations[i].top;
					linkAccelerations[i].bottom = sLinkAccelerations[i].bottom;
					linkIncomingJointForces[i].top = sLinkIncomingJointForces[i].top;
					linkIncomingJointForces[i].bottom = sLinkIncomingJointForces[i].bottom;
				}

				// Link 0 always ends up with a zero incoming joint force, whatever the GPU buffer held.
				linkIncomingJointForces[0].top = PxVec3(PxZero);
				linkIncomingJointForces[0].bottom = PxVec3(PxZero);

				PxReal* jointPositions = artiData.getJointPositions();
				PxReal* jointVelocities = artiData.getJointVelocities();
				PxReal* jointAccelerations = artiData.getJointAccelerations();

				for (PxU32 i = 0; i < numDofs; ++i)
				{
					jointPositions[i] = sJointPositions[i];
					jointVelocities[i] = sJointVelocities[i];
					jointAccelerations[i] = sJointAccels[i];
				}

				artiData.setRootPreMotionVelocity(*sRootPreVel);
			}
		}

		virtual const char* getName() const PX_OVERRIDE PX_FINAL
		{
			return "PxgPostSolveArticulationTask";
		}

	private:
		PX_NOCOPY(PxgPostSolveArticulationTask)
	};

	// Forwards lost/found patch processing to the incremental partitioner, supplying the accurate island
	// sim, the body sim manager and the joint manager.
	void PxgGpuContext::processPatches(	Cm::FlushPool& flushPool, PxBaseTask* continuation,
										PxsContactManager** lostFoundPatchManagers, PxU32 nbLostFoundPatchManagers, PxsContactManagerOutputCounts* outCounts)
	{
		mIncrementalPartition.processLostFoundPatches(	flushPool, continuation,
														mIslandManager.getAccurateIslandSim(), getSimulationController()->getBodySimManager(), getSimulationController()->getJointManager(),
														lostFoundPatchManagers, nbLostFoundPatchManagers, outCounts);
	}

	// Post-solve step: integrates the particle systems, waits for the solver DMA-back, and (unless the
	// direct-GPU API is on without OVD read-back) spawns worker tasks that copy rigid body and
	// articulation results back to the CPU-side objects. Returns early without doing anything when
	// nothing was solved this frame or when the CUDA context is in abort mode.
	//
	// continuation   task the spawned write-back tasks continue into
	void PxgGpuContext::doPostSolveTask(physx::PxBaseTask* continuation)
	{
		if (!mSolvedThisFrame)
			return;

		// AD: sneaky, but apparently only narrowphasecore has that member public.
		if (getNarrowphaseCore()->mCudaContext->isInAbortMode())
			return;

		const PxU32 numParticleCores = mGpuParticleSystemCores.size();
		for (PxU32 i = 0; i < numParticleCores; ++i)
		{
			PxgParticleSystemCore* core = mGpuParticleSystemCores[i];
			const PxReal eps = 0.f;// mLengthScale * 1e-4f;
			core->integrateSystems(mDt, eps*eps);
			core->onPostSolve(); // call the callback.
		}

		PxU32 nbThresholdElems = 0;
		mGpuSolverCore->syncDmaBack(nbThresholdElems);
		mForceChangedThresholdStream->forceSize_Unsafe(nbThresholdElems);

		if (!mEnableDirectGPUAPI || getSimulationController()->getEnableOVDReadback())
		{
			//TODO - multi-thread this!
			const PxU32 offset = 1 + mKinematicCount;

			// The solver body pool is read as float4 velocities: linear at [i], angular at [i + totalNumBodies].
			float4* bodyVelocities = reinterpret_cast<float4*>(mSolverBodyPool.begin());
			PxAlignedTransform* body2Worlds = mBody2WorldPool.begin();
			PxNodeIndex* nodeIndices = mActiveNodeIndex.begin();

			const PxU32 totalNumBodies = mSolverBodyPool.size();
			const PxU32 batchSize = 512;

			IG::IslandSim* accurateIslandSim = &mIslandManager.getAccurateIslandSim();

			//write back the data to PxsBodyCore
			for (PxU32 i = offset; i < totalNumBodies; i += batchSize)
			{
				PxgSolverBodySleepData* sleepData = &mSolverBodySleepDataPool[i];
				PxgPostSolveWorkerTask* task = PX_PLACEMENT_NEW(mFlushPool.allocate(sizeof(PxgPostSolveWorkerTask)), PxgPostSolveWorkerTask)(nodeIndices + i,
					body2Worlds + i, sleepData, bodyVelocities + i, PxMin(batchSize, totalNumBodies - i), totalNumBodies, accurateIslandSim);

				task->setContinuation(continuation);
				task->removeReference();
			}

			const PxU32 maxLinks = getSimulationController()->getSimulationCore()->getMaxArticulationLinks();
			const PxU32 maxDofs = getSimulationController()->getSimulationCore()->getMaxArticulationDofs();

			// At least 64 articulations per task, and at most 128 tasks overall.
			const PxU32 articulationBatchSize = PxMax(64u, (mArticulationCount + 127u) / 128u);

			for (PxU32 i = 0; i < mArticulationCount; i += articulationBatchSize)
			{
				PxgPostSolveArticulationTask* task = PX_PLACEMENT_NEW(mFlushPool.allocate(sizeof(PxgPostSolveArticulationTask)), PxgPostSolveArticulationTask)(nodeIndices,
					mLinkAndJointAndRootStateDataPool.begin(), mInternalResidualPerArticulationPosIter.begin(), mInternalResidualPerArticulationVelIter.begin(), mArticulationSleepDataPool.begin(),
					PxMin(articulationBatchSize, mArticulationCount - i), mArticulationStartIndex, &mIslandManager, i, maxLinks, maxDofs, mDt, mArticulationCount);

				task->setContinuation(continuation);
				task->removeReference();
			}
		}

		mGpuSolverCore->acquireContext();
		for (PxU32 i = 0; i < numParticleCores; ++i)
		{
			PxgParticleSystemCore* core = mGpuParticleSystemCores[i];
			cuStreamQuery(core->getFinalizeStream()); //Flush particle work
		}
		mGpuSolverCore->releaseContext();
	}

	static void copyToSolverBodyStaticAndKinematic(PxgSolverBodyData& data, PxgSolverTxIData& txIData, const PxsBodyCore& core, PxNodeIndex nodeIndex)
	{
		// PT: not needed for statics/kinematics
		// if(core.disableGravity)
		// sleepData.internalFlags |= PxsRigidBody::eDISABLE_GRAVITY_GPU;

		//This data has been moved to pxgbodysim
		//data.inverseInertia = make_float4(core.inverseInertia.x, core.inverseInertia.y, core.inverseInertia.z, 0.f);

		//PxU32 islandNodeIndex = nodeIndex << 2;
		////Enable CCD...
//if (core.mFlags & PxRigidBodyFlag::eENABLE_SPECULATIVE_CCD) // islandNodeIndex |= 1; //if (originalBody.mInternalFlags & PxsRigidBody::eHAS_SURFACE_VELOCITY) // islandNodeIndex |= 2; data.islandNodeIndex = nodeIndex; // Copy simple properties data.initialLinVel = core.linearVelocity; data.initialAngVel = core.angularVelocity; txIData.sqrtInvInertia = PxMat33(PxZero); txIData.deltaBody2World = PxTransform(PxIdentity); PX_ASSERT(core.linearVelocity.isFinite()); PX_ASSERT(core.angularVelocity.isFinite()); data.invMass = core.inverseMass; data.penBiasClamp = core.maxPenBias; //data.writeIndex = PxgSolverBody::InvalidHandle; data.reportThreshold = core.contactReportThreshold; data.maxImpulse = core.maxContactImpulse; data.offsetSlop = 0.0f; data.body2World = PxAlignedTransform(core.body2World.p.x, core.body2World.p.y, core.body2World.p.z, PxAlignedQuat(core.body2World.q.x, core.body2World.q.y, core.body2World.q.z, core.body2World.q.w)); data.flags = PxRigidBodyFlag::eKINEMATIC; } static void atomArticulationIntegration(const PxU32 numArticulations, const PxNodeIndex* const PX_RESTRICT islandNodes, IG::SimpleIslandManager& islandManager, PxI32* maxPosIters, PxI32* maxVelIters) { PxU32 localMaxPosIter = 0, localMaxVelIter = 0; for (PxU32 a = 0; a < numArticulations; ++a) { const PxNodeIndex nodeId = islandNodes[a]; //const PxU32 nodeIndex = nodeId.index(); Dy::FeatherstoneArticulation* artic = getArticulationFromIG(islandManager.getAccurateIslandSim(), nodeId); const PxU16 iterCount = artic->getIterationCounts(); localMaxPosIter = PxMax(PxU32(iterCount & 0xff), localMaxPosIter); localMaxVelIter = PxMax(PxU32(iterCount >> 8), localMaxVelIter); } PxAtomicMax(maxPosIters, (PxI32)localMaxPosIter); PxAtomicMax(maxVelIters, (PxI32)localMaxVelIter); } class PxgSetupKinematicTask : public Cm::Task { const PxNodeIndex* const PX_RESTRICT mKinematicNodes; PxNodeIndex* mActiveNodeIndex; //copy island node index into this list const PxU32 mNumBodies; IG::SimpleIslandManager& 
mIslandManager; PxU32 mSolverBodyStartIndex; PxgSolverBodyData* mSolverBodyDataPool; PxgSolverBodySleepData* mSolverBodySleepDataPool; PxgSolverTxIData* mSolverTxIData; PX_NOCOPY(PxgSetupKinematicTask) public: PxgSetupKinematicTask(const PxNodeIndex* const PX_RESTRICT kinematicNodes, PxNodeIndex* activeNodeIndex, const PxU32 numBodies, IG::SimpleIslandManager& islandManager, PxU32 solverBodyStartIndex, PxgSolverBodyData* solverBodyDataPool, PxgSolverBodySleepData* solverBodySleepDataPool, PxgSolverTxIData* txIData) : Cm::Task(0), mKinematicNodes(kinematicNodes), mActiveNodeIndex(activeNodeIndex), mNumBodies(numBodies), mIslandManager(islandManager), mSolverBodyStartIndex(solverBodyStartIndex), mSolverBodyDataPool(solverBodyDataPool), mSolverBodySleepDataPool(solverBodySleepDataPool), mSolverTxIData(txIData) { } virtual void runInternal() PX_OVERRIDE PX_FINAL { IG::IslandSim& islandSim = mIslandManager.getAccurateIslandSim(); //Set up solver bodies for any kinematic bodies for (PxU32 i = 0; i < mNumBodies; i++) { PxsRigidBody& rigidBody = *getRigidBodyFromIG(islandSim, mKinematicNodes[i]); const PxsBodyCore& core = rigidBody.getCore(); copyToSolverBodyStaticAndKinematic(mSolverBodyDataPool[i], mSolverTxIData[i], core, mKinematicNodes[i]); //mActiveNodeIndex[mSolverBodyStartIndex + i] = mKinematicNodes[i]; rigidBody.saveLastCCDTransform(); } } virtual const char* getName() const PX_OVERRIDE PX_FINAL { return "PxgKinematicSetupTask"; } }; class PxgAtomIntegrationTask : public Cm::Task { const PxNodeIndex* const PX_RESTRICT mIslandNodes; const PxU32 mNumBodies; PxI32* mMaxPosIters; PxI32* mMaxVelIters; IG::SimpleIslandManager& mIslandManager; PX_NOCOPY(PxgAtomIntegrationTask) public: PxgAtomIntegrationTask(const PxNodeIndex* const PX_RESTRICT islandNodes, const PxU32 numBodies, PxI32* PX_RESTRICT maxPosIters, PxI32* PX_RESTRICT maxVelIters, IG::SimpleIslandManager& islandManager) : Cm::Task(0), mIslandNodes(islandNodes), mNumBodies(numBodies), 
mMaxPosIters(maxPosIters), mMaxVelIters(maxVelIters), mIslandManager(islandManager) { } virtual void runInternal() PX_OVERRIDE PX_FINAL { PX_PROFILE_ZONE("GpuDynamics.PxgIntegrateTask", 0); PxI32 localPosIters = 0; PxI32 localVelIters = 0; IG::IslandSim& sim = mIslandManager.getAccurateIslandSim(); for (PxU32 i = 0; i < mNumBodies; ++i) { const PxNodeIndex nodeId = mIslandNodes[i]; //activeNodeIndex[startIndex] = nodeId; PxsRigidBody& rigidBody = *getRigidBodyFromIG(sim, nodeId); localPosIters = PxMax(PxI32(rigidBody.mSolverIterationCounts & 0xff), localPosIters); localVelIters = PxMax(PxI32(rigidBody.mSolverIterationCounts >> 8), localVelIters); } PxAtomicMax(mMaxPosIters, localPosIters); PxAtomicMax(mMaxVelIters, localVelIters); } virtual const char* getName() const PX_OVERRIDE PX_FINAL { return "PxgIntegrateTask"; } }; class PxgArticulationAtomIntegrationTask : public Cm::Task { const PxNodeIndex* const PX_RESTRICT mIslandNodes; const PxU32 mNumArticulations; PxI32* mMaxPosIters; PxI32* mMaxVelIters; IG::SimpleIslandManager& mIslandManager; PX_NOCOPY(PxgArticulationAtomIntegrationTask) public: PxgArticulationAtomIntegrationTask( const PxNodeIndex* const PX_RESTRICT islandNodes, const PxU32 numArticulations, PxI32* maxPosIters, PxI32* maxVelIters, IG::SimpleIslandManager& islandManager ) : Cm::Task(0), mIslandNodes(islandNodes), mNumArticulations(numArticulations), mMaxPosIters(maxPosIters), mMaxVelIters(maxVelIters), mIslandManager(islandManager) { } virtual void runInternal() PX_OVERRIDE PX_FINAL { PX_PROFILE_ZONE("GpuDynamics.PxgArticulationAtomIntegrationTask", 0); atomArticulationIntegration(mNumArticulations, mIslandNodes, mIslandManager, mMaxPosIters, mMaxVelIters); } virtual const char* getName() const PX_OVERRIDE PX_FINAL { return "PxgArticulationAtomIntegrationTask"; } }; void PxgGpuContext::doPreIntegrationTaskCommon(physx::PxBaseTask* continuation) { // AD: this task currently assumes we only have 1 solver island. 
If there is a variable amount of islands, // the dependency chain needs to be fixed, because this task runs in parallel to allocating and setting // the members of mIslandContextPool. (see Pxg(TGS)DynamicsContext::update()). mNumContactBatches = 0; mNum1dConstraintBatches = 0; mNumArtiContactBatches = 0; mNumArti1dConstraintBatches = 0; mArtiStaticConstraintBatchOffset = 0; mArtiStaticContactBatchOffset = 0; const IG::IslandSim& islandSim = mIslandManager.getAccurateIslandSim(); const PxU32 workerCount = PxMax(1u, continuation->getTaskManager()->getCpuDispatcher()->getWorkerCount()); const PxU32 atomBatchSize = PxMax(256u, PxMin(1024u, (mBodyCount + workerCount - 1) / workerCount)); const PxNodeIndex* const PX_RESTRICT nodeIndices = islandSim.getActiveNodes(IG::Node::eRIGID_BODY_TYPE); mGpuSolverCore->acquireContext(); const PxNodeIndex* const PX_RESTRICT articulationNodeIndices = islandSim.getActiveNodes(IG::Node::eARTICULATION_TYPE); //Because we need to put the articulation active node index into the same list as mActiveNodeIndex, so we need to make sure //articulation active node index start in the right place. In the active node index list, we start with static + kinematic + //active rigid bodies + active articulations //const PxU32 articulationStartIndex = island.mBodyStartIndex + island.mBodyCount; if (isStateDirty()) { mCachedPositionIterations = 0; mCachedVelocityIterations = 0; //Loop through and fill in properties from all the rigid bodies... 
for (PxU32 a = 0; a < mBodyCount; a += atomBatchSize) { PxgAtomIntegrationTask* task = static_cast(mFlushPool.allocate(sizeof(PxgAtomIntegrationTask))); task = PX_PLACEMENT_NEW(task, PxgAtomIntegrationTask)(nodeIndices + a, PxMin(atomBatchSize, mBodyCount - a), &mCachedPositionIterations, &mCachedVelocityIterations, mIslandManager); task->setContinuation(continuation); task->removeReference(); } setStateDirty(false); const PxU32 articulationBatchSize = 1024u; for (PxU32 a = 0; a < mArticulationCount; a += articulationBatchSize) { PxgArticulationAtomIntegrationTask* task = static_cast(mFlushPool.allocate(sizeof(PxgArticulationAtomIntegrationTask))); task = PX_PLACEMENT_NEW(task, PxgArticulationAtomIntegrationTask)( articulationNodeIndices + a, PxMin(articulationBatchSize, mArticulationCount - a), &mCachedPositionIterations, &mCachedVelocityIterations, mIslandManager); task->setContinuation(continuation); task->removeReference(); } } const PxU32 kinematicBatchSize = 1024u; const PxNodeIndex*const kinematicIndices = islandSim.getActiveKinematics(); for (PxU32 a = 0; a < mKinematicCount; a += kinematicBatchSize) { PxgSetupKinematicTask* task = PX_PLACEMENT_NEW(mFlushPool.allocate(sizeof(PxgSetupKinematicTask)), PxgSetupKinematicTask) (kinematicIndices + a, mActiveNodeIndex.begin(), PxMin(mKinematicCount - a, kinematicBatchSize), mIslandManager, a + 1, mSolverBodyDataPool.begin() + a + 1, mSolverBodySleepDataPool.begin() + a + 1, mSolverTxIDataPool.begin() + a + 1); task->setContinuation(continuation); task->removeReference(); } PxgSimulationController* gpuSimController = static_cast(mSimulationController); //const PxU32 numParticles = gpuSimController->getNbParticleSystems(); PxgBodySimManager& bodySimManager = gpuSimController->getBodySimManager(); void** bodySimsLL = bodySimManager.mBodies.begin(); PxI32 maxPosIters = 0, maxVelIters = 0; const PxU32 numParticleCores = mGpuParticleSystemCores.size(); for (PxU32 i = 0; i < numParticleCores; ++i) { 
PxgParticleSystemCore* particleCore = mGpuParticleSystemCores[i]; particleCore->getMaxIterationCount(bodySimManager, maxPosIters, maxVelIters); } { //Need to implement soft body PxU32* softBodyNodeIndex = gpuSimController->getSoftBodyNodeIndex(); const PxU32 nbActiveSoftbodies = bodySimManager.mActiveSoftbodies.size(); PxU32* activeSoftbodies = bodySimManager.mActiveSoftbodies.begin(); for (PxU32 i = 0; i < nbActiveSoftbodies; ++i) { const PxU32 index = activeSoftbodies[i]; const PxU32 nodeIdex = softBodyNodeIndex[index]; Dy::DeformableVolume* dySoftBody = reinterpret_cast(bodySimsLL[nodeIdex]); const PxU16 solverIterationCounts = dySoftBody->getIterationCounts(); maxPosIters = PxMax(PxI32(solverIterationCounts & 0xff), maxPosIters); maxVelIters = PxMax(PxI32(solverIterationCounts >> 8), maxVelIters); } } { // FEM cloth PxU32* femClothNodeIndex = gpuSimController->getFEMClothNodeIndex(); const PxU32 nbActiveFEMCloths = bodySimManager.mActiveFEMCloths.size(); PxU32* activeFEMCloths = bodySimManager.mActiveFEMCloths.begin(); for (PxU32 i = 0; i < nbActiveFEMCloths; ++i) { const PxU32 index = activeFEMCloths[i]; const PxU32 nodeIdex = femClothNodeIndex[index]; Dy::DeformableSurface* dyFEMCloth = reinterpret_cast(bodySimsLL[nodeIdex]); const PxU16 solverIterationCounts = dyFEMCloth->getIterationCounts(); maxPosIters = PxMax(PxI32(solverIterationCounts & 0xff), maxPosIters); //maxVelIters = PxMax(PxI32(solverIterationCounts >> 8), maxVelIters); } } PxAtomicMax(&mCachedPositionIterations, maxPosIters); PxAtomicMax(&mCachedVelocityIterations, maxVelIters); mGpuSolverCore->releaseContext(); } void PxgGpuContext::doConstraintPrePrepCommon(physx::PxBaseTask* continuation) { mGpuSolverCore->acquireContext(); m1dConstraintBatchIndices.forceSize_Unsafe(0); m1dConstraintBatchIndices.reserve(mIncrementalPartition.getNbConstraintBatches() + mNumStaticRigid1dConstraintBatches); mContactConstraintBatchIndices.forceSize_Unsafe(0); 
mContactConstraintBatchIndices.reserve(mIncrementalPartition.getNbContactBatches() + mNumStaticRigidContactBatches);

	mArti1dConstraintBatchIndices.forceSize_Unsafe(0);
	mArti1dConstraintBatchIndices.reserve(mIncrementalPartition.getNbArtiConstraintBatches() + mNumStaticArti1dConstraintBatches + mNumSelfArti1dConstraintBatches);

	mArtiContactConstraintBatchIndices.forceSize_Unsafe(0);
	mArtiContactConstraintBatchIndices.reserve(mIncrementalPartition.getNbArtiContactBatches() + mNumStaticArtiContactBatches + mNumSelfArtiContactBatches);

	// Only island context 0 is written; it receives the cached iteration counts.
	mIslandContextPool[0].mNumPositionIterations = mCachedPositionIterations;
	mIslandContextPool[0].mNumVelocityIterations = mCachedVelocityIterations;

	// Batch counts for this frame, as reported by the incremental partitioner.
	mNum1dConstraintBatches = (PxI32)mIncrementalPartition.getNbConstraintBatches();
	mNumContactBatches = (PxI32)mIncrementalPartition.getNbContactBatches();
	mNumArtiContactBatches = (PxI32)mIncrementalPartition.getNbArtiContactBatches();
	mNumArti1dConstraintBatches = (PxI32)mIncrementalPartition.getNbArtiConstraintBatches();

	PxgBodySimManager& bodyManager = getSimulationController()->getBodySimManager();

	// Ceiling division of the larger static joint/contact maximum by mMaxNumStaticPartitions.
	const PxU32 nbStaticSlabs = (PxMax(bodyManager.mMaxStaticRBJoints, bodyManager.mMaxStaticRBContacts) + mMaxNumStaticPartitions - 1) / mMaxNumStaticPartitions;

	const PxU32 maxCombinedSlabPartitions = mIncrementalPartition.getCombinedSlabMaxNbPartitions();

	// The sixth argument is the partition count divided (rounding up) by the combined-slab maximum, at least 1.
	mGpuSolverCore->gpuMemDmaUpBodyData(mSolverBodyDataPool, mSolverTxIDataPool, mIslandManager.getNbNodeHandles() + 1, mNumConstraintBatches, mNumArticConstraintBatches,
		PxMax(1u, (mIncrementalPartition.getNbPartitions() + maxCombinedSlabPartitions - 1) / maxCombinedSlabPartitions), nbStaticSlabs, mMaxNumStaticPartitions);

	//Allocate enough space for the friction patches now that we know how many we need after constraint partitioning
	{
		PX_PROFILE_ZONE("GpuDynamics.allocateFrictionPatchStreams", 0);
		mGpuSolverCore->allocateFrictionPatchStream(mNumContactBatches + mNumStaticRigidContactBatches, mNumArtiContactBatches + mNumStaticArtiContactBatches + mNumSelfArtiContactBatches);
	}

	mNum1DConstraintBlockPrepPool = (PxU32)mNum1dConstraintBatches;

	// Per-task constraint budget; depends on whether the TGS or PGS solver is in use.
	const PxU32 nbConstraintsPerBatch = mIsTGS ? PxgCpuConstraintPrePrepTask::NbConstraintsPerTaskTGS : PxgCpuConstraintPrePrepTask::NbConstraintsPerTaskPGS;
	//Each task processed up to PxgCpuConstraintPrePrepTask::NbConstraintsPerTask constraints of a certain type
	const PxU32 nbArtiConstraintsPerBatch = mIsTGS ? PxgCpuArtiConstraintPrePrepTask::NbConstraintsPerTaskTGS : PxgCpuArtiConstraintPrePrepTask::NbConstraintsPerTaskPGS;

	// Running output batch indices. The articulation indices start after the rigid batch counts.
	PxU32 constraintBatchIndex = 0;
	PxU32 contactBatchIndex = 0;
	PxU32 articulationConstraintBatchIndex = mNum1dConstraintBatches;
	PxU32 articulationContactBatchIndex = mNumContactBatches;

	// Used for rounding up to whole batches of PXG_BATCH_SIZE
	// (assumes PXG_BATCH_SIZE is a power of two -- TODO confirm).
	const PxU32 batchMask = PXG_BATCH_SIZE - 1;

	mHasForceThresholds = mIncrementalPartition.hasForceThresholds();

	const PxInt32ArrayPinned& startSlabIter = mIncrementalPartition.getStartSlabPerPartition();
	const PxInt32ArrayPinned& articstartSlabIter = mIncrementalPartition.getArticStartSlabPerPartition();

	// NOTE(review): in this copy of the file the static_cast below has no target type and the two
	// PxPinnedArray references have no element type -- the template arguments look like they were
	// lost. Restore them from the upstream source before compiling.
	PxgJointManager& jointManager = static_cast(mSimulationController)->getJointManager();

	const PxPinnedArray& rigidPreprepIter = jointManager.getGpuRigidJointPrePrep();
	const PxPinnedArray& artiPreprepIter = jointManager.getGpuArtiJointPrePrep();

	//The code below iterates over all partitions, producing tasks to fill in data.

	//Running indices
	PxU32 startIdx = 0;				//Which partition to start at
	PxU32 startBatchOffset = 0;		//Batch offset within the partition
	PxU32 startOffset = 0;			//Constraint offset within the partition
	PxU32 runningContactCount = 0;	//The running count of the number of contact constraints that will be processed by the next task
	PxU32 runningBatchCount = 0;	//The running count of the number of batches that will be processed by the next task

	{
		PX_PROFILE_ZONE("Process Partitions", 0);
		for (PxU32 i = 0; i < mIncrementalPartition.getNbPartitions(); ++i) // this is looping over "true" partitions, not the combined ones for the solver
		{
			// Partitions are stored in slabs of PXG_BATCH_SIZE entries: slab = i / size, slot = i % size.
			const Partition& partition = mIncrementalPartition.getPartitionSlabs()[i / PXG_BATCH_SIZE]->mPartitions[i&(PXG_BATCH_SIZE - 1)];

			const PxU32 nbContacts = partition.mPartitionIndices[PxgEdgeType::eCONTACT_MANAGER].size();
			const PxU32 nbConstraints = partition.mPartitionIndices[PxgEdgeType::eCONSTRAINT].size();
			const PxU32 nbArtiContacts = partition.mPartitionIndices[PxgEdgeType::eARTICULATION_CONTACT].size();
			const PxU32 nbArtiConstraints = partition.mPartitionIndices[PxgEdgeType::eARTICULATION_CONSTRAINT].size();

			//PxU32* constraintIds = partition.mPartitionIndices[IG::Edge::eCONSTRAINT].begin();
			const PartitionIndices& constraintIds = partition.mPartitionIndices[PxgEdgeType::eCONSTRAINT];

			const PartitionIndices& artiConstraintIds = partition.mPartitionIndices[PxgEdgeType::eARTICULATION_CONSTRAINT];
			const PartitionIndices& artiContactIds = partition.mPartitionIndices[PxgEdgeType::eARTICULATION_CONTACT];

			const PxU32 jointStartIndex = mIncrementalPartition.getJointStartIndices()[i];

			// Header indices local to this partition: rigid and articulation batches advance independently.
			PxU32 batchIndex = startSlabIter[i];
			PxU32 localArticBatchIndex = articstartSlabIter[i];
			PxU32 batchOffset = 0;

			// Rigid joints: one task per chunk of up to nbConstraintsPerBatch constraints.
			for (PxU32 a = 0; a < nbConstraints; a += nbConstraintsPerBatch)
			{
				PxU32 nbConstraintsToProcess = PxMin(nbConstraints - a, nbConstraintsPerBatch);
				PxU32 nbBatches = (nbConstraintsToProcess + batchMask) / PXG_BATCH_SIZE;
				PxgCpuConstraintPrePrepTask* task = (PxgCpuConstraintPrePrepTask*)mFlushPool.allocate(sizeof(PxgCpuConstraintPrePrepTask));
				task = PX_PLACEMENT_NEW(task, PxgCpuConstraintPrePrepTask)(constraintIds, a, nbConstraintsToProcess, mConstraintBatchHeaders + batchIndex, nbBatches, constraintBatchIndex,
					jointStartIndex + a, mConstraintUniqueIndices, rigidPreprepIter.begin());

				task->setContinuation(continuation);
				task->removeReference();

				// Record the header index of every batch this task covers.
				for (PxU32 b = 0; b < nbBatches; ++b)
				{
					PxU32 val = batchIndex + b;
					m1dConstraintBatchIndices.pushBack(val);
				}

				constraintBatchIndex += nbBatches;
				batchIndex += nbBatches;
			}

			// Rigid contacts: unlike joints, a contact task may span several partitions. Contacts
			// accumulate in runningContactCount / runningBatchCount until a full task's worth exists.
			PxU32 remainingContacts = nbContacts;
			PxU32 localOffset = 0;
			//While there are constraints in this partition, process them in chunks of ~nbConstraintsPerBatch
			while ((runningContactCount + remainingContacts) >= nbConstraintsPerBatch)
			{
				//We are aiming to process approximately 2048 constraints. However, to simplify the logic in the CPU PrePrep task,
				//we actually can process a little more than that to fill up entire batches. Each batch contains 32 constraints.
				PxU32 nbConstraintsFromThisPartition = nbConstraintsPerBatch - runningContactCount;								//Number of constraints from this partition
				PxU32 nbBatchesFromThisPartition = ((nbConstraintsFromThisPartition + batchMask) / PXG_BATCH_SIZE);		//The number of batches from this partition (groups of 32 constraints)
				//Round up the number of constraints from this partition to be full batches unless there are insufficient constraints in this partition to create a full batch
				nbConstraintsFromThisPartition = PxMin((nbConstraintsFromThisPartition + batchMask)&(~(batchMask)), remainingContacts);
				PxU32 totalBatches = runningBatchCount + nbBatchesFromThisPartition;
				PxU32 nbConstraintsToProcess = runningContactCount + nbConstraintsFromThisPartition;

				// The task starts at the partition/offset saved when the previous task was emitted.
				PxgCpuContactPrePrepTask* task = (PxgCpuContactPrePrepTask*)mFlushPool.allocate(sizeof(PxgCpuContactPrePrepTask));
				task = PX_PLACEMENT_NEW(task, PxgCpuContactPrePrepTask)(mIncrementalPartition, startIdx, startOffset, nbConstraintsToProcess, startSlabIter.begin(),
					startBatchOffset, mIncrementalPartition.getContactStartIndices().begin(), mConstraintBatchHeaders, totalBatches, contactBatchIndex, mContactUniqueIndices, mOutputIterator,
					mPatchStreamAllocators[mCurrentContactStream]->mStart, mContactStreamAllocators[mCurrentContactStream]->mStart);

				task->setContinuation(continuation);
				task->removeReference();

				//Update contact counts
				remainingContacts -= nbConstraintsFromThisPartition;
				localOffset += nbConstraintsFromThisPartition;

				for (PxU32 b = 0; b < nbBatchesFromThisPartition; ++b)
				{
					PxU32 val = batchIndex + b;
					mContactConstraintBatchIndices.pushBack(val);
				}

				//Update iteration indices in this partition
				contactBatchIndex += totalBatches;
				batchIndex += nbBatchesFromThisPartition;
				batchOffset += nbBatchesFromThisPartition;

				//Update global task iteration indices
				startIdx = i;
				startOffset = localOffset;
				startBatchOffset = batchOffset;
				runningContactCount = 0;
				runningBatchCount = 0;
			}

			//We have remaining constraints. If so, sum them up and continue iterating...
			PxU32 remainingBatches = (remainingContacts + batchMask) / PXG_BATCH_SIZE;
			runningContactCount += remainingContacts;
			runningBatchCount += remainingBatches;

			// The left-over batches still get their header indices recorded now, although the task
			// that processes them is only created later.
			for (PxU32 b = 0; b < remainingBatches; ++b)
			{
				PxU32 val = batchIndex + b;
				mContactConstraintBatchIndices.pushBack(val);
			}

			//batchIndex += runningBatchCount;

			PxU32 localArtiJointStartIndex = mIncrementalPartition.getArtiJointStartIndices()[i];
			//constraintBatchIndex += contactBatchIndex;

			//articulation constraints
			for (PxU32 a = 0; a < nbArtiConstraints; a += nbArtiConstraintsPerBatch)
			{
				//each constraint is a batch
				PxU32 nbConstraintsToProcess = PxMin(nbArtiConstraints - a, nbArtiConstraintsPerBatch);
				PxU32 nbBatchesFromThisPartition = ((nbConstraintsToProcess + batchMask) / PXG_BATCH_SIZE);		//The number of batches from this partition (groups of 32 constraints)

				// The trailing `false` distinguishes this joint task from the contact task below, which passes `true`.
				PxgCpuArtiConstraintPrePrepTask* task = (PxgCpuArtiConstraintPrePrepTask*)mFlushPool.allocate(sizeof(PxgCpuArtiConstraintPrePrepTask));
				task = PX_PLACEMENT_NEW(task, PxgCpuArtiConstraintPrePrepTask)(artiConstraintIds, a, nbConstraintsToProcess, mArticConstraintBatchHeaders + localArticBatchIndex,
					nbBatchesFromThisPartition, articulationConstraintBatchIndex, localArtiJointStartIndex, mArtiConstraintUniqueIndices, artiPreprepIter.begin(), false);

				localArtiJointStartIndex += nbConstraintsToProcess;

				task->setContinuation(continuation);
				task->removeReference();

				for (PxU32 b = 0; b < nbBatchesFromThisPartition; ++b)
				{
					PxU32 val = localArticBatchIndex + b;
					mArti1dConstraintBatchIndices.pushBack(val);
				}

				articulationConstraintBatchIndex += nbBatchesFromThisPartition;
				localArticBatchIndex += nbBatchesFromThisPartition;
			}

			PxU32 localArtiContactStartIndex = mIncrementalPartition.getArtiContactStartIndices()[i];

			//articulation contacts
			// These continue from localArticBatchIndex, i.e. directly after this partition's articulation joint batches.
			for (PxU32 a = 0; a < nbArtiContacts; a += nbArtiConstraintsPerBatch)
			{
				//each contact is a batch
				PxU32 nbContactsToProcess = PxMin(nbArtiContacts - a, nbArtiConstraintsPerBatch);
				PxU32 nbBatchesFromThisPartition = ((nbContactsToProcess + batchMask) / PXG_BATCH_SIZE);		//The number of batches from this partition (groups of 32 constraints)

				PxgCpuArtiConstraintPrePrepTask* task = (PxgCpuArtiConstraintPrePrepTask*)mFlushPool.allocate(sizeof(PxgCpuArtiConstraintPrePrepTask));
				task = PX_PLACEMENT_NEW(task, PxgCpuArtiConstraintPrePrepTask)(artiContactIds, a, nbContactsToProcess, mArticConstraintBatchHeaders + localArticBatchIndex,
					nbBatchesFromThisPartition, articulationContactBatchIndex, localArtiContactStartIndex, mArtiContactUniqueIndices, artiPreprepIter.begin(), true);

				localArtiContactStartIndex += nbContactsToProcess;

				task->setContinuation(continuation);
				task->removeReference();

				for (PxU32 b = 0; b < nbBatchesFromThisPartition; ++b)
				{
					PxU32 val = localArticBatchIndex + b;
					mArtiContactConstraintBatchIndices.pushBack(val);
				}

				articulationContactBatchIndex += nbBatchesFromThisPartition;
				localArticBatchIndex += nbBatchesFromThisPartition;
			}
		}

		// Flush whatever rigid contacts accumulated after the last full-size task.
		if (runningBatchCount > 0)
		{
			//There are remaining unprocessed contact constraints
			PxgCpuContactPrePrepTask* task = (PxgCpuContactPrePrepTask*)mFlushPool.allocate(sizeof(PxgCpuContactPrePrepTask));
			task = PX_PLACEMENT_NEW(task, PxgCpuContactPrePrepTask)(mIncrementalPartition, startIdx, startOffset, runningContactCount, startSlabIter.begin(),
				startBatchOffset, mIncrementalPartition.getContactStartIndices().begin(), mConstraintBatchHeaders, runningBatchCount, contactBatchIndex, mContactUniqueIndices, mOutputIterator,
				mPatchStreamAllocators[mCurrentContactStream]->mStart, mContactStreamAllocators[mCurrentContactStream]->mStart);

			task->setContinuation(continuation);
			task->removeReference();
		}
	}

	// The static articulation pre-prep continues from the articulation batch indices reached above.
	doStaticArticulationConstraintPrePrep(continuation, articulationConstraintBatchIndex, articulationContactBatchIndex);
	doStaticRigidConstraintPrePrep(continuation);

	mGpuSolverCore->releaseContext();
}

// GPU-side constraint pre-prep: rebuilds the per-partition accumulated counts, fills the pre-prep,
// constant and partition descriptors from the context state, uploads them through gpuMemDMAUp()
// and then runs constraintPrePrepParallel().
void PxgGpuContext::doConstraintPrePrepGPUCommon(bool hasForceThresholds)
{
	mLostTouchTask->removeReference();

	const PxU32
nbCombinedSlabPartitions = mIncrementalPartition.getCombinedSlabNbPartitions();

	// Rebuild the per-partition accumulated constraint counts (rigid and articulation).
	{
		// Capacity is grown to twice the requirement whenever it is too small.
		mConstraintsPerPartition.forceSize_Unsafe(0);
		if (mConstraintsPerPartition.capacity() < nbCombinedSlabPartitions)
			mConstraintsPerPartition.reserve(2 * nbCombinedSlabPartitions);

		mArtiConstraintsPerPartition.forceSize_Unsafe(0);
		if (mArtiConstraintsPerPartition.capacity() < nbCombinedSlabPartitions)
			mArtiConstraintsPerPartition.reserve(2 * nbCombinedSlabPartitions);

		for (PxU32 a = 0; a < nbCombinedSlabPartitions; ++a)
		{
			mConstraintsPerPartition.pushBack(mIncrementalPartition.getCSlabAccumulatedPartitionCount(a));
			mArtiConstraintsPerPartition.pushBack(mIncrementalPartition.getCSlabAccumulatedArtiPartitionCount(a));
		}
	}

	// Only the first island context is filled in; it covers every partition and batch.
	mIslandContextPool->mStartPartitionIndex = 0;
	mIslandContextPool->mNumPartitions = nbCombinedSlabPartitions;
	mIslandContextPool->mBatchStartIndex = 0;
	mIslandContextPool->mBatchCount = mIncrementalPartition.getNbConstraintBatches() + mIncrementalPartition.getNbContactBatches();

	mIslandContextPool->mArtiBatchStartIndex = 0;
	mIslandContextPool->mArtiBatchCount = mIncrementalPartition.getNbArtiConstraintBatches() + mIncrementalPartition.getNbArtiContactBatches();
	//mIslandContextPool->mStaticArtiBatchCount = getSimulationController()->getBodySimManager().mTotalArticJoints + getSimulationController()->getBodySimManager().mTotalArticContacts;

	PxgJointManager& jointManager = getSimulationController()->getJointManager();

	const PxU32 gpuRigidJointSize = jointManager.getGpuNbRigidConstraints();
	const PxU32 cpuRigidJointSize = jointManager.getCpuNbRigidConstraints();

	const PxU32 gpuArtiJointSize = jointManager.getGpuNbArtiConstraints();
	const PxU32 cpuArtiJointSize = jointManager.getCpuNbArtiConstraints();

	// Pre-prep descriptor: joint counts, per-category batch counts, unique-index buffers and batch index arrays.
	PxgConstraintPrePrepData ppData;
	ppData.nbGpuRigidJoints = gpuRigidJointSize;
	ppData.nbTotalRigidJoints = gpuRigidJointSize + cpuRigidJointSize;
	ppData.nbGpuArtiJoints = gpuArtiJointSize;
	ppData.nbTotalArtiJoints = gpuArtiJointSize + cpuArtiJointSize;

	ppData.numContactBatches = PxU32(mNumContactBatches);
	ppData.num1dConstraintBatches = PxU32(mNum1dConstraintBatches);
	ppData.numStaticContactBatches = PxU32(mNumStaticRigidContactBatches);
	ppData.numStatic1dConstraintBatches = PxU32(mNumStaticRigid1dConstraintBatches);

	ppData.numArtiContactsBatches = PxU32(mNumArtiContactBatches);
	ppData.numArti1dConstraintBatches = PxU32(mNumArti1dConstraintBatches);
	ppData.numArtiStaticContactsBatches = PxU32(mNumStaticArtiContactBatches);
	ppData.numArtiStatic1dConstraintBatches = PxU32(mNumStaticArti1dConstraintBatches);
	ppData.numArtiSelfContactsBatches = PxU32(mNumSelfArtiContactBatches);
	ppData.numArtiSelf1dConstraintBatches = PxU32(mNumSelfArti1dConstraintBatches);

	ppData.artiStaticConstraintBatchOffset = PxU32(mArtiStaticConstraintBatchOffset);
	ppData.artiStaticContactBatchOffset = PxU32(mArtiStaticContactBatchOffset);

	ppData.contactUniqueIndices = mContactUniqueIndices;
	ppData.constraintUniqueIndices = mConstraintUniqueIndices;
	ppData.artiContactUniqueIndices = mArtiContactUniqueIndices;
	ppData.artiConstraintUniqueindices = mArtiConstraintUniqueIndices;
	ppData.artiStaticConstraintUniqueIndices = mArtiStaticConstraintUniqueIndices;
	ppData.artiStaticContactUniqueIndices = mArtiStaticContactUniqueIndices;
	ppData.artiStaticConstraintStartIndex = mArtiStaticConstraintStartIndex;
	ppData.artiStaticConstraintCount = mArtiStaticConstraintCount;
	ppData.artiStaticContactStartIndex = mArtiStaticContactStartIndex;
	ppData.artiStaticContactCount = mArtiStaticContactCount;

	ppData.constraint1DBatchIndices = m1dConstraintBatchIndices.begin();
	ppData.constraintContactBatchIndices = mContactConstraintBatchIndices.begin();
	ppData.artiConstraint1dBatchindices = mArti1dConstraintBatchIndices.begin();
	ppData.artiConstraintContactBatchIndices = mArtiContactConstraintBatchIndices.begin();

	// Constant-data descriptor: time step, thresholds, bias coefficient and gravity.
	PxgConstantData cData;
	cData.dt = mDt;
	cData.invDtF32 = mInvDt;
	cData.bounceThresholdF32 = mBounceThreshold;
	cData.frictionOffsetThreshold = mFrictionOffsetThreshold;
	cData.correlationDistance = mCorrelationDistance;
	cData.ccdMaxSeparation = mCCDSeparationThreshold;
	cData.biasCoefficient = mIslandContextPool->mBiasCoefficient;
	cData.gravity = mGravity;

	PxgBodySimManager& bodySimManager = getSimulationController()->getBodySimManager();

	// Partition descriptor: the per-partition counts built above plus the overall totals.
	PxgPartitionData pData;
	pData.constraintsPerPartition = mConstraintsPerPartition.begin();
	pData.numConstraintsPerPartition = mConstraintsPerPartition.size();
	pData.artiConstraintsPerPartition = mArtiConstraintsPerPartition.begin();
	pData.numArtiConstraintsPerPartition = mArtiConstraintsPerPartition.size();

	pData.numTotalContacts = mIncrementalPartition.getTotalContacts();
	pData.numTotalStaticConstraints = bodySimManager.mTotalStaticRBJoints;
	pData.numTotalStaticContacts = bodySimManager.mTotalStaticRBContacts;
	pData.numTotalConstraints = mIncrementalPartition.getTotalConstraints();
	pData.numTotalArtiContacts = mIncrementalPartition.getTotalArticulationContacts();
	pData.numTotalArtiConstraints = mIncrementalPartition.getTotalArticulationConstraints();
	pData.numTotalArtiStaticContacts = bodySimManager.mTotalStaticArticContacts;
	pData.numTotalArtiStaticConstraints = bodySimManager.mTotalStaticArticJoints;
	pData.numTotalArtiSelfContacts = bodySimManager.mTotalSelfArticContacts;
	pData.numTotalArtiSelfConstraints = bodySimManager.mTotalSelfArticJoints;

	pData.artiStaticConstraintBatchOffset = mArtiStaticConstraintBatchOffset;
	pData.artiStaticContactBatchOffset = mArtiStaticContactBatchOffset;

	mIslandContextPool->mStaticArtiBatchCount = mNumArtiStaticConstraintBatches;
	mIslandContextPool->mSelfArtiBatchCount = mNumArtiSelfConstraintBatches;
	mIslandContextPool->mStaticRigidBatchCount = mNumRigidStaticConstraintBatches;

	// nbSlabs: partition count divided (rounding up) by the combined-slab maximum, at least 1.
	// nbPartitions: partition count capped at that same maximum.
	const PxU32 maxCombinedSlabPartitions = mIncrementalPartition.getCombinedSlabMaxNbPartitions();
	const PxU32 nbSlabs = PxMax(1u, (mIncrementalPartition.getNbPartitions() + maxCombinedSlabPartitions - 1) / maxCombinedSlabPartitions);
	const PxU32 nbPartitions = PxMin(mIncrementalPartition.getNbPartitions(), maxCombinedSlabPartitions);

	mGpuArticulationCore->allocDeltaVBuffer(nbSlabs, nbPartitions, mGpuSolverCore->getStream());

	// Upload everything prepared above. The four size expressions after cData scale the rigid and
	// articulation contact batch totals by PXG_MAX_NUM_POINTS_PER_CONTACT_PATCH and by 4u respectively.
	// NOTE(review): the three reinterpret_cast expressions in this call have no target type in this
	// copy of the file -- the template arguments look like they were lost. Restore them from the
	// upstream source before compiling.
	mGpuSolverCore->gpuMemDMAUp(*mPinnedMemoryAllocator, ppData, mSolverBodyPool.size(),
		mConstraintBatchHeaders, mIslandContextPool, mNumIslandContextPool, pData,
		mNumConstraintBatches, mNumRigidStaticConstraintBatches, mNumArticConstraintBatches,
		mNumArtiStaticConstraintBatches, mNumArtiSelfConstraintBatches, cData,
		PXG_MAX_NUM_POINTS_PER_CONTACT_PATCH * (mNumContactBatches + mNumStaticRigidContactBatches),
		4u * (mNumContactBatches + mNumStaticRigidContactBatches),
		PXG_MAX_NUM_POINTS_PER_CONTACT_PATCH * (mNumArtiContactBatches + mNumStaticArtiContactBatches + mNumSelfArtiContactBatches),
		4u * (mNumArtiContactBatches + mNumStaticArtiContactBatches + mNumSelfArtiContactBatches),
		mTotalEdges, mTotalPreviousEdges, nbSlabs, maxCombinedSlabPartitions, mEnableStabilization,
		mPatchStreamAllocators[mCurrentContactStream]->mStart, mContactStreamAllocators[mCurrentContactStream]->mStart,
		mForceStreamAllocator->mStart, mOutputIterator,
		mSolverBodyPool.size() - (mKinematicCount + 1), mKinematicCount + 1, mArticulationCount,
		reinterpret_cast(mGpuArticulationCore->getDeferredZ()),
		reinterpret_cast(mGpuArticulationCore->getArticulationDirty()),
		reinterpret_cast(mGpuArticulationCore->getArticulationSlabMask()),
		mGPUShapeInteractions, mGPURestDistances, mGPUTorsionalData,
		mArtiStaticContactIndices.begin(), mArtiStaticContactIndices.size(),
		mArtiStaticJointIndices.begin(), mArtiStaticJointIndices.size(),
		mArtiStaticContactCounts.begin(), mArtiStaticJointCounts.begin(),
		mArtiSelfContactIndices.begin(), mArtiSelfContactIndices.size(),
		mArtiSelfJointIndices.begin(), mArtiSelfJointIndices.size(),
		mArtiSelfContactCounts.begin(), mArtiSelfJointCounts.begin(),
		mRigidStaticContactIndices.begin(), mRigidStaticContactIndices.size(),
		mRigidStaticJointIndices.begin(), mRigidStaticJointIndices.size(),
mRigidStaticContactCounts.begin(), mRigidStaticJointCounts.begin(), mLengthScale, hasForceThresholds); //Make sure that the GPU articulation work has completed now... mGpuArticulationCore->syncUnconstrainedVelocities(); mGpuArticulationCore->layoutDeltaVBuffer(nbSlabs, nbPartitions, mGpuSolverCore->getStream()); mGpuArticulationCore->createStaticContactAndConstraintsBatch(mArticulationCount); mGpuSolverCore->constraintPrePrepParallel(mNumConstraintBatches + mNumRigidStaticConstraintBatches + mNumArticConstraintBatches + mNumArtiStaticConstraintBatches + mNumArtiSelfConstraintBatches, gpuRigidJointSize + gpuArtiJointSize, mIslandContextPool->mBodyCount); } void PxgCpuJointPrePrepTask::runInternal() { PxU32 endIndex = mStartIndex + mNbToProcess; Px1DConstraint tempRows[Dy::MAX_CONSTRAINT_ROWS]; for (PxU32 i = mStartIndex; i < endIndex; ++i) { const Dy::Constraint* constraint = mConstraints[i]; const PxConstraintSolverPrep solverPrep = constraint->solverPrep; if (!solverPrep) continue; const PxTransform& pose0 = (constraint->body0 ? constraint->body0->getPose() : PxTransform(PxIdentity)); const PxTransform& pose1 = (constraint->body1 ? constraint->body1->getPose() : PxTransform(PxIdentity)); const void* constantBlock = constraint->constantBlock; PxgConstraintData& data = mConstraintData[i]; //Px1DConstraint* rows = &rowIter[i*Dy::MAX_CONSTRAINT_ROWS]; PxMemZero(tempRows, sizeof(Px1DConstraint)*Dy::MAX_CONSTRAINT_ROWS); for (PxU32 j = 0; j < Dy::MAX_CONSTRAINT_ROWS; j++) { Px1DConstraint& c = tempRows[j]; c.minImpulse = -PX_MAX_REAL; c.maxImpulse = PX_MAX_REAL; } PxConstraintInvMassScale ims(1.0f, 1.0f, 1.0f, 1.0f); PxVec3p ra, rb; PxVec3p body0WorldOffset(0.0f); //TAG:solverprepcall const PxU32 numRows = (constraint->flags & PxConstraintFlag::eDISABLE_CONSTRAINT) ? 
0 :(*solverPrep)(tempRows, body0WorldOffset, Dy::MAX_CONSTRAINT_ROWS, ims, constantBlock, pose0, pose1, !!(constraint->flags & PxConstraintFlag::eENABLE_EXTENDED_LIMITS), ra, rb); data.mNumRows_Flags_StartIndex.x = numRows; if (numRows == 0) continue; ra -= pose0.p; rb -= pose1.p; data.mInvMassScale.linear0 = ims.linear0; data.mInvMassScale.angular0 = ims.angular0; data.mInvMassScale.linear1 = ims.linear1; data.mInvMassScale.angular1 = ims.angular1; data.mRaWorld_linBreakForceW = make_float4(ra.x, ra.y, ra.z, constraint->linBreakForce); data.mRbWorld_angBreakForceW = make_float4(rb.x, rb.y, rb.z, constraint->angBreakForce); data.mNumRows_Flags_StartIndex.y = constraint->flags; PxI32 startRowIndex = PxAtomicAdd(mRowCounts, PxI32(numRows)) - PxI32(numRows); PxMemCopy(mConstraintRows + startRowIndex, tempRows, sizeof(Px1DConstraint) * numRows); data.mNumRows_Flags_StartIndex.z = mGpuJointOffset + startRowIndex; } } void PxgGpuContext::cpuJointPrePrepTask(physx::PxBaseTask* continuation) { PxgJointManager& jointManager = getSimulationController()->getJointManager(); // AD: This could also be skipped with direct-GPU API, but at this point the constraints are already partitioned and I // cannot figure out how to remove the CPU joints from there again. 
const PxArray& cpuRigidConstraints = jointManager.getCpuRigidConstraints(); const PxArray& cpuArtiConstraints = jointManager.getCpuArtiConstraints(); const PxU32 nbCpuRigidConstraints = cpuRigidConstraints.size(); const PxU32 nbCpuArtiConstraints = cpuArtiConstraints.size(); const PxU32 gpuRigidJointOutputOffset = jointManager.getGpuNbRigidConstraints() * Dy::MAX_CONSTRAINT_ROWS; const PxU32 nbJointsPerTask = 128u; // PT: TODO: revisit //for other joint for (PxU32 a = 0; a < nbCpuRigidConstraints; a += nbJointsPerTask) { const PxU32 nbToProcess = PxMin(nbCpuRigidConstraints - a, nbJointsPerTask); PxgCpuJointPrePrepTask* task = reinterpret_cast(mFlushPool.allocate(sizeof(PxgCpuJointPrePrepTask))); task = PX_PLACEMENT_NEW(task, PxgCpuJointPrePrepTask)(*getSimulationController(), a, nbToProcess, gpuRigidJointOutputOffset, cpuRigidConstraints.begin(), jointManager.getCpuRigidConstraintData().begin(), jointManager.getCpuRigidConstraintRows().begin(), &jointManager.mNbCpuRigidConstraintRows); task->setContinuation(continuation); task->removeReference(); } const PxU32 gpuArtiJointOutputOffset = jointManager.getGpuNbArtiConstraints() * Dy::MAX_CONSTRAINT_ROWS; for (PxU32 a = 0; a < nbCpuArtiConstraints; a += nbJointsPerTask) { const PxU32 nbToProcess = PxMin(nbCpuArtiConstraints - a, nbJointsPerTask); PxgCpuJointPrePrepTask* task = reinterpret_cast(mFlushPool.allocate(sizeof(PxgCpuJointPrePrepTask))); task = PX_PLACEMENT_NEW(task, PxgCpuJointPrePrepTask)(*getSimulationController(), a, nbToProcess, gpuArtiJointOutputOffset, cpuArtiConstraints.begin(), jointManager.getCpuArtiConstraintData().begin(), jointManager.getCpuArtiConstraintRows().begin(), &jointManager.mNbCpuArtiConstraintRows); task->setContinuation(continuation); task->removeReference(); } } // This class figures out the max iteration counts for all actors, // and prepares some data for kinematics. 
void PxgCpuPreIntegrationTask::runInternal() { mContext.doPreIntegrationTaskCommon(mCont); } void PxgCpuContactPrePrepTask::runInternal() { PX_PROFILE_ZONE("GpuDynamics.PxgCpuContactPrePrepTask", 0); const PxU32 nbToProcess = mNumBatches; PxU32 nbProcessed = 0; PxU32 partitionIdx = mPartitionIndex; PxU32 partitionStartIdx = mStartIndexWithinPartition; PxU32 startSlabOffset = mStartSlabOffset; PxU32 workUnitIndex = mWorkUnitStartIndex; while (nbProcessed < nbToProcess) { //Extract current partition const Partition& partition = mPartition.getPartitionSlabs()[partitionIdx / PXG_BATCH_SIZE]->mPartitions[partitionIdx&(PXG_BATCH_SIZE - 1)]; //Get edgeIndices corresponding to this partition offset by partitionStartIdx const PartitionIndices& edgeIds = partition.mPartitionIndices[PxgEdgeType::eCONTACT_MANAGER];// +partitionStartIdx; //Factor in joint constraints to work out offsets in this partition. As this task can now process multiple partitions, //it is easiest just to compute them again here { const PxU32 nbConstraints = partition.mPartitionIndices[PxgEdgeType::eCONSTRAINT].size(); const PxU32 nbBatches = (nbConstraints + 31u) / PXG_BATCH_SIZE; startSlabOffset += nbBatches; } const PxU32 batchIndex = mStartSlabIter[partitionIdx] + startSlabOffset; const PxU32 uniqueStartIndex = mContactStartIndices[partitionIdx] + partitionStartIdx; //The number we process in this partition is equal to the smaller of (nbToProcess - nbProcessed) and (size of partition - startOffsetInPartition). 
const PxU32 nbRemaining = partition.mPartitionIndices[PxgEdgeType::eCONTACT_MANAGER].size() - partitionStartIdx; //Convert from constraints to batches const PxU32 nbBatchesToProcess = PxMin((nbToProcess - nbProcessed), (nbRemaining + 31) / PXG_BATCH_SIZE); PxU32 currentEdgeIndex = 0; for (PxU32 a = 0; a < nbBatchesToProcess; ++a) { const PxU32 descStride = PxMin(nbRemaining - currentEdgeIndex, PXG_BATCH_SIZE); PxgConstraintBatchHeader& batchHeader = mBatchHeaders[a + batchIndex]; batchHeader.constraintType = PxgSolverConstraintDesc::eCONTACT; batchHeader.mDescStride = PxU16(descStride); batchHeader.mConstraintBatchIndex = workUnitIndex++; batchHeader.mStartPartitionIndex = uniqueStartIndex + a * PXG_BATCH_SIZE; batchHeader.mask = 0xFFFFFFFF; //Unused #if PXG_CONTACT_VALIDATION validateContactPairs(a, a + descStride, edgeIds + a, mNpIds, mOutputIterator, mBaseContactPatch, mBaseContactPointer); #endif currentEdgeIndex += descStride; } for (PxU32 i = 0; i < nbRemaining; ++i) { const PxU32 uniqueId = edgeIds[i + partitionStartIdx]; mPinnedEdgeIds[uniqueStartIndex + i] = uniqueId; } nbProcessed += nbBatchesToProcess; partitionIdx++; partitionStartIdx = 0; startSlabOffset = 0; //PxMemCopy(mPinnedEdgeIds + uniqueStartIndex, edgeIds, sizeof(PxU32) * nbRemaining); } } void PxgGpuContext::allocateTempPinnedSolverMemoryCommon() { // AD: two stages. // 1. first figure out how much we need. Allocate PxMax(sizeNeeded, PxGpuDynamicsMemoryConfig::tempBufferCapacity). // 2. suballocate and set the pointers. // AD: old comment that moved here when outlining into a separate function. I don't know how relevant this still is. // KS - this may be over-allocating because, at this stage, we only know (1) how many articulation static contacts // we have in total, (2) how many is the max a given articulation has and (3) how many articulations we have. // We allocate the minimum of maxBatches * numArticulations, totalContacts. 
We will likely require less than // both of these counts, but this provides us with an upper-bound... // this code operates under the assumption that we only have 1 solver island on GPU. PxU64 sizeNeeded = 0; const PxU32 alignment = 128; // GPU cache line size. const PxU32 totalIslands = 1; const PxU64 totalIslandsAllocationSize = (totalIslands * sizeof(PxgIslandContext)) + alignment; sizeNeeded += totalIslandsAllocationSize; mNumConstraintBatches = mIncrementalPartition.getNbConstraintBatches() + mIncrementalPartition.getNbContactBatches(); PxgBodySimManager& bodyManager = getSimulationController()->getBodySimManager(); const PxU32 maxStaticRigidJoints = bodyManager.mMaxStaticRBJoints; const PxU32 maxStaticRigidContacts = bodyManager.mMaxStaticRBContacts; const PxU32 nbRigidBatches = (mBodyCount + PXG_BATCH_SIZE - 1) / PXG_BATCH_SIZE; const PxU32 totalStaticRigidContacts = bodyManager.mTotalStaticRBContacts; const PxU32 totalStaticRigidJoints = bodyManager.mTotalStaticRBJoints; mNumStaticRigidContactBatches = PxMin(maxStaticRigidContacts * nbRigidBatches, totalStaticRigidContacts); mNumStaticRigid1dConstraintBatches = PxMin(maxStaticRigidJoints * nbRigidBatches, totalStaticRigidJoints); mNumRigidStaticConstraintBatches = (mNumStaticRigidContactBatches + mNumStaticRigid1dConstraintBatches); mNumArticConstraintBatches = mIncrementalPartition.getNbArtiConstraintBatches() + mIncrementalPartition.getNbArtiContactBatches(); const PxU32 nbArticBatches = (mArticulationCount + PXG_BATCH_SIZE - 1) / PXG_BATCH_SIZE; const PxU32 maxStaticArticJoints = bodyManager.mMaxStaticArticJoints; const PxU32 maxStaticArticContacts = bodyManager.mMaxStaticArticContacts; const PxU32 totalStaticArticulationContacts = bodyManager.mTotalStaticArticContacts; const PxU32 totalStaticArticulationJoints = bodyManager.mTotalStaticArticJoints; mNumStaticArtiContactBatches = PxMin(maxStaticArticContacts * nbArticBatches, totalStaticArticulationContacts); mNumStaticArti1dConstraintBatches = 
PxMin(maxStaticArticJoints * nbArticBatches, totalStaticArticulationJoints); mNumArtiStaticConstraintBatches = (mNumStaticArtiContactBatches + mNumStaticArti1dConstraintBatches); const PxU32 maxSelfArticJoints = bodyManager.mMaxSelfArticJoints; const PxU32 maxSelfArticContacts = bodyManager.mMaxSelfArticContacts; const PxU32 totalSelfArticulationContacts = bodyManager.mTotalSelfArticContacts; const PxU32 totalSelfArticulationJoints = bodyManager.mTotalSelfArticJoints; mNumSelfArtiContactBatches = PxMin(maxSelfArticContacts * nbArticBatches, totalSelfArticulationContacts); mNumSelfArti1dConstraintBatches = PxMin(maxSelfArticJoints * nbArticBatches, totalSelfArticulationJoints); mNumArtiSelfConstraintBatches = (mNumSelfArtiContactBatches + mNumSelfArti1dConstraintBatches); const PxU64 allocationSizeConstraintBatchHeader = sizeof(PxgConstraintBatchHeader) * (mNumConstraintBatches + mNumRigidStaticConstraintBatches + mNumArticConstraintBatches + mNumArtiStaticConstraintBatches + mNumArtiSelfConstraintBatches); const PxU64 allocationSizeConstraintBatchHeaderAligned = allocationSizeConstraintBatchHeader + alignment; sizeNeeded += allocationSizeConstraintBatchHeaderAligned; const PxU32 totalJoints = mIncrementalPartition.getTotalConstraints(); const PxU32 totalContacts = mIncrementalPartition.getTotalContacts(); const PxU32 totalArticulationJoints = mIncrementalPartition.getTotalArticulationConstraints(); const PxU32 totalArticulationContacts = mIncrementalPartition.getTotalArticulationContacts(); //Unique Indices layout is joint->contact->artiJoint->artiContact const PxU64 allocationSizeUniqueIndices = (totalJoints + totalContacts + totalArticulationJoints + totalArticulationContacts + totalStaticArticulationJoints + totalStaticArticulationContacts + totalSelfArticulationContacts + totalSelfArticulationJoints + totalStaticRigidContacts + totalStaticRigidJoints) * sizeof(PxU32); const PxU64 allocationSizeUniqueIndicesAligned = allocationSizeUniqueIndices + alignment; 
sizeNeeded += allocationSizeUniqueIndicesAligned; const PxU64 allocationSizeArticulationCount = mArticulationCount * 4 * sizeof(PxU32); const PxU64 allocationSizeArticulationCountAligned = allocationSizeArticulationCount + alignment; sizeNeeded += allocationSizeArticulationCountAligned; const PxU64 allocationSizeBodyCount = mBodyCount * 2 * sizeof(PxU32); const PxU64 allocationSizeBodyCountAligned = allocationSizeBodyCount + alignment; sizeNeeded += allocationSizeBodyCountAligned; // descriptors are part of the solvercore sizeNeeded += mGpuSolverCore->getDescriptorsAllocationSize(); // phase 2 - actually allocate the memory mPinnedMemoryAllocator->reserveAndGrow(static_cast(sizeNeeded)); #if PX_ENABLE_SIM_STATS mSimStats.mGpuDynamicsTempBufferCapacity = PxMax(sizeNeeded,mSimStats.mGpuDynamicsTempBufferCapacity); #else PX_CATCH_UNDEFINED_ENABLE_SIM_STATS #endif mIslandContextPool = reinterpret_cast(mPinnedMemoryAllocator->allocate(totalIslands * sizeof(PxgIslandContext), alignment)); mConstraintBatchHeaders = reinterpret_cast(mPinnedMemoryAllocator->allocate(allocationSizeConstraintBatchHeader, alignment)); mArticConstraintBatchHeaders = mConstraintBatchHeaders + mNumConstraintBatches; mConstraintUniqueIndices = reinterpret_cast(mPinnedMemoryAllocator->allocate(allocationSizeUniqueIndices, alignment)); mRigidStaticConstraintUniqueIndices = mConstraintUniqueIndices + totalJoints; mArtiConstraintUniqueIndices = mRigidStaticConstraintUniqueIndices + totalStaticRigidJoints; mArtiStaticConstraintUniqueIndices = mArtiConstraintUniqueIndices + totalArticulationJoints; mArtiSelfConstraintUniqueIndices = mArtiStaticConstraintUniqueIndices + totalStaticArticulationJoints; mContactUniqueIndices = mArtiSelfConstraintUniqueIndices + totalSelfArticulationJoints; mRigidStaticContactUniqueIndices = mContactUniqueIndices + totalContacts; mArtiContactUniqueIndices = mRigidStaticContactUniqueIndices + totalStaticRigidContacts; mArtiStaticContactUniqueIndices = 
mArtiContactUniqueIndices + totalArticulationContacts; mArtiSelfContactUniqueIndices = mArtiStaticContactUniqueIndices + totalStaticArticulationContacts; mArtiStaticConstraintStartIndex = reinterpret_cast(mPinnedMemoryAllocator->allocate(allocationSizeArticulationCount, alignment)); mArtiStaticConstraintCount = mArtiStaticConstraintStartIndex + mArticulationCount; mArtiStaticContactStartIndex = mArtiStaticConstraintCount + mArticulationCount; mArtiStaticContactCount = mArtiStaticContactStartIndex + mArticulationCount; mRigidStaticConstraintStartIndex = reinterpret_cast(mPinnedMemoryAllocator->allocate(allocationSizeBodyCount, alignment)); mRigidStaticConstraintCount = mRigidStaticConstraintStartIndex + mBodyCount; mGpuSolverCore->allocatePinnedDescriptors(*mPinnedMemoryAllocator); } // PT: TODO: un-indent all of the above void PxgGpuContext::doConstraintPrepGPU() { PX_PROFILE_ZONE("GpuDynamics.ConstraintPrep", 0); /** * Things to do in here: * (1) constraint prep on GPU */ mGpuSolverCore->resetVelocities(mIsTGS); mGpuSolverCore->nonRigidConstraintPrepare(mArticulationCount); mGpuSolverCore->jointConstraintPrepareParallel(PxU32(mNum1dConstraintBatches + mNumStaticRigid1dConstraintBatches)); mGpuSolverCore->contactConstraintPrepareParallel(PxU32(mNumContactBatches + mNumStaticRigidContactBatches)); mGpuSolverCore->artiJointConstraintPrepare(PxU32(mNumArti1dConstraintBatches + mNumStaticArti1dConstraintBatches + mNumSelfArti1dConstraintBatches)); mGpuSolverCore->artiContactConstraintPrepare(PxU32(mNumArtiContactBatches + mNumStaticArtiContactBatches + mNumSelfArtiContactBatches)); mGpuArticulationCore->precomputeDependencies(PxMin(mIncrementalPartition.getNbPartitions(), mIncrementalPartition.getCombinedSlabMaxNbPartitions())); } void PxgGpuContext::doPreIntegrationGPU() { const PxU32 offset = 1 + mKinematicCount; mGpuSolverCore->preIntegration(offset, mSolverBodyPool.size(), mDt, mGravity); if(mIsTGS) mIslandContextPool->mBiasCoefficient = PxMin(0.9f, 2.0f * 
PxSqrt(1.0f / mIslandContextPool->mNumPositionIterations)); } void PxgGpuContext::doArticulationGPU() { if(mIsTGS) { mGpuArticulationCore->computeUnconstrainedVelocities(mArticulationStartIndex, mArticulationCount, mDt, mGravity, 1.0f/mLengthScale, mIsExternalForcesEveryTgsIterationEnabled, mRecomputeArticulationBlockFormat); } else { mGpuArticulationCore->computeUnconstrainedVelocities(mArticulationStartIndex, mArticulationCount, mDt, mGravity, 1.0f/mLengthScale, false, mRecomputeArticulationBlockFormat); mGpuArticulationCore->setupInternalConstraints(mArticulationCount, mDt, mDt, 1.0f / mDt, false); } } void PxgGpuContext::doSoftbodyGPU() { PxgSoftBodyCore* softBodyCore = static_cast(mSimulationController)->getSoftBodyCore(); if(softBodyCore) softBodyCore->updateTetraRotations(); } void PxgGpuContext::doFEMClothGPU() { // "I quickly checked, and it currently only resets Lagrange multiplier lambda used in the PBD framework. // For TGS, we don't use the Lagrange multiplier so no need to reset. Calling it on PGS only sounds okay to me." if(!mIsTGS) { PxgFEMClothCore* femClothCore = static_cast(mSimulationController)->getFEMClothCore(); if(femClothCore) femClothCore->preIteration(); } } void PxgGpuContext::doConstraintPrePrepGPU() { if(mIsTGS) { //Kick off articulation internal constraint setup code. At this point, we know the iteration count so we //know how large time-steps will be. 
const PxReal stepDt = mDt / PxReal(mIslandContextPool->mNumPositionIterations); mGpuArticulationCore->setupInternalConstraints(mArticulationCount, stepDt, mDt, 1.0f / stepDt, true); } doConstraintPrePrepGPUCommon(mHasForceThresholds); } void PxgPostSolveTask::runInternal() { mContext.doPostSolveTask(mCont); } //This class kicks off constraint solve on GPU void PxgGpuTask::runInternal() { mContext.mGpuSolverCore->acquireContext(); mContext.doConstraintJointBlockPrePrepGPU(); mContext.doConstraintPrepGPU(); mContext.doConstraintSolveGPU(mMaxNodes, *mChangedHandleMap); mContext.mGpuSolverCore->releaseContext(); } void PxgGpuIntegrationTask::runInternal() { mContext.mGpuSolverCore->acquireContext(); //for articulation mContext.doArticulationGPU(); //for soft body update rotation mContext.doSoftbodyGPU(); //for FEM-cloth mContext.doFEMClothGPU(); mContext.mGpuSolverCore->releaseContext(); } void PxgGpuPrePrepTask::runInternal() { mContext.mGpuSolverCore->acquireContext(); mContext.doPreIntegrationGPU(); //for d6 joint mContext.doConstraintPrePrepGPU(); PxgJointManager& jointManager = mContext.getSimulationController()->getJointManager(); jointManager.reserveMemory(Dy::MAX_CONSTRAINT_ROWS); mContext.mGpuSolverCore->releaseContext(); mContext.cpuJointPrePrepTask(mCont); } void PxgGpuContext::updateBodyCore(PxBaseTask* continuation) { mPostSolveTask.setContinuation(continuation); mPostSolveTask.removeReference(); } //#define PXG_INCREMENTAL_SANITY_CHECKS #if PX_ENABLE_ASSERTS #ifdef PXG_INCREMENTAL_SANITY_CHECKS template static bool noDuplicates(T* buffer, const PxU32 size) { for (PxU32 a = 0; a < size; ++a) { for (PxU32 b = 0; b < a; ++b) { if (buffer[a] == buffer[b]) return false; } } return true; } #else template static bool noDuplicates(T*, const PxU32) { return true; } #endif #endif static PX_FORCE_INLINE bool needsSolve(IG::IslandSim& islandSim, PxU32 bodyCount, PxU32 articulationCount) { const PxU32 particleCount = 
islandSim.getNbActiveNodes(IG::Node::ePARTICLESYSTEM_TYPE); const PxU32 clothCount = islandSim.getNbActiveNodes(IG::Node::eDEFORMABLE_SURFACE_TYPE); const PxU32 softBodyCount = islandSim.getNbActiveNodes(IG::Node::eDEFORMABLE_VOLUME_TYPE); const bool needsSolve = (0 != bodyCount || 0 != articulationCount || particleCount || softBodyCount || clothCount); return needsSolve; } void PxgGpuContext::update( Cm::FlushPool& flushPool, PxBaseTask* continuation, PxBaseTask* postPartitioningTask, PxBaseTask* /*lostTouchTask*/, PxvNphaseImplementationContext* nphase, PxU32 /*maxPatchesPerCM*/, PxU32 /*maxArticulationLinks*/, PxReal dt, const PxVec3& gravity, PxBitMapPinned& /*changedHandleMap*/) { mGpuSolverCore->acquireContext(); PxsContactManagerOutputIterator iterator = nphase->getContactManagerOutputs(); PxsContactManagerOutput* gpuContactManagerOutputs = nphase->getGPUContactManagerOutputBase(); mGPURestDistances = nphase->getGPURestDistances(); mGPUShapeInteractions = nphase->getGPUShapeInteractions(); mGPUTorsionalData = nphase->getGPUTorsionalData(); mSolvedThisFrame = false; mOutputIterator = iterator; PX_ASSERT(noDuplicates(nphase->getLostFoundPatchManagers(), nphase->getNbLostFoundPatchManagers())); //First and foremost, we need to get a set of islands (bodies, constraints etc.) 
//These will be parameters IG::IslandSim& islandSim = mIslandManager.getAccurateIslandSim(); const PxU32 bodyCount = islandSim.getNbActiveNodes(IG::Node::eRIGID_BODY_TYPE); const PxU32 articulationCount = islandSim.getNbActiveNodes(IG::Node::eARTICULATION_TYPE); mGpuSolverCore->setGpuContactManagerOutputBase(gpuContactManagerOutputs); if(!mIsTGS) mGpuSolverCore->syncSimulationController(); // PT: for some reason it's located here in PGS const PxU32 kinematicCount = islandSim.getNbActiveKinematics(); mKinematicCount = kinematicCount; mArticulationCount = articulationCount; mArticulationStartIndex = 1 + kinematicCount + bodyCount; mRecomputeArticulationBlockFormat = getSimulationController()->getRecomputeArticulationBlockFormat(); mBodyCount = bodyCount; mPinnedMemoryAllocator->reset(); #if PX_ENABLE_SIM_STATS mSimStats.mNbActiveKinematicBodies = islandSim.getNbActiveKinematics(); mSimStats.mNbActiveDynamicBodies = islandSim.getNbActiveNodes(IG::Node::eRIGID_BODY_TYPE); mSimStats.mNbActiveConstraints = islandSim.getNbActiveEdges(IG::Edge::eCONSTRAINT); mSimStats.mNbPartitions = mIncrementalPartition.getNbPartitions(); #else PX_CATCH_UNDEFINED_ENABLE_SIM_STATS #endif //mConstraintWriteBackStreamAllocator->reserve(sizeof(Dy::ConstraintWriteback) * nbConstraints); mConstraintsPerPartition.forceSize_Unsafe(0); mDt = dt; mInvDt = 1.f / dt; mGravity = gravity; //mEnableStabilization = enableStabilization; if(mIsTGS) mGpuSolverCore->syncSimulationController(); { PX_PROFILE_ZONE("Dynamics.allocateBodyBuffers", 0); const PxU32 maxLinks = getSimulationController()->getSimulationCore()->getMaxArticulationLinks(); const PxU32 maxDofs = getSimulationController()->getSimulationCore()->getMaxArticulationDofs(); const PxU32 totalLinkJointRootStateByteSize = PxgArticulationLinkJointRootStateData::computeStateDataBufferByteSizeAligned16(maxLinks, maxDofs, articulationCount); if (totalLinkJointRootStateByteSize > mLinkAndJointAndRootStateDataPool.capacity()) { 
mLinkAndJointAndRootStateDataPool.forceSize_Unsafe(0); mLinkAndJointAndRootStateDataPool.reserve(totalLinkJointRootStateByteSize); } if (articulationCount > mArticulationSleepDataPool.capacity()) { mArticulationSleepDataPool.forceSize_Unsafe(0); mArticulationSleepDataPool.reserve(articulationCount); } if (articulationCount*2 > mInternalResidualPerArticulationVelIter.capacity()) { mInternalResidualPerArticulationVelIter.forceSize_Unsafe(0); mInternalResidualPerArticulationVelIter.reserve(articulationCount*2); } if (articulationCount*2 > mInternalResidualPerArticulationPosIter.capacity()) { mInternalResidualPerArticulationPosIter.forceSize_Unsafe(0); mInternalResidualPerArticulationPosIter.reserve(articulationCount*2); } mLinkAndJointAndRootStateDataPool.forceSize_Unsafe(totalLinkJointRootStateByteSize); mArticulationSleepDataPool.forceSize_Unsafe(articulationCount); mInternalResidualPerArticulationVelIter.forceSize_Unsafe(articulationCount * 2); mInternalResidualPerArticulationPosIter.forceSize_Unsafe(articulationCount * 2); //1: Allocate buffers for all bodies (kinematic + dynamic) if ((kinematicCount + bodyCount + 1) > mSolverBodyPool.capacity()) { //we don't need to dma up/back dynamic solver body data to gpu anymore. 
However, we still need to dma up static/kinematic solver body const PxU32 totalBodyAlignedCounts = (kinematicCount + bodyCount + 31 + 1) & (~31); mSolverBodyPool.forceSize_Unsafe(0); mSolverBodyPool.reserve(totalBodyAlignedCounts); mBody2WorldPool.forceSize_Unsafe(0); mBody2WorldPool.reserve(totalBodyAlignedCounts); mSolverBodyDataPool.forceSize_Unsafe(0); mSolverBodySleepDataPool.forceSize_Unsafe(0); mSolverBodySleepDataPool.reserve(totalBodyAlignedCounts); mSolverTxIDataPool.forceSize_Unsafe(0); mSolverTxIDataPool.reserve(totalBodyAlignedCounts); } if ((kinematicCount + bodyCount + 1 + articulationCount) > mActiveNodeIndex.capacity()) { const PxU32 totalArticulationAlignedCounts = (kinematicCount + bodyCount + 1 + articulationCount + 31) & (~31); mActiveNodeIndex.forceSize_Unsafe(0); mActiveNodeIndex.reserve(totalArticulationAlignedCounts); } if ((kinematicCount + 31 + 1) > mSolverBodyDataPool.capacity()) { mSolverBodyDataPool.reserve((kinematicCount + 31 + 1) & (~31)); } mActiveNodeIndex.forceSize_Unsafe(1 + kinematicCount + bodyCount + articulationCount); //Set up constraint batches const PxU32 totalBodySize = 1 + kinematicCount + bodyCount; mSolverBodyPool.forceSize_Unsafe(totalBodySize); mBody2WorldPool.forceSize_Unsafe(totalBodySize); //we don't need to create dynamic solver body data in cpu anymore mSolverBodyDataPool.forceSize_Unsafe(1 + kinematicCount); //we need to dma up static+kinematic part of the sleepData and we dma up the whole sleepData array mSolverBodySleepDataPool.forceSize_Unsafe(totalBodySize); mSolverTxIDataPool.forceSize_Unsafe(totalBodySize); } if (getEnableDirectGPUAPI()) { getSimulationController()->getJointManager().reserveMemoryPreAddRemove(); } if (needsSolve(islandSim, bodyCount, articulationCount)) { //Set up gpu workloads early!!! 
const PxNodeIndex* const PX_RESTRICT nodeIndices = islandSim.getActiveNodes(IG::Node::eRIGID_BODY_TYPE); const PxNodeIndex* const PX_RESTRICT articulationNodeIndices = islandSim.getActiveNodes(IG::Node::eARTICULATION_TYPE); PxMemCopy(mActiveNodeIndex.begin() + 1, islandSim.getActiveKinematics(), islandSim.getNbActiveKinematics() * sizeof(PxNodeIndex)); PxMemCopy(mActiveNodeIndex.begin() + 1 + kinematicCount, nodeIndices, sizeof(PxNodeIndex) * mBodyCount); PxMemCopy(mActiveNodeIndex.begin() + mArticulationStartIndex, articulationNodeIndices, sizeof(PxNodeIndex) * mArticulationCount); mActiveNodeIndex[0] = PxNodeIndex(); PxgSimulationController* controller = static_cast(mSimulationController); const PxU32 maxLinks = controller->getMaxLinks(); //DMA up the body data right now and any other data that might be available mGpuSolverCore->allocateSolverBodyBuffers(mIslandManager.getNbNodeHandles() + 1, mActiveNodeIndex, mArticulationCount, maxLinks); mSolvedThisFrame = true; //solver task chain! //Note - *all* work for *all* islands is processed in phases using a wide-model approach. //This is friendlier for the GPU but can be more wasteful in terms of memory mGpuTask.setContinuation(continuation); mGpuPrePrepTask.setContinuation(&mGpuTask); mPrepTask.setContinuation(&mGpuPrePrepTask); mPreIntegrationTask.setContinuation(&mPrepTask); mGpuIntegrationTask.setContinuation(&mGpuPrePrepTask); //Set up world rigid body mSolverBodyPool[0] = mWorldSolverBody; mSolverBodyDataPool[0] = mWorldSolverBodyData; mSolverTxIDataPool[0] = mWorldTxIData; mSolverBodySleepDataPool[0] = mWorldSolverBodySleepData; // these two are being launched immediately. mGpuIntegrationTask.removeReference(); mPreIntegrationTask.removeReference(); } // PT: when updateIncrementalIslands() is single-threaded this is a blocking call and we can use the // partitioning data when it returns. This is not the case anymore with multi-threaded implementations. 
// doConstraintPrePrepCommon() consumes the output of the incremental island building as part of mPrepTask mIncrementalPartition.updateIncrementalIslands( mIslandManager.getAccurateIslandSim(), mIslandManager.getAuxCpuData(), &flushPool, postPartitioningTask, mOutputIterator, // PT: don't pass the local variable, it will go out of scope while the partitioning tasks are using it getSimulationController()->getBodySimManager(), getSimulationController()->getJointManager()); // PT: all the code after the updateIncrementalIslands() call has been moved to PxgGpuContext::updatePostPartitioning() where // it can safely be executed after the potential updateIncrementalIslands() tasks are completed. mGpuSolverCore->releaseContext(); } void PxgGpuContext::updatePostPartitioning(PxBaseTask* lostTouchTask, PxvNphaseImplementationContext* /*nphase*/, PxU32 maxPatchesPerCM, PxU32 /*maxArticulationLinks*/, PxReal /*dt*/, const PxVec3& /*gravity*/, PxBitMapPinned& changedHandleMap) { mGpuSolverCore->acquireContext(); IG::IslandSim& islandSim = mIslandManager.getAccurateIslandSim(); const PxPinnedArray& partitionIndexDataIter = mIncrementalPartition.getPartitionIndexArray(); const PxPinnedArray& partitionNodeData = mIncrementalPartition.getPartitionNodeArray(); const PxPinnedArray& solverConstantData = mIncrementalPartition.getSolverConstants(); const PxInt32ArrayPinned& partitionStartBatchIndexIter = mIncrementalPartition.getStartSlabPerPartition(); const PxInt32ArrayPinned& partitionArticStartBatchIndexIter = mIncrementalPartition.getArticStartSlabPerPartition(); const PxInt32ArrayPinned& partitionJointBatchCountIter = mIncrementalPartition.getNbJointsPerPartition(); const PxInt32ArrayPinned& partitionArtiJointBatchCountIter = mIncrementalPartition.getNbArticJointsPerPartition(); const PxArray& npIndexArrayIter = mIncrementalPartition.getNpIndexArray(); PxInt32ArrayPinned& npIndexArrayStagingBuffer = mNodeIndicesStagingBuffer; PxInt32ArrayPinned& islandIds = mIslandIds; 
PxInt32ArrayPinned& islandStaticTouchCounts = mIslandStaticTouchCounts; const PxU32 nbConstraints = islandSim.getNbActiveEdges(IG::Edge::eCONSTRAINT); // At this point we are ready to allocate the pinned memory for the solver. allocateTempPinnedSolverMemoryCommon(); const PxU32 bodyCount = mBodyCount; const PxU32 kinematicCount = mKinematicCount; const PxU32 articulationCount = mArticulationCount; //Force all bodies into a single island. The GPU partitioning provides better work balancing between blocks than just using multiple islands. PxgIslandContext& context = mIslandContextPool[0]; context.mBodyStartIndex = 1 + kinematicCount; context.mBodyCount = bodyCount; context.mArticulationCount = articulationCount; context.mNumPositionIterations = context.mNumVelocityIterations = 0; mNumIslandContextPool = 1; //because updateIncrementalIslands add/remove joints based on activation getSimulationController()->updateJointsAndSyncData(); //reset number of frozen/unfrozen shapes to be zero mSimulationController->clear(); PxgJointManager& jointManager = getSimulationController()->getJointManager(); PX_ASSERT((jointManager.getCpuNbRigidConstraints() + jointManager.getCpuNbArtiConstraints() + jointManager.getGpuNbActiveRigidConstraints() + jointManager.getGpuNbActiveArtiConstraints()) == nbConstraints); PX_UNUSED(jointManager); const PxU32 nbPatches = mIncrementalPartition.getTotalContacts(); // PT: same as what mIncrementalPartition.updateIncrementalIslands() returned #if PX_ENABLE_ASSERTS PxU32 accumulatedConstraints = mIncrementalPartition.getAccumulatedConstraintCount().size() == 0 ? 0 : mIncrementalPartition.getAccumulatedConstraintCount()[mIncrementalPartition.getAccumulatedConstraintCount().size() - 1]; PxU32 accumulatedArtiConstraints = mIncrementalPartition.getAccumulatedArtiConstraintCount().size() == 0 ? 
0 : mIncrementalPartition.getAccumulatedArtiConstraintCount()[mIncrementalPartition.getAccumulatedArtiConstraintCount().size() - 1]; PX_ASSERT((nbPatches + islandSim.getNbActiveEdges(IG::Edge::eCONSTRAINT) + mIncrementalPartition.getTotalArticulationContacts()) == (accumulatedConstraints + accumulatedArtiConstraints + getSimulationController()->getBodySimManager().mTotalStaticArticJoints + getSimulationController()->getBodySimManager().mTotalSelfArticJoints + getSimulationController()->getBodySimManager().mTotalStaticRBJoints)); #endif { PX_PROFILE_ZONE("Dynamics.allocateConstraintBuffers", 0); //set the constraint batches number but we will do the actual memory allocation in doPartitionTask() method and free the excess amout in doConstraintPrePrepCommon(), so that //we can make sure mConstraintBatches is the last element allocated in the pinned memory allocator, therefore, we can shrunk the excess memory safely //mNumConstraintBatches = sentinel->constraints + sentinel->contactManagers; PxgBodySimManager& bodyManager = getSimulationController()->getBodySimManager(); mNumContactManagers = nbPatches + bodyManager.mTotalStaticRBContacts; mNum1DConstraints = nbConstraints + bodyManager.mTotalStaticRBJoints; mThresholdStream->forceSize_Unsafe(0); mThresholdStream->reserve(PxNextPowerOfTwo(mNumContactManagers)); mForceChangedThresholdStream->forceSize_Unsafe(0); mForceChangedThresholdStream->reserve(PxNextPowerOfTwo(mNumContactManagers)); //Set up constraint batches //If there is no work to do then we can do nothing at all. // AD: this only works because we have the same if when setting up the task chain. // it's also in a somewhat weird place. We should analyze the dependencies, is all of the work we're doing up to here actually // required to happen even if we early-out here? 
if (!needsSolve(islandSim, bodyCount, articulationCount)) { mGpuSolverCore->releaseContext(); return; } //printf("NbarticBatches = %i, NbRigidBatches = %i\n", mIncrementalPartition.mNbArtiContactBatches, mIncrementalPartition.mNbContactBatches); } PxU32 descCount = 0; PxU32 currentDescIndex = 0; mGpuSolverCore->resetMemoryAllocator(); PxU32 totalEdges = mIslandManager.getNbEdgeHandles(); mTotalPreviousEdges = mTotalEdges; mTotalEdges = totalEdges; mGpuSolverCore->allocateFrictionPatchIndexStream(totalEdges * maxPatchesPerCM); //How many batches mGpuSolverCore->allocateFrictionCounts(totalEdges); currentDescIndex = mIncrementalPartition.getTotalConstraints() + mIncrementalPartition.getTotalContacts(); context.mDescCount = currentDescIndex; context.mDescStartIndex = descCount; descCount += currentDescIndex; lostTouchTask->addReference(); mLostTouchTask = lostTouchTask; npIndexArrayStagingBuffer.forceSize_Unsafe(0); npIndexArrayStagingBuffer.reserve(npIndexArrayIter.size()); npIndexArrayStagingBuffer.forceSize_Unsafe(npIndexArrayIter.size()); islandIds.forceSize_Unsafe(0); islandIds.reserve(islandSim.getNbNodes()); islandIds.forceSize_Unsafe(islandSim.getNbNodes()); islandStaticTouchCounts.forceSize_Unsafe(0); islandStaticTouchCounts.reserve(islandSim.getNbIslands()); islandStaticTouchCounts.forceSize_Unsafe(islandSim.getNbIslands()); //npIndexArray might be changed in island gen while solver is running, so we need to double buffer it PxMemCopy(npIndexArrayStagingBuffer.begin(), npIndexArrayIter.begin(), sizeof(PxU32) * npIndexArrayIter.size()); PxMemCopy(islandIds.begin(), islandSim.getIslandIds(), sizeof(PxU32) * islandSim.getNbNodes()); PxMemCopy(islandStaticTouchCounts.begin(), islandSim.getIslandStaticTouchCount(), sizeof(PxU32) * islandSim.getNbIslands()); const PxInt32ArrayPinned& nodeInteractions = mIncrementalPartition.getNodeInteractionCountArray(); mGpuSolverCore->gpuMemDMAUpContactData(mContactStreamAllocators[mCurrentContactStream], 
PxToU32(mContactStreamPool.mSharedDataIndex), mContactStreamPool.mSharedDataIndexGPU, mPatchStreamAllocators[mCurrentContactStream], PxToU32(mPatchStreamPool.mSharedDataIndex), mPatchStreamPool.mSharedDataIndexGPU, mNumContactManagers, partitionIndexDataIter.begin(), partitionNodeData.begin(), solverConstantData.begin(), solverConstantData.size(), partitionIndexDataIter.size(), partitionStartBatchIndexIter.begin(), partitionArticStartBatchIndexIter.begin(), partitionJointBatchCountIter.begin(), partitionArtiJointBatchCountIter.begin(), partitionStartBatchIndexIter.size(), mIncrementalPartition.getDestroyedContactEdgeIndices().begin(), mIncrementalPartition.getDestroyedContactEdgeIndices().size(), npIndexArrayStagingBuffer.begin(), npIndexArrayStagingBuffer.size(), /*jointManager.mGpuJointData, jointManager.mGpuJointPrePrep, gpuJointSize,*/ mConstraintWriteBackPool.size(), islandIds.begin(), nodeInteractions.begin(), islandIds.size(), islandStaticTouchCounts.begin(), islandStaticTouchCounts.size()); mGpuSolverCore->releaseContext(); mGpuTask.setMaxNodesAndWordCounts(mIslandManager.getNbNodeHandles(), changedHandleMap); //Now we have kicked off all the atom integration and pre-prep work, so we can permit the remaining phases of the solver to run... //mPostSolveTask.removeReference(); mGpuTask.removeReference(); mGpuPrePrepTask.removeReference(); mPrepTask.removeReference(); } }