// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "foundation/PxAllocator.h"
|
|
#include "foundation/PxTime.h"
|
|
#include "foundation/PxMemory.h"
|
|
#include "foundation/PxSort.h"
|
|
|
|
#include "common/PxProfileZone.h"
|
|
#include "PxvSimStats.h"
|
|
|
|
#include "PxgCudaBroadPhaseSap.h"
|
|
|
|
#include "PxgBroadPhaseKernelIndices.h"
|
|
#include "PxgIntegerAABB.h"
|
|
#include "PxgBroadPhasePairReport.h"
|
|
#include "BpBroadPhaseUpdate.h"
|
|
#include "PxgSapBox1D.h"
|
|
#include "PxgRadixSortDesc.h"
|
|
#include "PxgCudaMemoryAllocator.h"
|
|
#include "PxgKernelWrangler.h"
|
|
#include "PxgKernelIndices.h"
|
|
#include "PxSceneDesc.h"
|
|
#include "PxgCudaUtils.h"
|
|
#include "PxgRadixSortKernelIndices.h"
|
|
#include "PxgAABBManager.h"
|
|
|
|
#include "PxgContext.h"
|
|
#include "PxgSimulationCore.h"
|
|
|
|
#include "CudaKernelWrangler.h"
|
|
#include "cudamanager/PxCudaContext.h"
|
|
|
|
#include "cudamanager/PxCudaContextManager.h"
|
|
|
|
#include "PxgKernelLauncher.h"
|
|
|
|
// PT: TODO:
|
|
// - most of these functions don't need to be member functions
|
|
|
|
#define GPU_BP_DEBUG 0
|
|
#define USE_NEW_LAUNCH_FUNCTION 1
|
|
|
|
#if GPU_BP_DEBUG
|
|
#define GPU_DEBUG_STREAM(s, x) \
|
|
{ \
|
|
const CUresult err = mCudaContext->streamSynchronize(s); \
|
|
if(err != CUDA_SUCCESS) \
|
|
outputError<PxErrorCode::eINTERNAL_ERROR>(__LINE__, x); \
|
|
}
|
|
#else
|
|
#define GPU_DEBUG_STREAM(s, x)
|
|
#endif
|
|
|
|
#define PROLOG mGpuKernelWranglerManager->mKernelWrangler, mCudaContext
|
|
#if USE_NEW_LAUNCH_FUNCTION
|
|
#define KERNEL_PARAM_TYPE void*
|
|
#define CUDA_KERNEL_PARAM PX_CUDA_KERNEL_PARAM2
|
|
#define EPILOG mStream, kernelParams, PX_FL
|
|
#else
|
|
#define KERNEL_PARAM_TYPE PxCudaKernelParam
|
|
#define CUDA_KERNEL_PARAM PX_CUDA_KERNEL_PARAM
|
|
#define EPILOG mStream, kernelParams, sizeof(kernelParams), PX_FL
|
|
#endif
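
// A minimal sketch (not from the original file) of what a launch site expands to
// when USE_NEW_LAUNCH_FUNCTION == 1, with hypothetical grid/block values:
//
//	KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(bpDescd) };
//	_launch<GPU_BP_DEBUG>(mGpuKernelWranglerManager->mKernelWrangler, mCudaContext,	// PROLOG
//		PxgKernelIds::BP_COPY_REPORTS, gridX, 1, 1, blockX, 1, 1, 0,
//		mStream, kernelParams, PX_FL);												// EPILOG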

using namespace physx;

PX_IMPLEMENT_OUTPUT_ERROR

PxgCudaBroadPhaseSap::PxgCudaBroadPhaseSap(const PxGpuBroadPhaseDesc& desc, PxgCudaKernelWranglerManager* gpuKernelWrangler, PxCudaContextManager* cudaContextManager, const PxGpuDynamicsMemoryConfig& init, PxgHeapMemoryAllocatorManager* heapMemoryManager, PxU64 contextID) :
	Bp::BroadPhase (),

	mContextID (contextID),

	mDesc (desc),

	mNumOfBoxes (0),
	mUpdateData_CreatedHandleSize (0),
	mUpdateData_RemovedHandleSize (0),
#ifdef SUPPORT_UPDATE_HANDLES_ARRAY_FOR_GPU
	mUpdateData_UpdatedHandleSize (0),
#endif
	mUpdateData_BoxesCapacity (0),

	mGpuKernelWranglerManager (gpuKernelWrangler),
	mCudaContextManager (cudaContextManager),
	mCudaContext (cudaContextManager->getCudaContext()),
	mHeapMemoryManager (heapMemoryManager),
	mCreatedHandlesBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mRemovedHandlesBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
#ifdef SUPPORT_UPDATE_HANDLES_ARRAY_FOR_GPU
	// PT: looks like this stuff used to be here but got removed for some reason!
	mUpdatedHandlesBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
#endif
	mBoxFpBoundsBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mBoxContactDistancesBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mBoxGroupsBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mBoxEnvIDsBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mNewIntegerBoundsBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mOldIntegerBoundsBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mBoxPtProjectionsBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mBoxProjectionRanksBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mBoxPtHandlesBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mTempBoxPtProjectionBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mTempBoxPtHandlesBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mRadixCountBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mBoxSapBox1DBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mNewBoxSapBox1DBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mEndPtHistogramBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mBlockEndPtHistogramBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mEndPtHandleBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mStartPtHistogramBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mBlockStartPtHistogramBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mStartPtHandleBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mTotalEndPtHistogramBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mBlockTotalEndPtHistogramBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mActiveRegionTotalBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mStartRegionsTotalBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mOrderedActiveRegionHandlesTotalBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mOrderedStartRegionHandlesTotalBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mOverlapChecksRegionBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mBlockOverlapChecksRegionBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mOverlapChecksHandleRegionBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mIncrementalComparisons (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mIncrementalBlockComparisons (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mAggregateReportBlockBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mActorReportBlockBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mRegionRangeBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mStartRegionAccumBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mBlockStartRegionAccumBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mRegionAccumBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mBlockRegionAccumBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mFoundPairsBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mLostPairsBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mFoundAggregateBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mLostAggregateBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mFoundActorBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mLostActorBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mBPDescBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mRadixSortDescBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mRadixSortWORDescBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
	mPinnedEvent (NULL),
	mBpDesc (NULL),
	mRSDesc (NULL),
	mRSDescWOR (NULL),
	mFoundActorPairs (PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators, PxsHeapStats::eBROADPHASE)),
	mLostActorPairs (PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators, PxsHeapStats::eBROADPHASE)),
	mMaxFoundLostPairs (init.foundLostPairsCapacity),
	mMaxAggFoundLostPairs (init.foundLostAggregatePairsCapacity),
	mAABBManager (NULL),
#if PX_ENABLE_SIM_STATS
	mFoundLostPairsStats (0),
#else
	PX_CATCH_UNDEFINED_ENABLE_SIM_STATS
#endif
	mForceUpdate (true)
{
	PxScopedCudaLock _lock_(*mCudaContextManager);

	for(PxU32 i = 0; i < 3; ++i)
		mRadixCountBuf[i].allocate(sizeof(PxU32) * PxgRadixSortKernelGridDim::RADIX_SORT * 16, PX_FL);
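
	// 16 counters per radix-sort block: runRadixSort() below works in 4-bit
	// passes, so each pass bins keys into 2^4 = 16 buckets.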

	mBPDescBuf.allocate(sizeof(PxgBroadPhaseDesc), PX_FL);

	mRadixSortDescBuf.allocate(sizeof(PxgRadixSortDesc)*6, PX_FL);
	mRadixSortWORDescBuf.allocate(sizeof(PxgRadixSortDesc)*6, PX_FL);

	mBpDesc = reinterpret_cast<PxgBroadPhaseDesc*>(mHeapMemoryManager->mMappedMemoryAllocators->allocate(sizeof(PxgBroadPhaseDesc), PxsHeapStats::eBROADPHASE, PX_FL));
	mRSDesc = reinterpret_cast<PxgRadixSortDesc*>(mHeapMemoryManager->mMappedMemoryAllocators->allocate(sizeof(PxgRadixSortDesc) * 6, PxsHeapStats::eBROADPHASE, PX_FL));
	mRSDescWOR = reinterpret_cast<PxgRadixSortDesc*>(mHeapMemoryManager->mMappedMemoryAllocators->allocate(sizeof(PxgRadixSortDesc) * 6, PxsHeapStats::eBROADPHASE, PX_FL));

	mRegionAccumTotal = 0;
	mOverlapChecksTotalRegion = 0;
	mStartRegionAccumTotal = 0;

	mFoundPairsBuf.allocate(mMaxFoundLostPairs * sizeof(PxgBroadPhasePair), PX_FL);
	mLostPairsBuf.allocate(mMaxFoundLostPairs * sizeof(PxgBroadPhasePair), PX_FL);

	mFoundAggregateBuf.allocate(mMaxAggFoundLostPairs * sizeof(PxgBroadPhasePair), PX_FL);
	mLostAggregateBuf.allocate(mMaxAggFoundLostPairs * sizeof(PxgBroadPhasePair), PX_FL);

	mFoundActorBuf.allocate(mMaxFoundLostPairs * sizeof(PxgBroadPhasePair), PX_FL);
	mLostActorBuf.allocate(mMaxFoundLostPairs * sizeof(PxgBroadPhasePair), PX_FL);

	mFoundActorPairs.forceSize_Unsafe(0);
	mFoundActorPairs.reserve(mMaxFoundLostPairs);

	mLostActorPairs.forceSize_Unsafe(0);
	mLostActorPairs.reserve(mMaxFoundLostPairs);

	createGpuStreamsAndEvents();
}

PxgCudaBroadPhaseSap::~PxgCudaBroadPhaseSap()
{
	PxScopedCudaLock _lock_(*mCudaContextManager);

	mHeapMemoryManager->mMappedMemoryAllocators->deallocate(mBpDesc);
	mHeapMemoryManager->mMappedMemoryAllocators->deallocate(mRSDesc);
	mHeapMemoryManager->mMappedMemoryAllocators->deallocate(mRSDescWOR);

	releaseGpuStreamsAndEvents();
}

void PxgCudaBroadPhaseSap::release()
{
	this->~PxgCudaBroadPhaseSap();
	PX_FREE_THIS;
}

void PxgCudaBroadPhaseSap::createGpuStreamsAndEvents()
{
	int leastPriority, mostPriority;
	cuCtxGetStreamPriorityRange(&leastPriority, &mostPriority);

	CUresult result = mCudaContext->streamCreateWithPriority(&mStream, CU_STREAM_NON_BLOCKING, mostPriority);

	if (result != CUDA_SUCCESS)
		outputError<PxErrorCode::eINTERNAL_ERROR>(__LINE__, "GPU Create Stream 0 fail!!\n");

	result = mCudaContext->eventCreate(&mEvent, CU_EVENT_DISABLE_TIMING);

	mPinnedEvent = PX_PINNED_MEMORY_ALLOC(PxU32, *mCudaContextManager, 1);

	if (result != CUDA_SUCCESS)
		outputError<PxErrorCode::eINTERNAL_ERROR>(__LINE__, "GPU Create Event 0 fail!!\n");
}

void PxgCudaBroadPhaseSap::releaseGpuStreamsAndEvents()
{
	//destroy stream
	mCudaContext->streamDestroy(mStream);
	mStream = NULL;

	PX_PINNED_MEMORY_FREE(*mCudaContextManager, mPinnedEvent);

	//destroy event
	mCudaContext->eventDestroy(mEvent);
	mEvent = NULL;
}

void PxgCudaBroadPhaseSap::gpuDMAUp(const Bp::BroadPhaseUpdateData& updateData, PxgBroadPhaseDesc& bpDesc, PxgRadixSortDesc* rsDescs)
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.gpuDMAUp", mContextID);

	//mCudaContext->memcpyHtoDAsync(constraintsPerPartitiond, constraintsPerPartitionIter.begin(), sizeof(PxU32) * numConstraintsPerPartition, mStream);

	mUpdateData_RemovedHandleSize = updateData.getNumRemovedHandles();
	mUpdateData_CreatedHandleSize = updateData.getNumCreatedHandles();
#ifdef SUPPORT_UPDATE_HANDLES_ARRAY_FOR_GPU
	mUpdateData_UpdatedHandleSize = updateData.getNumUpdatedHandles();
#endif

	//mContactDistances = updateData.getContactDistance();
	//mBoxBoundsMinMax = updateData.getAABBs();
	//mBoxGroups = updateData.getGroups();

	mUpdateData_BoxesCapacity = updateData.getCapacity();

	mNumOfBoxes = mNumOfBoxes + mUpdateData_CreatedHandleSize - mUpdateData_RemovedHandleSize;

	//We need to add on removedHandleSize because these handles are temporarily also in the projection buffer
	const PxU32 nbProjections = (mNumOfBoxes + mUpdateData_RemovedHandleSize) * 2;
	const PxU32 paddedProjections = (nbProjections + 3)&(~3);
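
	// e.g. nbProjections = 10 -> paddedProjections = 12, while a value that is
	// already a multiple of 4 (e.g. 8) is left unchanged; the padding exists
	// because the radix sort kernels read 4 elements per thread.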

	mOldIntegerBoundsBuf.allocateCopyOldDataAsync(mUpdateData_BoxesCapacity * sizeof(PxgIntegerAABB), mCudaContext, mStream, PX_FL);

	//we need to allocate enough memory (x4) for the radix sort because each thread reads 4 elements

	for (PxU32 i = 0; i < 3; ++i)
	{
		mBoxSapBox1DBuf[i].allocateCopyOldDataAsync(mUpdateData_BoxesCapacity * sizeof(PxgSapBox1D), mCudaContext, mStream, PX_FL);
		mNewBoxSapBox1DBuf[i].allocate(mUpdateData_BoxesCapacity * sizeof(PxgSapBox1D), PX_FL);

		mBoxPtProjectionsBuf[i].allocateCopyOldDataAsync(paddedProjections * sizeof(int), mCudaContext, mStream, PX_FL);

		mBoxProjectionRanksBuf[i].allocate(paddedProjections * sizeof(int), PX_FL);
		mTempBoxPtProjectionBuf[i].allocate(paddedProjections * sizeof(int), PX_FL);
		mTempBoxPtHandlesBuf[i].allocate(paddedProjections * sizeof(int), PX_FL);

		for (PxU32 j = 0; j < 2; ++j)
		{
			//mEndPtHistogramBuf[j][i].allocateCopyOldDataAsync(nbProjections * sizeof(int), mStreams.begin(), mHeapMemoryManager);
			//mBlockEndPtHistogramBuf[j][i].allocateCopyOldDataAsync(PxgBPKernelGridDim::BP_OUTPUT_ENDPT_HISTOGRAM * sizeof(int), mStreams.begin(), mHeapMemoryManager); // 32 block
			//mEndPtHandleBuf[j][i].allocateCopyOldDataAsync(mNumOfBoxes*sizeof(int), mStreams.begin(), mHeapMemoryManager);

			//mStartPtHistogramBuf[j][i].allocateCopyOldDataAsync(nbProjections * sizeof(int), mStreams.begin(), mHeapMemoryManager);
			//mBlockStartPtHistogramBuf[j][i].allocateCopyOldDataAsync(PxgBPKernelGridDim::BP_OUTPUT_ENDPT_HISTOGRAM * sizeof(int), mStreams.begin(), mHeapMemoryManager); // 32 block
			//mStartPtHandleBuf[j][i].allocateCopyOldDataAsync(mNumOfBoxes*sizeof(int), mStreams.begin(), mHeapMemoryManager);

			const PxU32 index = j * 3 + i;

			mBoxPtHandlesBuf[index].allocateCopyOldDataAsync(paddedProjections * sizeof(int), mCudaContext, mStream, PX_FL);

			mEndPtHistogramBuf[index].allocateCopyOldDataAsync(nbProjections * sizeof(int), mCudaContext, mStream, PX_FL);
			mBlockEndPtHistogramBuf[index].allocateCopyOldDataAsync(PxgBPKernelGridDim::BP_OUTPUT_ENDPT_HISTOGRAM * sizeof(int), mCudaContext, mStream, PX_FL); // 32 blocks
			mEndPtHandleBuf[index].allocateCopyOldDataAsync(mNumOfBoxes*sizeof(int), mCudaContext, mStream, PX_FL);

			mStartPtHistogramBuf[index].allocateCopyOldDataAsync(nbProjections * sizeof(int), mCudaContext, mStream, PX_FL);
			mBlockStartPtHistogramBuf[index].allocateCopyOldDataAsync(PxgBPKernelGridDim::BP_OUTPUT_ENDPT_HISTOGRAM * sizeof(int), mCudaContext, mStream, PX_FL); // 32 blocks
			mStartPtHandleBuf[index].allocateCopyOldDataAsync(mNumOfBoxes*sizeof(int), mCudaContext, mStream, PX_FL);
		}

		mIncrementalComparisons[i].allocate(nbProjections* sizeof(int), PX_FL);
		mIncrementalBlockComparisons[i].allocate(PxgBPKernelGridDim::BP_COMPUTE_INCREMENTAL_CMP_COUNTS1 * sizeof(int), PX_FL);

		mTotalEndPtHistogramBuf[i].allocate(nbProjections*sizeof(PxU32), PX_FL);
		mBlockTotalEndPtHistogramBuf[i].allocate(PxgBPKernelGridDim::BP_OUTPUT_ENDPT_HISTOGRAM*sizeof(PxU32), PX_FL);
	}

	for (PxU32 i = 0; i < 2; ++i)
	{
		mAggregateReportBlockBuf[i].allocate(32 * sizeof(PxU32), PX_FL);
		mActorReportBlockBuf[i].allocate(32 * sizeof(PxU32), PX_FL);
	}

	//each thread reads 4 elements so we need to allocate enough memory for it
	const PxU32 totalNbProjectionRegions = (nbProjections * 64 + 3)&(~3);

	mActiveRegionTotalBuf.allocate(totalNbProjectionRegions * sizeof(int), PX_FL);
	mStartRegionsTotalBuf.allocate(totalNbProjectionRegions * sizeof(int), PX_FL);
	mOrderedActiveRegionHandlesTotalBuf.allocate(totalNbProjectionRegions * sizeof(int), PX_FL);
	mOrderedStartRegionHandlesTotalBuf.allocate(totalNbProjectionRegions * sizeof(int), PX_FL);

	mOverlapChecksRegionBuf.allocate(64 * mNumOfBoxes * sizeof(regionOverlapType), PX_FL);
	mBlockOverlapChecksRegionBuf.allocate(PxgBPKernelGridDim::BP_OUTPUT_OVERLAPCHECKS_HISTOGRAM * sizeof(regionOverlapType), PX_FL);
	mOverlapChecksHandleRegionBuf.allocate(64 * mNumOfBoxes * sizeof(PxgHandleRegion), PX_FL);
	mRegionRangeBuf.allocate(mUpdateData_BoxesCapacity * sizeof(PxgIntegerRegion), PX_FL);
	mStartRegionAccumBuf.allocate(nbProjections * sizeof(int), PX_FL);
	mBlockStartRegionAccumBuf.allocate(PxgBPKernelGridDim::BP_OUTPUT_START_REGION_HISTOGRAM * sizeof(int), PX_FL);

	mRegionAccumBuf.allocate(nbProjections * sizeof(int), PX_FL);
	mBlockRegionAccumBuf.allocate(PxgBPKernelGridDim::BP_OUTPUT_REGION_HISTOGRAM * sizeof(int), PX_FL);

	//allocate enough memory for GPU. All this data is input from the AABB manager this frame
	mCreatedHandlesBuf.allocate(mUpdateData_CreatedHandleSize * sizeof(PxU32), PX_FL);
	mRemovedHandlesBuf.allocate(mUpdateData_RemovedHandleSize * sizeof(PxU32), PX_FL);
#ifdef SUPPORT_UPDATE_HANDLES_ARRAY_FOR_GPU
	mUpdatedHandlesBuf.allocate(mUpdateData_UpdatedHandleSize * sizeof(PxU32), PX_FL);
#endif

	mNewIntegerBoundsBuf.allocate(mUpdateData_BoxesCapacity * sizeof(PxgIntegerAABB), PX_FL);

	//mBoxFpBoundsBuf and mBoxContactDistancesBuf need to be allocated before the particle updateBound kernel,
	//so we moved the allocation to gpuDmaUpSharedData. Particle systems don't need mBoxGroupsBuf; however,
	//since we DMA those three buffers up based on the state-changed flag, it makes sense to group them together.
	//mBoxFpBoundsBuf.allocate(mBoxesCapacity * sizeof(PxBounds3));
	//mBoxContactDistancesBuf.allocate(mBoxesCapacity * sizeof(PxReal));
	//mBoxGroupsBuf.allocate(mBoxesCapacity * sizeof(PxU32));

	updateDescriptor(bpDesc);
	updateRadixSortDesc(rsDescs);

	//DMA the update data to the GPU
	mCudaContext->memcpyHtoDAsync(mCreatedHandlesBuf.getDevicePtr(), updateData.getCreatedHandles(), sizeof(int) * mUpdateData_CreatedHandleSize, mStream);
	mCudaContext->memcpyHtoDAsync(mRemovedHandlesBuf.getDevicePtr(), updateData.getRemovedHandles(), sizeof(int) * mUpdateData_RemovedHandleSize, mStream);
#ifdef SUPPORT_UPDATE_HANDLES_ARRAY_FOR_GPU
	mCudaContext->memcpyHtoDAsync(mUpdatedHandlesBuf.getDevicePtr(), updateData.getUpdatedHandles(), sizeof(int) * mUpdateData_UpdatedHandleSize, mStream);
#endif

	/*if(updateData.getStateChanged())
	{
		mCudaContext->memcpyHtoDAsync(mBoxContactDistancesBuf.getDevicePtr(), mContactDistances, sizeof(PxReal)* mBoxesCapacity, mStream);
		mCudaContext->memcpyHtoDAsync(mBoxGroupsBuf.getDevicePtr(), mBoxGroups, sizeof(PxU32)* mBoxesCapacity, mStream);
		mCudaContext->memcpyHtoDAsync(mBoxFpBoundsBuf.getDevicePtr(), mBoxBoundsMinMax, sizeof(PxBounds3)* mBoxesCapacity, mStream);
	}*/

	mCudaContext->memcpyHtoDAsync(mBPDescBuf.getDevicePtr(), (void*)&bpDesc, sizeof(PxgBroadPhaseDesc), mStream);

	mCudaContext->memcpyHtoDAsync(mRadixSortDescBuf.getDevicePtr(), rsDescs, sizeof(PxgRadixSortDesc)*6, mStream);
	mCudaContext->memcpyHtoDAsync(mRadixSortWORDescBuf.getDevicePtr(), mRSDescWOR, sizeof(PxgRadixSortDesc) * 6, mStream);
	/*PxCudaStreamFlush(mStreams.begin());*/

#if GPU_BP_DEBUG
	GPU_DEBUG_STREAM(mStream, "GPU radix sort fail!!\n")

	mCudaContext->memcpyDtoH((void*)&bpDesc, mBPDescBuf.getDevicePtr(), sizeof(PxgBroadPhaseDesc));
#endif
}

void PxgCudaBroadPhaseSap::freeBuffers()
{
	mLostActorPairs.forceSize_Unsafe(0);
	mFoundActorPairs.forceSize_Unsafe(0);
}

void PxgCudaBroadPhaseSap::runCopyResultsKernel(PxgBroadPhaseDesc& /*desc*/)
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.runCopyResultsKernel", mContextID);

	CUdeviceptr bpBuff = mBPDescBuf.getDevicePtr();
	{
		KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(bpBuff) };
		_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_ACCUMULATE_REPORT_STAGE_1, PxgBPKernelGridDim::BP_COMPUTE_INCREMENTAL_CMP_COUNTS1, 4, 1, PxgBPKernelBlockDim::BP_COMPUTE_INCREMENTAL_CMP_COUNTS1, 1, 1, 0, EPILOG);
		_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_ACCUMULATE_REPORT_STAGE_2, PxgBPKernelGridDim::BP_COMPUTE_INCREMENTAL_CMP_COUNTS2, 4, 1, PxgBPKernelBlockDim::BP_COMPUTE_INCREMENTAL_CMP_COUNTS2, 1, 1, 0, EPILOG);

#if GPU_BP_DEBUG
		/*mCudaContext->memcpyDtoHAsync((void*)&desc, mBPDescBuf.getDevicePtr(), sizeof(PxgBroadPhaseDesc), mStream);
		resultR = mCudaContext->streamSynchronize(mStream);*/
#endif
	}

	{
		KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(bpBuff) };
		_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_COPY_REPORTS, PxgBPKernelGridDim::BP_COPY_REPORTS, 1, 1, PxgBPKernelBlockDim::BP_COPY_REPORTS, 1, 1, 0, EPILOG);
	}
}

void PxgCudaBroadPhaseSap::gpuDMABack(const PxgBroadPhaseDesc& desc)
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.gpuDMABack", mContextID);

	//mCudaContext->eventRecord(mEvent, mStream);

	//PxCudaStreamFlush(mStreams.begin()); //KS - dispatch work!

	{
		CUdeviceptr bpBuff = mBPDescBuf.getDevicePtr();

		mCudaContext->memcpyDtoHAsync((void*)&desc, bpBuff, sizeof(PxgBroadPhaseDesc), mStream);
		//resultR = mCudaContext->streamSynchronize(mStream);

		void* devicePtr = getMappedDevicePtr(mCudaContext, mPinnedEvent);
		KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(devicePtr) };

		_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_SIGNAL_COMPLETE, 1, 1, 1, 1, 1, 1, 0, EPILOG);

		mCudaContext->streamFlush(mStream);
	}

	{
		PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.Synchronize", mContextID);
		//mCudaContext->streamSynchronize(mStream);
		volatile PxU32* eventPtr = mPinnedEvent;
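
		// The BP_SIGNAL_COMPLETE kernel launched above writes to this pinned
		// host flag once all prior work on mStream has executed; spinWait polls
		// it (0.1 is presumably a timeout) and we fall back to a full stream
		// synchronize if the flag never flips.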
		if (!spinWait(*eventPtr, 0.1f))
			mCudaContext->streamSynchronize(mStream);
	}

	mOverlapChecksTotalRegion = desc.overlapChecksTotalRegion;
	mStartRegionAccumTotal = desc.startRegionAccumTotal;
	mRegionAccumTotal = desc.regionAccumTotal;

	// AD: some explanation about the counts here - just to reiterate:
	//
	// internally we have two lists, found pairs and lost pairs. Both contain actor-actor, actor-aggregate and
	// aggregate-aggregate pairs. In the "report" phase, these two lists are built such that all the pairs involving
	// aggregates come first, and the actor pairs follow.
	//
	// desc.sharedFound/LostPairIndex is the total size of each list.
	// desc.sharedFound/LostAggPairIndex is the number of aggregate pairs in each list.
	//
	// but it gets more complicated. These counts can overflow, but internally we only write up to the
	// mMaxFoundLostPairs value to make sure we're not going out of bounds. So the total value in the descriptor
	// is only really useful if we are below the max, otherwise we need to correct it.
	//
	// so in the end, the final number of pairs is PxMin(mMaxFoundLostPairs, desc.sharedFoundPairIndex) - desc.sharedFoundAggPairIndex.
	// This works because the aggregate index is always smaller than the max index.
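	//
	// A hypothetical example (numbers not from the source): with mMaxFoundLostPairs = 1000,
	// desc.sharedFoundPairIndex = 1200 (overflowed) and desc.sharedFoundAggPairIndex = 300,
	// the number of actor-actor pairs reported below is PxMin(1000, 1200) - 300 = 700.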

	PX_ASSERT(desc.sharedFoundPairIndex >= desc.sharedFoundAggPairIndex);
	PX_ASSERT(desc.sharedLostPairIndex >= desc.sharedLostAggPairIndex);

	PxU32 foundLostPairsNeeded = PxMax(desc.sharedFoundPairIndex, desc.sharedLostPairIndex);
#if PX_ENABLE_SIM_STATS
	mFoundLostPairsStats = PxMax(mFoundLostPairsStats, foundLostPairsNeeded);
#else
	PX_CATCH_UNDEFINED_ENABLE_SIM_STATS
#endif

	if (desc.found_lost_pairs_overflow_flags)
	{
		PxGetFoundation().error(PxErrorCode::eINVALID_PARAMETER, PX_FL,
			"The application needs to increase PxGpuDynamicsMemoryConfig::foundLostPairsCapacity to %i, otherwise the simulation will miss interactions\n", foundLostPairsNeeded);
	}

	mFoundActorPairs.forceSize_Unsafe(PxMin(mMaxFoundLostPairs, desc.sharedFoundPairIndex) - desc.sharedFoundAggPairIndex);
	mLostActorPairs.forceSize_Unsafe(PxMin(mMaxFoundLostPairs, desc.sharedLostPairIndex) - desc.sharedLostAggPairIndex);

	// AD: safety in case copyReports did not run due to abort mode
	if (mCudaContext->isInAbortMode())
	{
		mFoundActorPairs.forceSize_Unsafe(0);
		mLostActorPairs.forceSize_Unsafe(0);
	}
}

struct ReportMore
{
	bool operator()(const PxgBroadPhasePair& left, const PxgBroadPhasePair& right) const
	{
		return (left.mVolA > right.mVolA) ||
			   ((left.mVolA == right.mVolA) && (left.mVolB > right.mVolB));
	}
};
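
// e.g. ReportMore sorts pairs in descending lexicographic order on (mVolA, mVolB):
// (5,7) comes before (5,2), which comes before (3,9). Duplicates therefore end up
// adjacent, which is what purgeDuplicates() below relies on.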

/*bool hasDuplicates(PxPinnedArray<PxgBroadPhasePair>& iterator)
{
	for(PxU32 a = 1; a < iterator.size(); ++a)
	{
		PX_ASSERT(iterator[a].mVolA != iterator[a-1].mVolA ||
				  iterator[a].mVolB != iterator[a-1].mVolB);
	}
	return false;
}*/

void PxgCudaBroadPhaseSap::sortBuffer(PxgBroadPhasePair* PX_RESTRICT reportBuffer, const PxU32 size)
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap::sortBuffer", mContextID);

#if 1
	PxSort(reportBuffer, size, ReportMore());
#else
	const PxU32 SmallBufferLimit = 512;

	if (size < SmallBufferLimit)
		PxSort(reportBuffer, size, ReportMore());
	else
	{
		//Histogram sort...
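		// A sketch of the idea in this disabled branch: count how many pairs use
		// each handle, turn the histogram into scatter offsets with a reverse
		// running sum (reverse so larger handles land first, matching the
		// descending order used elsewhere), then scatter. Two stable counting
		// passes follow, first keyed on mVolA and then on mVolB.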
		mHistogramBuffer.forceSize_Unsafe(0);
		mHistogramBuffer.reserve(mBoxesCapacity);
		mHistogramBuffer.forceSize_Unsafe(mBoxesCapacity);

		PxMemZero(mHistogramBuffer.begin(), sizeof(PxU32) * mBoxesCapacity);

		mTempPairBuffer.forceSize_Unsafe(0);
		mTempPairBuffer.reserve(size);
		mTempPairBuffer.forceSize_Unsafe(size);

		for (PxU32 a = 0; a < size; ++a)
		{
			++mHistogramBuffer[reportBuffer[a].mVolA];
		}

		//Compute runsum
		PxU32 runsum = 0;
		PxU32 boxesCapacity = mBoxesCapacity;
		//for (PxU32 a = 0; a < mBoxesCapacity; ++a)
		while(boxesCapacity--)
		{
			PxU32 value = mHistogramBuffer[boxesCapacity];
			mHistogramBuffer[boxesCapacity] = runsum;
			runsum += value;
		}

		for (PxU32 a = 0; a < size; ++a)
		{
			PxU32 idx = mHistogramBuffer[reportBuffer[a].mVolA]++;
			mTempPairBuffer[idx] = reportBuffer[a];
		}

		PxMemZero(mHistogramBuffer.begin(), sizeof(PxU32) * mBoxesCapacity);

		for (PxU32 a = 0; a < size; ++a)
		{
			++mHistogramBuffer[reportBuffer[a].mVolB];
		}

		runsum = 0;
		boxesCapacity = mBoxesCapacity;
		//for (PxU32 a = 0; a < mBoxesCapacity; ++a)
		while(boxesCapacity--)
		{
			PxU32 value = mHistogramBuffer[boxesCapacity];
			mHistogramBuffer[boxesCapacity] = runsum;
			runsum += value;
		}

		for (PxU32 a = 0; a < size; ++a)
		{
			PxU32 idx = mHistogramBuffer[mTempPairBuffer[a].mVolB]++;
			reportBuffer[idx] = mTempPairBuffer[a];
		}
	}
#endif
}

void PxgCudaBroadPhaseSap::purgeDuplicates(PxPinnedArray<PxgBroadPhasePair>& pairs)
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.purgeDuplicates", mContextID);

	const PxU32 nbPairs = pairs.size();
	sortBuffer(pairs.begin(), nbPairs);
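
	// After sorting, identical pairs are adjacent; the loop below compacts the
	// array in place. A hypothetical example: {(4,2),(4,2),(3,1),(3,1),(1,0)}
	// compacts to {(4,2),(3,1),(1,0)} and the array is shrunk to 3 entries.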

	if (nbPairs)
	{
		PxU32 actor0 = pairs[0].mVolA;
		PxU32 actor1 = pairs[0].mVolB;
		PxU32 count = 1;
		for (PxU32 i = 1; i < nbPairs; i++)
		{
			PxgBroadPhasePair& report1 = pairs[i];

			PxU32 newActor0 = report1.mVolA;
			PxU32 newActor1 = report1.mVolB;

			if (newActor0 != actor0 || newActor1 != actor1)
			{
				if (count != i)
				{
					pairs[count].mVolA = newActor0;
					pairs[count].mVolB = newActor1;
				}
				actor0 = newActor0;
				actor1 = newActor1;
				count++;
			}
		}
		pairs.forceSize_Unsafe(count);
	}
}

void PxgCudaBroadPhaseSap::purgeDuplicateFoundPairs()
{
	purgeDuplicates(mFoundActorPairs);
}

void PxgCudaBroadPhaseSap::purgeDuplicateLostPairs()
{
	purgeDuplicates(mLostActorPairs);
}

void PxgCudaBroadPhaseSap::runRadixSort(const PxU32 numOfKeys, CUdeviceptr radixSortDescBuf)
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.runRadixSort", mContextID);

	PxU32 startBit = 0;
	const PxU32 numPass = 8;
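
	// 8 passes of 4 bits each cover the full 32-bit keys. descIndex alternates
	// between 0 (even passes) and 3 (odd passes), so each pass picks up the set
	// of three per-axis descriptors whose input buffers are the previous pass'
	// output buffers (see updateRadixSortDesc()).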

	for(PxU32 i=0; i<numPass; ++i)
	{
		const PxU32 descIndex = (i & 1)*3;

		CUdeviceptr rsDesc = radixSortDescBuf + descIndex*sizeof(PxgRadixSortDesc);

		KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(rsDesc), CUDA_KERNEL_PARAM(numOfKeys), CUDA_KERNEL_PARAM(startBit) };

		_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::RS_MULTIBLOCK_COUNT, PxgRadixSortKernelGridDim::RADIX_SORT, 3, 1, PxgRadixSortKernelBlockDim::RADIX_SORT, 1, 1, 0, EPILOG);
		_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::RS_CALCULATERANKS_MULTIBLOCK_COUNT, PxgRadixSortKernelGridDim::RADIX_SORT, 3, 1, PxgRadixSortKernelBlockDim::RADIX_SORT, 1, 1, 0, EPILOG);

		startBit+=4;
	}

	GPU_DEBUG_STREAM(mStream, "GPU radix sort fail!!\n")
}

void PxgCudaBroadPhaseSap::sortProjectionAndHandlesWRKernel(PxU32 previousNumOfBoxes)
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.sortProjectionAndHandlesWRKernel", mContextID);

	//PxU32 numHandles = mPreviousNumOfBoxes + mCreatedHandleSize;
	//const PxU32 numHandles = mPreviousNumOfBoxes;
	const PxU32 numHandles = previousNumOfBoxes;

	if(numHandles == 0)
		return;

	PxU32 nbProjections = numHandles*2;

	//we need to pad the number of projections to a multiple of 4
	nbProjections = (nbProjections + 3) & (~3);

	CUdeviceptr bpBuff = mBPDescBuf.getDevicePtr();
	KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(bpBuff) };

	_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_INITIALIZE_RANKS, PxgBPKernelGridDim::BP_INITIALIZE_RANKS, 1, 1, PxgBPKernelBlockDim::BP_INITIALIZE_RANKS, 1, 1, 0, EPILOG);

	runRadixSort(nbProjections, mRadixSortDescBuf.getDevicePtr());

	_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_UDPATE_HANDLES, PxgBPKernelGridDim::BP_UDPATE_HANDLES, 1, 1, PxgBPKernelBlockDim::BP_UDPATE_HANDLES, 1, 1, 0, EPILOG);
}

//sort projections and handles without ranks
void PxgCudaBroadPhaseSap::sortProjectionAndHandlesWORKernel(PxU32 previousNumOfBoxes)
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.sortProjectionAndHandlesWORKernel", mContextID);

	const PxU32 numHandles = previousNumOfBoxes + mUpdateData_CreatedHandleSize;
	//const PxU32 numHandles = mPreviousNumOfBoxes + mUpdateData_CreatedHandleSize;

	if(numHandles == 0)
		return;

	PxU32 nbProjections = numHandles*2;

	//we need to pad the number of projections to a multiple of 4
	nbProjections = (nbProjections + 3) & (~3);

	runRadixSort(nbProjections, mRadixSortWORDescBuf.getDevicePtr());
	GPU_DEBUG_STREAM(mStream, "GPU radix sort fail!!\n")
}

// PT:
// In:
//	bpDesc->boxHandles
// Out:
//	bpDesc->boxNewSapBox1D or bpDesc->boxSapBox1D
void PxgCudaBroadPhaseSap::initializeSapBoxKernel(const PxU32 numHandles, bool isNew)
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.initializeSapBoxKernel", mContextID);

	const PxU32 nbBlocks = ((numHandles*2) + PxgBPKernelBlockDim::BP_INITIALIZE_SAPBOX-1)/ PxgBPKernelBlockDim::BP_INITIALIZE_SAPBOX;
	if(nbBlocks)
	{
		CUdeviceptr bpDescd = mBPDescBuf.getDevicePtr();
		KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(bpDescd), CUDA_KERNEL_PARAM(numHandles), CUDA_KERNEL_PARAM(isNew) };

		_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_INITIALIZE_SAPBOX, nbBlocks, 1, 1, PxgBPKernelBlockDim::BP_INITIALIZE_SAPBOX, 1, 1, 0, EPILOG);
	}
}

void PxgCudaBroadPhaseSap::translateAABBsKernel()
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.translateAABBsKernel", mContextID);

	if(mUpdateData_BoxesCapacity == 0)
		return;

	const PxBounds3* updateData_fpBounds = reinterpret_cast<PxBounds3*>(mBoxFpBoundsBuf.getDevicePtr());
	PxgIntegerAABB* newIntegerBounds = reinterpret_cast<PxgIntegerAABB*>(mNewIntegerBoundsBuf.getDevicePtr());
	const PxReal* updateData_contactDistances = reinterpret_cast<PxReal*>(mBoxContactDistancesBuf.getDevicePtr());
	const PxU32* updateData_envIDs = reinterpret_cast<PxU32*>(mBoxEnvIDsBuf.getDevicePtr());

	KERNEL_PARAM_TYPE kernelParams[] = {
		CUDA_KERNEL_PARAM(updateData_fpBounds),
		CUDA_KERNEL_PARAM(newIntegerBounds),
		CUDA_KERNEL_PARAM(updateData_contactDistances),
		CUDA_KERNEL_PARAM(updateData_envIDs),
		CUDA_KERNEL_PARAM(mUpdateData_BoxesCapacity),
		CUDA_KERNEL_PARAM(mDesc.gpuBroadPhaseNbBitsShiftX),
		CUDA_KERNEL_PARAM(mDesc.gpuBroadPhaseNbBitsShiftY),
		CUDA_KERNEL_PARAM(mDesc.gpuBroadPhaseNbBitsShiftZ),
		CUDA_KERNEL_PARAM(mDesc.gpuBroadPhaseNbBitsEnvIDX),
		CUDA_KERNEL_PARAM(mDesc.gpuBroadPhaseNbBitsEnvIDY),
		CUDA_KERNEL_PARAM(mDesc.gpuBroadPhaseNbBitsEnvIDZ)
	};

	const PxU32 aabbsPerBlock = PxgBPKernelBlockDim::BP_TRANSLATE_AABBS/8;
	const PxU32 nbBlocks = (mUpdateData_BoxesCapacity + aabbsPerBlock-1)/aabbsPerBlock;	// PT: do we really need mBoxesCapacity here?
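
	// Standard ceiling division: with a hypothetical block size of 512 threads,
	// aabbsPerBlock would be 64 (the kernel appears to use 8 threads per AABB),
	// so e.g. 100 boxes -> (100 + 63)/64 = 2 blocks.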

	_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_TRANSLATE_AABBS, nbBlocks, 1, 1, PxgBPKernelBlockDim::BP_TRANSLATE_AABBS, 1, 1, 0, EPILOG);
}

void PxgCudaBroadPhaseSap::markRemovedPairsKernel()
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.markRemovedPairsKernel", mContextID);

	if(mUpdateData_RemovedHandleSize)
	{
		CUdeviceptr bpDescd = mBPDescBuf.getDevicePtr();
		KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(bpDescd) };

		_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_MARK_DELETEDPAIRS, PxgBPKernelGridDim::BP_UPDATE_DELETEDPAIRS, 1, 1, PxgBPKernelBlockDim::BP_UPDATE_DELETEDPAIRS, 1, 1, 0, EPILOG);
	}
}

void PxgCudaBroadPhaseSap::markRemovedPairsProjectionsKernel()
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.markRemovedPairsProjectionsKernel", mContextID);

	if(mUpdateData_RemovedHandleSize)
	{
		CUdeviceptr bpDescd = mBPDescBuf.getDevicePtr();
		KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(bpDescd) };

		_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_UPDATE_DELETEDPAIRS, PxgBPKernelGridDim::BP_UPDATE_DELETEDPAIRS, 1, 1, PxgBPKernelBlockDim::BP_UPDATE_DELETEDPAIRS, 1, 1, 0, EPILOG);
	}
}

void PxgCudaBroadPhaseSap::markUpdatedPairsKernel()
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.markUpdatedPairsKernel", mContextID);

	//if(mUpdatedHandleSize != 0)	// PT: TODO: why was this removed? ==> probably because the GPU code started reading from the AABB manager bitmap directly (which created the "evil coupling" we found before)
	{
		CUdeviceptr bpDescd = mBPDescBuf.getDevicePtr();
		KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(bpDescd) };

#ifdef SUPPORT_UPDATE_HANDLES_ARRAY_FOR_GPU
		// PT: we need a new kernel to use this as a standalone BP and break the coupling between this class and the GPU AABB manager
		if(mBpDesc->updateData_updatedHandles)
			_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_UPDATE_UPDATEDPAIRS2, PxgBPKernelGridDim::BP_UPDATE_UPDATEDPAIRS2, 1, 1, PxgBPKernelBlockDim::BP_UPDATE_UPDATEDPAIRS2, 1, 1, 0, EPILOG);
		else
#endif
			_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_UPDATE_UPDATEDPAIRS, PxgBPKernelGridDim::BP_UPDATE_UPDATEDPAIRS, 1, 1, PxgBPKernelBlockDim::BP_UPDATE_UPDATEDPAIRS, 1, 1, 0, EPILOG);
	}
}

// PT:
// In:
//	bpDesc->numCreatedHandles
//	bpDesc->numPreviousHandles
//	bpDesc->updateData_createdHandles
//	bpDesc->newIntegerBounds
// Out:
//	bpDesc->boxProjections		// copy of newIntegerBounds but split between X/Y/Z axes
//	bpDesc->boxHandles			// see createHandle(), will have link to CPU index & some flags
//	bpDesc->oldIntegerBounds	// Kernel will set old bounds of new objects to empty
//
// boxProjections & boxHandles are parallel arrays indexed by the GPU index
void PxgCudaBroadPhaseSap::markCreatedPairsKernel()
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.markCreatedPairsKernel", mContextID);

	if(mUpdateData_CreatedHandleSize)
	{
		CUdeviceptr bpDescd = mBPDescBuf.getDevicePtr();
		KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(bpDescd) };

		_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_UPDATE_CREATEDPAIRS, PxgBPKernelGridDim::BP_UPDATE_CREATEDPAIRS, 1, 1, PxgBPKernelBlockDim::BP_UPDATE_CREATEDPAIRS, 1, 1, 0, EPILOG);
	}
}

void PxgCudaBroadPhaseSap::calculateEndPtHistogramKernel(const bool isIncremental)
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.calculateEndPtHistogramKernel", mContextID);

	CUdeviceptr bpDescd = mBPDescBuf.getDevicePtr();
	KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(bpDescd), CUDA_KERNEL_PARAM(isIncremental) };

	_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_COMPUTE_ENDPT_HISTOGRAM, PxgBPKernelGridDim::BP_COMPUTE_ENDPT_HISTOGRAM, 3, 1, PxgBPKernelBlockDim::BP_COMPUTE_ENDPT_HISTOGRAM, 1, 1, 0, EPILOG);
	_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_OUTPUT_ENDPT_HISTOGRAM, PxgBPKernelGridDim::BP_OUTPUT_ENDPT_HISTOGRAM, 3, 1, PxgBPKernelBlockDim::BP_OUTPUT_ENDPT_HISTOGRAM, 1, 1, 0, EPILOG);
}

void PxgCudaBroadPhaseSap::computeRegionHistogramKernel()
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.computeRegionHistogramKernel", mContextID);

	if(mUpdateData_CreatedHandleSize)
	{
		const PxU32 nbProjections = (mNumOfBoxes + mUpdateData_RemovedHandleSize) * 2;
		const PxU32 totalNbProjectionRegions = (nbProjections*64 + 3)&(~3);

		//zero regions
		mCudaContext->memsetD32Async(mActiveRegionTotalBuf.getDevicePtr(), 0, totalNbProjectionRegions, mStream);
		mCudaContext->memsetD32Async(mStartRegionsTotalBuf.getDevicePtr(), 0, totalNbProjectionRegions, mStream);

		CUdeviceptr bpDescd = mBPDescBuf.getDevicePtr();
		KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(bpDescd) };

		//create regions
		_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_CREATE_REGIONS, PxgBPKernelGridDim::BP_CREATE_REGIONS, 1, 1, PxgBPKernelBlockDim::BP_CREATE_REGIONS, 1, 1, 0, EPILOG);

		//compute start region histogram inside a block
		_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_COMPUTE_START_REGION_HISTOGRAM, PxgBPKernelGridDim::BP_COMPUTE_START_REGION_HISTOGRAM, 1, 1, PxgBPKernelBlockDim::BP_COMPUTE_START_REGION_HISTOGRAM, 1, 1, 0, EPILOG);

		//compute start region histogram between blocks
		_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_OUTPUT_START_REGION_HISTOGRAM, PxgBPKernelGridDim::BP_OUTPUT_START_REGION_HISTOGRAM, 1, 1, PxgBPKernelBlockDim::BP_OUTPUT_START_REGION_HISTOGRAM, 1, 1, 0, EPILOG);

		_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_COMPUTE_REGION_HISTOGRAM, PxgBPKernelGridDim::BP_COMPUTE_REGION_HISTOGRAM, 1, 1, PxgBPKernelBlockDim::BP_COMPUTE_REGION_HISTOGRAM, 1, 1, 0, EPILOG);
		_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_OUTPUT_REGION_HISTOGRAM, PxgBPKernelGridDim::BP_OUTPUT_REGION_HISTOGRAM, 1, 1, PxgBPKernelBlockDim::BP_OUTPUT_REGION_HISTOGRAM, 1, 1, 0, EPILOG);
	}
}

void PxgCudaBroadPhaseSap::computeStartAndActiveHistogramKernel()
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.computeStartAndActiveHistogramKernel", mContextID);

	if(mUpdateData_CreatedHandleSize)
	{
		CUdeviceptr bpDescd = mBPDescBuf.getDevicePtr();
		KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(bpDescd) };

		_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_WRITEOUT_ACTIVE_HISTOGRAM, PxgBPKernelGridDim::BP_WRITEOUT_ACTIVE_HISTOGRAM, 1, 1, PxgBPKernelBlockDim::BP_WRITEOUT_ACTIVE_HISTOGRAM, 1, 1, 0, EPILOG);
		_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_COMPUTE_ACTIVE_HISTOGRAM, PxgBPKernelGridDim::BP_COMPUTE_ACTIVE_HISTOGRAM, 1, 1, PxgBPKernelBlockDim::BP_COMPUTE_ACTIVE_HISTOGRAM, 1, 1, 0, EPILOG);
		_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_OUTPUT_ACTIVE_HISTOGRAM, PxgBPKernelGridDim::BP_OUTPUT_ACTIVE_HISTOGRAM, 1, 1, PxgBPKernelBlockDim::BP_OUTPUT_ACTIVE_HISTOGRAM, 1, 1, 0, EPILOG);
	}
}

void PxgCudaBroadPhaseSap::performIncrementalSapKernel()
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.performIncrementalSapKernel", mContextID);

	//if(mUpdatedHandleSize != 0)	// PT: TODO: why was this removed?
	{
		CUdeviceptr bpDescd = mBPDescBuf.getDevicePtr();
		KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(bpDescd) };

		_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_COMPUTE_INCREMENTAL_CMP_COUNTS1, PxgBPKernelGridDim::BP_COMPUTE_INCREMENTAL_CMP_COUNTS1, 1, 1, PxgBPKernelBlockDim::BP_COMPUTE_INCREMENTAL_CMP_COUNTS1, 1, 1, 0, EPILOG);
		_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_COMPUTE_INCREMENTAL_CMP_COUNTS2, PxgBPKernelGridDim::BP_COMPUTE_INCREMENTAL_CMP_COUNTS2, 1, 1, PxgBPKernelBlockDim::BP_COMPUTE_INCREMENTAL_CMP_COUNTS2, 1, 1, 0, EPILOG);
		_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_INCREMENTAL_SAP, PxgBPKernelGridDim::BP_INCREMENTAL_SAP, 3, 1, PxgBPKernelBlockDim::BP_INCREMENTAL_SAP, 1, 1, 0, EPILOG);
	}
}

void PxgCudaBroadPhaseSap::generateNewPairsKernel()
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.generateNewPairsKernel", mContextID);

	if(mUpdateData_CreatedHandleSize)
	{
		//Need to generate pairs for created handles...
		CUdeviceptr bpDescd = mBPDescBuf.getDevicePtr();
		KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(bpDescd) };

		_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_WRITEOUT_OVERLAPCHECKS_HISTOGRAM_NEWBOUNDS, PxgBPKernelGridDim::BP_WRITEOUT_OVERLAPCHECKS_HISTOGRAM_NEWBOUNDS, 1, 1, PxgBPKernelBlockDim::BP_WRITEOUT_OVERLAPCHECKS_HISTOGRAM_NEWBOUNDS, 1, 1, 0, EPILOG);
		_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_COMPUTE_OVERLAPCHECKS_HISTOGRAM, PxgBPKernelGridDim::BP_COMPUTE_OVERLAPCHECKS_HISTOGRAM, 1, 1, PxgBPKernelBlockDim::BP_COMPUTE_OVERLAPCHECKS_HISTOGRAM, 1, 1, 0, EPILOG);
		_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_OUTPUT_OVERLAPCHECKS_HISTOGRAM, PxgBPKernelGridDim::BP_OUTPUT_OVERLAPCHECKS_HISTOGRAM, 1, 1, PxgBPKernelBlockDim::BP_OUTPUT_OVERLAPCHECKS_HISTOGRAM, 1, 1, 0, EPILOG);
		_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_GENERATE_FOUNDPAIR_NEWBOUNDS, PxgBPKernelGridDim::BP_GENERATE_FOUNDPAIR_NEWBOUNDS, 1, 1, PxgBPKernelBlockDim::BP_GENERATE_FOUNDPAIR_NEWBOUNDS, 1, 1, 0, EPILOG);
	}

	GPU_DEBUG_STREAM(mStream, "GPU generate new pairs fail!!\n")
}

void PxgCudaBroadPhaseSap::clearNewFlagKernel()
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.clearNewFlagKernel", mContextID);

	if(mUpdateData_CreatedHandleSize)
	{
		CUdeviceptr bpDescd = mBPDescBuf.getDevicePtr();
		KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(bpDescd) };

		_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_CLEAR_NEWFLAG, PxgBPKernelGridDim::BP_CLEAR_NEWFLAG, 1, 1, PxgBPKernelBlockDim::BP_CLEAR_NEWFLAG, 1, 1, 0, EPILOG);
	}
}

void PxgCudaBroadPhaseSap::updateRadixSortDesc(PxgRadixSortDesc* rsDescs)
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.updateRadixSortDesc", mContextID);

	for (PxU32 i = 0; i < 3; ++i)
	{
		PxU32 offIndex = i+3;
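		// Descriptors i and offIndex (i+3) are mirror images of each other:
		// what descriptor i writes, descriptor i+3 reads, so consecutive radix
		// sort passes ping-pong between the projection and temp buffers
		// without any extra copies.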
		CUdeviceptr inputKeyd = mBoxPtProjectionsBuf[i].getDevicePtr();
		CUdeviceptr inputRankd = mBoxProjectionRanksBuf[i].getDevicePtr();
		CUdeviceptr outputKeyd = mTempBoxPtProjectionBuf[i].getDevicePtr();
		CUdeviceptr outputRankd = mTempBoxPtHandlesBuf[i].getDevicePtr();
		CUdeviceptr radixCountd = mRadixCountBuf[i].getDevicePtr();

		rsDescs[i].inputKeys = reinterpret_cast<PxU32*>(inputKeyd);
		rsDescs[i].inputRanks = reinterpret_cast<PxU32*>(inputRankd);
		rsDescs[i].outputKeys = reinterpret_cast<PxU32*>(outputKeyd);
		rsDescs[i].outputRanks = reinterpret_cast<PxU32*>(outputRankd);
		rsDescs[i].radixBlockCounts = reinterpret_cast<PxU32*>(radixCountd);

		rsDescs[offIndex].outputKeys = reinterpret_cast<PxU32*>(inputKeyd);
		rsDescs[offIndex].outputRanks = reinterpret_cast<PxU32*>(inputRankd);
		rsDescs[offIndex].inputKeys = reinterpret_cast<PxU32*>(outputKeyd);
		rsDescs[offIndex].inputRanks = reinterpret_cast<PxU32*>(outputRankd);
		rsDescs[offIndex].radixBlockCounts = reinterpret_cast<PxU32*>(radixCountd);

		CUdeviceptr inputVald = mBoxPtHandlesBuf[i].getDevicePtr();

		mRSDescWOR[i].inputKeys = reinterpret_cast<PxU32*>(inputKeyd);
		mRSDescWOR[i].inputRanks = reinterpret_cast<PxU32*>(inputVald);
		mRSDescWOR[i].outputKeys = reinterpret_cast<PxU32*>(outputKeyd);
		mRSDescWOR[i].outputRanks = reinterpret_cast<PxU32*>(outputRankd);
		mRSDescWOR[i].radixBlockCounts = reinterpret_cast<PxU32*>(radixCountd);

		mRSDescWOR[offIndex].outputKeys = reinterpret_cast<PxU32*>(inputKeyd);
		mRSDescWOR[offIndex].outputRanks = reinterpret_cast<PxU32*>(inputVald);
		mRSDescWOR[offIndex].inputKeys = reinterpret_cast<PxU32*>(outputKeyd);
		mRSDescWOR[offIndex].inputRanks = reinterpret_cast<PxU32*>(outputRankd);
		mRSDescWOR[offIndex].radixBlockCounts = reinterpret_cast<PxU32*>(radixCountd);
	}
}

void PxgCudaBroadPhaseSap::updateDescriptor(PxgBroadPhaseDesc& desc)
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.updateDescriptor", mContextID);

	// PT: there's some evil coupling between the BP and mAABBManager here. The CUDA code does use
	// these buffers (e.g. volumeData) so effectively the CUDA BP cannot be used alone without the Pxg AABB manager.

	// PT: added this here
	PxMemZero(&desc, sizeof(PxgBroadPhaseDesc));

	const PxU32 previousBoxes = mNumOfBoxes + mUpdateData_RemovedHandleSize - mUpdateData_CreatedHandleSize;

	desc.updateData_createdHandles = reinterpret_cast<PxU32*>(mCreatedHandlesBuf.getDevicePtr());
	desc.numCreatedHandles = mUpdateData_CreatedHandleSize;
	desc.updateData_removedHandles = reinterpret_cast<PxU32*>(mRemovedHandlesBuf.getDevicePtr());
	desc.numRemovedHandles = mUpdateData_RemovedHandleSize;

	// PT: TODO: replace with adapter? I think this won't work without the bitmaps anyway?
	if(mAABBManager)
	{
		// PT: this data is used in:
		// - markUpdatedPairsLaunch (BP_UPDATE_UPDATEDPAIRS)
		{
			desc.aabbMngr_changedHandleMap = reinterpret_cast<PxU32*>(mAABBManager->getChangedAABBMgrHandles());
			desc.aabbMngr_changedHandleBitMapWordCounts = mAABBManager->getChangedAABBMgActorHandleMap().getWordCount();
			desc.aabbMngr_addedHandleMap = reinterpret_cast<PxU32*>(mAABBManager->getAddedHandles());
			desc.aabbMngr_removedHandleMap = reinterpret_cast<PxU32*>(mAABBManager->getRemovedHandles());
			desc.aabbMngr_aggregatedBoundHandles = reinterpret_cast<PxU32*>(mAABBManager->getAggregatedBounds());
		}

		// PT: this data is used in:
		// - doAggPairCollisions (AGG_PAIR_COLLISION)
		// - accumulateReportsStage_1 (BP_ACCUMULATE_REPORT_STAGE_1)
		// - accumulateReportsStage_2 (BP_ACCUMULATE_REPORT_STAGE_2)
		desc.aabbMngr_volumeData = reinterpret_cast<Bp::VolumeData*>(mAABBManager->mVolumDataBuf.getDevicePtr());
	}
#ifdef SUPPORT_UPDATE_HANDLES_ARRAY_FOR_GPU
	else
	{
		// PT: the GPU AABB manager never passes updated handles, the list is always empty! In this codepath
		// (used for standalone BPs) we make it work again with an array of updated handles.
		desc.updateData_updatedHandles = reinterpret_cast<PxU32*>(mUpdatedHandlesBuf.getDevicePtr());
		desc.numUpdatedHandles = mUpdateData_UpdatedHandleSize;
		//printf("%d\n", desc.numUpdatedHandles);
	}
#endif

	desc.oldIntegerBounds = reinterpret_cast<PxgIntegerAABB*>(mOldIntegerBoundsBuf.getDevicePtr());
	desc.newIntegerBounds = reinterpret_cast<PxgIntegerAABB*>(mNewIntegerBoundsBuf.getDevicePtr());
	desc.updateData_fpBounds = reinterpret_cast<PxBounds3*>(mBoxFpBoundsBuf.getDevicePtr());
	desc.updateData_contactDistances = reinterpret_cast<PxReal*>(mBoxContactDistancesBuf.getDevicePtr());
	desc.updateData_groups = reinterpret_cast<PxU32*>(mBoxGroupsBuf.getDevicePtr());
	desc.updateData_envIDs = reinterpret_cast<PxU32*>(mBoxEnvIDsBuf.getDevicePtr());

	desc.numPreviousHandles = previousBoxes;
	//desc.numHandles = mNumOfBoxes;
	desc.foundPairReport = reinterpret_cast<PxgBroadPhasePair*>(mFoundPairsBuf.getDevicePtr());
	desc.lostPairReport = reinterpret_cast<PxgBroadPhasePair*>(mLostPairsBuf.getDevicePtr());

	desc.foundAggPairReport = reinterpret_cast<PxgBroadPhasePair*>(mFoundAggregateBuf.getDevicePtr());
	desc.lostAggPairReport = reinterpret_cast<PxgBroadPhasePair*>(mLostAggregateBuf.getDevicePtr());

	desc.foundActorPairReport = reinterpret_cast<PxgBroadPhasePair*>(mFoundActorBuf.getDevicePtr());
	desc.lostActorPairReport = reinterpret_cast<PxgBroadPhasePair*>(mLostActorBuf.getDevicePtr());

	desc.foundPairReportMap = reinterpret_cast<PxgBroadPhasePair*>(getMappedDevicePtr(mCudaContext, mFoundActorPairs.begin()));
	desc.lostPairReportMap = reinterpret_cast<PxgBroadPhasePair*>(getMappedDevicePtr(mCudaContext, mLostActorPairs.begin()));

	for (PxU32 i = 0; i < 3; ++i)
	{
		/*const PxU32 offset = i*nbProjections;
		desc.boxProjectionRanks[i] = projRanksGpuPtr + offset;*/

		desc.boxSapBox1D[i] = reinterpret_cast<PxgSapBox1D*>(mBoxSapBox1DBuf[i].getDevicePtr());
		desc.boxNewSapBox1D[i] = reinterpret_cast<PxgSapBox1D*>(mNewBoxSapBox1DBuf[i].getDevicePtr());
		desc.boxProjectionRanks[i] = reinterpret_cast<PxU32*>(mBoxProjectionRanksBuf[i].getDevicePtr());
		desc.boxProjections[i] = reinterpret_cast<PxU32*>(mBoxPtProjectionsBuf[i].getDevicePtr());
		desc.boxHandles[0][i] = reinterpret_cast<PxU32*>(mBoxPtHandlesBuf[i].getDevicePtr());
		desc.boxHandles[1][i] = reinterpret_cast<PxU32*>(mBoxPtHandlesBuf[i+3].getDevicePtr());

		{
			desc.totalEndPtHistogram[i] = reinterpret_cast<PxU32*>(mTotalEndPtHistogramBuf[i].getDevicePtr());
			desc.blockTotalEndPtHistogram[i] = reinterpret_cast<PxU32*>(mBlockTotalEndPtHistogramBuf[i].getDevicePtr());

			for (PxU32 j = 0; j < 2; ++j)
			{
				//desc.boxHandles[j][i] = handleGpuPtr[j] + i*projectionCount[j];
				const PxU32 index = j * 3 + i;
				desc.endPtHistogram[j][i] = reinterpret_cast<PxU32*>(mEndPtHistogramBuf[index].getDevicePtr());
				desc.blockEndPtHistogram[j][i] = reinterpret_cast<PxU32*>(mBlockEndPtHistogramBuf[index].getDevicePtr());
				desc.startPtHistogram[j][i] = reinterpret_cast<PxU32*>(mStartPtHistogramBuf[index].getDevicePtr());
				desc.blockStartPtHistogram[j][i] = reinterpret_cast<PxU32*>(mBlockStartPtHistogramBuf[index].getDevicePtr());

				desc.endPointHandles[j][i] = reinterpret_cast<PxU32*>(mEndPtHandleBuf[index].getDevicePtr());
				desc.startPointHandles[j][i] = reinterpret_cast<PxU32*>(mStartPtHandleBuf[index].getDevicePtr());
			}

			desc.incrementalComparisons[i] = reinterpret_cast<PxU32*>(mIncrementalComparisons[i].getDevicePtr());
			desc.incrementalBlockComparisons[i] = reinterpret_cast<PxU32*>(mIncrementalBlockComparisons[i].getDevicePtr());
		}
	}

	for (PxU32 i = 0; i < 2; ++i)
	{
		desc.aggReportBlock[i] = reinterpret_cast<PxU32*>(mAggregateReportBlockBuf[i].getDevicePtr());
		desc.actorReportBlock[i] = reinterpret_cast<PxU32*>(mActorReportBlockBuf[i].getDevicePtr());
	}

	desc.activeRegionsHistogram = reinterpret_cast<PxU32*>(mActiveRegionTotalBuf.getDevicePtr());
	desc.startRegionsHistogram = reinterpret_cast<PxU32*>(mStartRegionsTotalBuf.getDevicePtr());
	desc.orderedActiveRegionHandles = reinterpret_cast<PxU32*>(mOrderedActiveRegionHandlesTotalBuf.getDevicePtr());
	desc.orderedStartRegionHandles = reinterpret_cast<PxU32*>(mOrderedStartRegionHandlesTotalBuf.getDevicePtr());

	desc.blockOverlapChecksRegion = reinterpret_cast<regionOverlapType*>(mBlockOverlapChecksRegionBuf.getDevicePtr());
	desc.overlapChecksRegion = reinterpret_cast<regionOverlapType*>(mOverlapChecksRegionBuf.getDevicePtr());
	desc.overlapChecksHandleRegiones = reinterpret_cast<PxgHandleRegion*>(mOverlapChecksHandleRegionBuf.getDevicePtr());
	desc.regionRange = reinterpret_cast<PxgIntegerRegion*>(mRegionRangeBuf.getDevicePtr());
	desc.startRegionAccum = reinterpret_cast<PxU32*>(mStartRegionAccumBuf.getDevicePtr());
	desc.blockStartRegionAccum = reinterpret_cast<PxU32*>(mBlockStartRegionAccumBuf.getDevicePtr());
	desc.regionAccum = reinterpret_cast<PxU32*>(mRegionAccumBuf.getDevicePtr());
	desc.blockRegionAccum = reinterpret_cast<PxU32*>(mBlockRegionAccumBuf.getDevicePtr());

	desc.sharedFoundPairIndex = 0;
	desc.sharedLostPairIndex = 0;
	desc.sharedFoundAggPairIndex = 0;
	desc.sharedLostAggPairIndex = 0;

	desc.startRegionAccumTotal = 0;

	desc.regionAccumTotal = mRegionAccumTotal;
	desc.overlapChecksTotalRegion = mOverlapChecksTotalRegion;
	desc.max_found_lost_pairs = mMaxFoundLostPairs;
	desc.found_lost_pairs_overflow_flags = false;

	desc.max_found_lost_agg_pairs = mMaxAggFoundLostPairs;
}

void PxgCudaBroadPhaseSap::update(PxcScratchAllocator* /*scratchAllocator*/, const Bp::BroadPhaseUpdateData& updateData, PxBaseTask* /*continuation*/)
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.update", mContextID);

	// PT: TODO: this function is now the only place left using getGpuStateChanged() and getStateChanged()
	// PT: TODO: could we move this outside of the update call to sever this last connection?
	*mPinnedEvent = 0;

	PxScopedCudaLock _lock_(*mCudaContextManager);

	const PxU32 previousNumOfBoxes = mNumOfBoxes;
	gpuDMAUp(updateData, *mBpDesc, mRSDesc);

	const bool gpuStateChanged = updateData.getGpuStateChanged();
	bool forcedUpdate = false;

	if((updateData.getNumCreatedHandles() + updateData.getNumRemovedHandles()) == 0 && !updateData.getStateChanged() && !gpuStateChanged)
	{
		if (mForceUpdate)
		{
			forcedUpdate = true;
			//We force a single update after everything has gone to sleep to force through some
			//properties like the double-buffered bounds swap. If we don't do this, then some of the
			//GPU aggregate logic can fail.
		}
		else
			return;
	}

	mForceUpdate = !forcedUpdate;

	// PT: not all kernels are needed for all cases (added / updated / removed).
	// For an easier time analysing the code, it can be good to only trace the code needed for
	// "one shot queries", ignoring the bits needed for updated & removed objects.
	const bool oneShotQuery = false;

	// translate from FP bounds to integer bounds
	translateAABBsKernel();

	if(!oneShotQuery)
	{
		//we mark pairs as removed but we don't change their projections in the previous frame's box handles
		markRemovedPairsKernel();

		//we mark pairs as updated (update projections) in the current frame's box handles
		if(gpuStateChanged)
			markUpdatedPairsKernel();

		//sort projections in the current frame and produce ranks, which are used in the incremental SAP to simulate swaps.
		//Also, we need to use the ranks to update the current frame's box handles
		sortProjectionAndHandlesWRKernel(previousNumOfBoxes);

		//histogram for projection end points
		calculateEndPtHistogramKernel(true);

		//perform incremental SAP to produce found/lost pairs for the updated objects
		if(gpuStateChanged)
			performIncrementalSapKernel();
	}

	if(mUpdateData_RemovedHandleSize)
	{
		//we need to recalculate the sap boxes because we re-sort the projections and handles based on the updated pairs
		initializeSapBoxKernel(previousNumOfBoxes, false);

		//we need to reset the projections for the removed pairs so that they can be shuffled to the end of the array after sorting
		markRemovedPairsProjectionsKernel();
	}

	if(mUpdateData_CreatedHandleSize || mUpdateData_RemovedHandleSize)
	{
		markCreatedPairsKernel();

		sortProjectionAndHandlesWORKernel(previousNumOfBoxes);

		calculateEndPtHistogramKernel(false);
	}

	initializeSapBoxKernel(mNumOfBoxes, false);

	computeRegionHistogramKernel();

	computeStartAndActiveHistogramKernel();

	generateNewPairsKernel();

	if(!oneShotQuery)
		PxgCudaBuffer::swapBuffer(mNewIntegerBoundsBuf, mOldIntegerBoundsBuf);

	clearNewFlagKernel();

	runCopyResultsKernel(*mBpDesc);

	//mCudaContext->streamFlush(mStream);
}

// PT: called from PxgAABBManager::preBpUpdate_GPU
void PxgCudaBroadPhaseSap::preBroadPhase(const Bp::BroadPhaseUpdateData& updateData)
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.preBroadPhase", mContextID);

	PxScopedCudaLock _lock_(*mCudaContextManager);

	//mPreviousNumOfBoxes = mNumOfBoxes;
	//gpuDMAUp(updateData, *mBpDesc, mRSDesc);

	// PT: the code below used to be in "gpuDmaUpSharedData"

	const PxU32 capacity = updateData.getCapacity();
	mUpdateData_BoxesCapacity = capacity;

	const PxU32 boundsSize = capacity * sizeof(PxBounds3);
	const PxU32 distanceSize = capacity * sizeof(PxReal);
	const PxU32 groupSize = capacity * sizeof(PxU32);
	const PxU32 envIDSize = capacity * sizeof(PxU32);

	mBoxContactDistancesBuf.allocate(distanceSize, PX_FL);
	mBoxGroupsBuf.allocate(groupSize, PX_FL);
	if(updateData.getEnvIDs())
		mBoxEnvIDsBuf.allocate(envIDSize, PX_FL);

	if(updateData.getStateChanged())	// PT: otherwise the call should have been skipped
	{
		mBoxFpBoundsBuf.allocate(boundsSize, PX_FL);
		mCudaContext->memcpyHtoDAsync(mBoxFpBoundsBuf.getDevicePtr(), updateData.getAABBs(), boundsSize, mStream);
	}
	mCudaContext->memcpyHtoDAsync(mBoxContactDistancesBuf.getDevicePtr(), updateData.getContactDistance(), distanceSize, mStream);
	mCudaContext->memcpyHtoDAsync(mBoxGroupsBuf.getDevicePtr(), updateData.getGroups(), groupSize, mStream);
	if(updateData.getEnvIDs())
		mCudaContext->memcpyHtoDAsync(mBoxEnvIDsBuf.getDevicePtr(), updateData.getEnvIDs(), envIDSize, mStream);
}

void PxgCudaBroadPhaseSap::fetchBroadPhaseResults()
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.fetchBroadPhaseResults", mContextID);

	PxScopedCudaLock _lock_(*mCudaContextManager);

	gpuDMABack(*mBpDesc);

	//purgeDuplicateFoundPairs();
	//purgeDuplicateLostPairs();

	// flip double buffer
	{
		for(PxU32 i=0; i<3; ++i)
		{
			const PxU32 swapId = i + 3;
			PxgCudaBuffer::swapBuffer(mBoxPtHandlesBuf[i], mBoxPtHandlesBuf[swapId]);
			PxgCudaBuffer::swapBuffer(mBlockEndPtHistogramBuf[i], mBlockEndPtHistogramBuf[swapId]);
			PxgCudaBuffer::swapBuffer(mEndPtHistogramBuf[i], mEndPtHistogramBuf[swapId]);
			PxgCudaBuffer::swapBuffer(mEndPtHandleBuf[i], mEndPtHandleBuf[swapId]);

			PxgCudaBuffer::swapBuffer(mBlockStartPtHistogramBuf[i], mBlockStartPtHistogramBuf[swapId]);
			PxgCudaBuffer::swapBuffer(mStartPtHistogramBuf[i], mStartPtHistogramBuf[swapId]);
			PxgCudaBuffer::swapBuffer(mStartPtHandleBuf[i], mStartPtHandleBuf[swapId]);
		}
	}
}