Files
XCEngine/engine/third_party/physx/source/gpubroadphase/src/PxgCudaBroadPhaseSap.cpp

1280 lines
55 KiB
C++

// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "foundation/PxAllocator.h"
#include "foundation/PxTime.h"
#include "foundation/PxMemory.h"
#include "foundation/PxSort.h"
#include "common/PxProfileZone.h"
#include "PxvSimStats.h"
#include "PxgCudaBroadPhaseSap.h"
#include "PxgBroadPhaseKernelIndices.h"
#include "PxgIntegerAABB.h"
#include "PxgBroadPhasePairReport.h"
#include "BpBroadPhaseUpdate.h"
#include "PxgSapBox1D.h"
#include "PxgRadixSortDesc.h"
#include "PxgCudaMemoryAllocator.h"
#include "PxgKernelWrangler.h"
#include "PxgKernelIndices.h"
#include "PxSceneDesc.h"
#include "PxgCudaUtils.h"
#include "PxgRadixSortKernelIndices.h"
#include "PxgAABBManager.h"
#include "PxgContext.h"
#include "PxgSimulationCore.h"
#include "CudaKernelWrangler.h"
#include "cudamanager/PxCudaContext.h"
#include "cudamanager/PxCudaContextManager.h"
#include "PxgKernelLauncher.h"
// PT: TODO:
// - most of these functions don't need to be member functions
// Compile-time switches for this translation unit.
// GPU_BP_DEBUG: when non-zero, GPU_DEBUG_STREAM synchronizes the given stream and
// reports an internal error if the synchronization fails; it is also passed as the
// template argument of every _launch call in this file.
#define GPU_BP_DEBUG 0
// USE_NEW_LAUNCH_FUNCTION: selects which kernel-parameter representation the
// KERNEL_PARAM_TYPE / CUDA_KERNEL_PARAM / EPILOG macros below expand to.
#define USE_NEW_LAUNCH_FUNCTION 1
#if GPU_BP_DEBUG
#define GPU_DEBUG_STREAM(s, x) \
{ \
const CUresult err = mCudaContext->streamSynchronize(s); \
if(err != CUDA_SUCCESS) \
outputError<PxErrorCode::eINTERNAL_ERROR>(__LINE__, x); \
}
#else
// Debug checks disabled: the macro expands to nothing.
#define GPU_DEBUG_STREAM(s, x)
#endif
// PROLOG supplies the leading arguments of a _launch call (kernel wrangler and CUDA context).
#define PROLOG mGpuKernelWranglerManager->mKernelWrangler, mCudaContext
#if USE_NEW_LAUNCH_FUNCTION
// Parameters are passed as an array of void*; EPILOG needs no explicit size.
#define KERNEL_PARAM_TYPE void*
#define CUDA_KERNEL_PARAM PX_CUDA_KERNEL_PARAM2
// EPILOG supplies the trailing arguments of a _launch call. It relies on a member
// named mStream and on a local array named kernelParams being in scope at the call site.
#define EPILOG mStream, kernelParams, PX_FL
#else
// Legacy path: parameters are PxCudaKernelParam entries and EPILOG also passes the array size.
#define KERNEL_PARAM_TYPE PxCudaKernelParam
#define CUDA_KERNEL_PARAM PX_CUDA_KERNEL_PARAM
#define EPILOG mStream, kernelParams, sizeof(kernelParams), PX_FL
#endif
using namespace physx;
// Provides the outputError<>() helper used by GPU_DEBUG_STREAM and by the stream/event creation code.
PX_IMPLEMENT_OUTPUT_ERROR
// Constructs the GPU sweep-and-prune broad phase.
//
// The member-initializer list binds every device buffer wrapper to heapMemoryManager
// (accounted under PxsHeapStats::eBROADPHASE) without allocating storage. The body then
// allocates the buffers whose size is known up front and creates the stream/event.
//
// desc                 copied into mDesc (its bit-shift settings are read by translateAABBsKernel).
// gpuKernelWrangler    stored; provides the kernel wrangler used by every launch (see PROLOG).
// cudaContextManager   stored; its CUDA context is cached in mCudaContext.
// init                 provides foundLostPairsCapacity and foundLostAggregatePairsCapacity.
// heapMemoryManager    backing allocator for device buffers and mapped host memory.
// contextID            stored in mContextID and passed to the profile zones in this file.
PxgCudaBroadPhaseSap::PxgCudaBroadPhaseSap(const PxGpuBroadPhaseDesc& desc, PxgCudaKernelWranglerManager* gpuKernelWrangler, PxCudaContextManager* cudaContextManager, const PxGpuDynamicsMemoryConfig& init, PxgHeapMemoryAllocatorManager* heapMemoryManager, PxU64 contextID) :
Bp::BroadPhase (),
mContextID (contextID),
mDesc (desc),
mNumOfBoxes (0),
mUpdateData_CreatedHandleSize (0),
mUpdateData_RemovedHandleSize (0),
#ifdef SUPPORT_UPDATE_HANDLES_ARRAY_FOR_GPU
mUpdateData_UpdatedHandleSize (0),
#endif
mUpdateData_BoxesCapacity (0),
mGpuKernelWranglerManager (gpuKernelWrangler),
mCudaContextManager (cudaContextManager),
mCudaContext (cudaContextManager->getCudaContext()),
mHeapMemoryManager (heapMemoryManager),
mCreatedHandlesBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mRemovedHandlesBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
#ifdef SUPPORT_UPDATE_HANDLES_ARRAY_FOR_GPU
// PT: looks like this stuff used to be here but got removed for some reason!
mUpdatedHandlesBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
#endif
mBoxFpBoundsBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mBoxContactDistancesBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mBoxGroupsBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mBoxEnvIDsBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mNewIntegerBoundsBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mOldIntegerBoundsBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mBoxPtProjectionsBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mBoxProjectionRanksBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mBoxPtHandlesBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mTempBoxPtProjectionBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mTempBoxPtHandlesBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mRadixCountBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mBoxSapBox1DBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mNewBoxSapBox1DBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mEndPtHistogramBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mBlockEndPtHistogramBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mEndPtHandleBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mStartPtHistogramBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mBlockStartPtHistogramBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mStartPtHandleBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mTotalEndPtHistogramBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mBlockTotalEndPtHistogramBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mActiveRegionTotalBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mStartRegionsTotalBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mOrderedActiveRegionHandlesTotalBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mOrderedStartRegionHandlesTotalBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mOverlapChecksRegionBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mBlockOverlapChecksRegionBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mOverlapChecksHandleRegionBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mIncrementalComparisons (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mIncrementalBlockComparisons (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mAggregateReportBlockBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mActorReportBlockBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mRegionRangeBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mStartRegionAccumBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mBlockStartRegionAccumBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mRegionAccumBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mBlockRegionAccumBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mFoundPairsBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mLostPairsBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mFoundAggregateBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mLostAggregateBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mFoundActorBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mLostActorBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mBPDescBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mRadixSortDescBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mRadixSortWORDescBuf (heapMemoryManager, PxsHeapStats::eBROADPHASE),
mPinnedEvent (NULL),
mBpDesc (NULL),
mRSDesc (NULL),
mRSDescWOR (NULL),
mFoundActorPairs (PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators, PxsHeapStats::eBROADPHASE)),
mLostActorPairs (PxVirtualAllocator(heapMemoryManager->mMappedMemoryAllocators, PxsHeapStats::eBROADPHASE)),
mMaxFoundLostPairs (init.foundLostPairsCapacity),
mMaxAggFoundLostPairs (init.foundLostAggregatePairsCapacity),
mAABBManager (NULL),
#if PX_ENABLE_SIM_STATS
mFoundLostPairsStats (0),
#else
PX_CATCH_UNDEFINED_ENABLE_SIM_STATS
#endif
mForceUpdate (true)
{
// Scoped lock on the CUDA context manager, held for all allocations below.
PxScopedCudaLock _lock_(*mCudaContextManager);
// Three radix-count buffers, each holding 16 PxU32 counters per radix-sort grid block.
// NOTE(review): presumably one buffer per axis and 16 = buckets of a 4-bit digit - confirm against the kernels.
for(PxU32 i = 0; i < 3; ++i)
mRadixCountBuf[i].allocate(sizeof(PxU32) * PxgRadixSortKernelGridDim::RADIX_SORT * 16, PX_FL);
// Device-side copies of the descriptors. Six radix-sort descriptors are stored per
// buffer; runRadixSort addresses them as two groups of three.
mBPDescBuf.allocate(sizeof(PxgBroadPhaseDesc), PX_FL);
mRadixSortDescBuf.allocate(sizeof(PxgRadixSortDesc)*6, PX_FL);
mRadixSortWORDescBuf.allocate(sizeof(PxgRadixSortDesc)*6, PX_FL);
// Host-side descriptor storage taken from the mapped-memory allocator; released in the destructor.
mBpDesc = reinterpret_cast<PxgBroadPhaseDesc*>(mHeapMemoryManager->mMappedMemoryAllocators->allocate(sizeof(PxgBroadPhaseDesc), PxsHeapStats::eBROADPHASE, PX_FL));
mRSDesc = reinterpret_cast<PxgRadixSortDesc*>(mHeapMemoryManager->mMappedMemoryAllocators->allocate(sizeof(PxgRadixSortDesc) * 6, PxsHeapStats::eBROADPHASE, PX_FL));
mRSDescWOR = reinterpret_cast<PxgRadixSortDesc*>(mHeapMemoryManager->mMappedMemoryAllocators->allocate(sizeof(PxgRadixSortDesc) * 6, PxsHeapStats::eBROADPHASE, PX_FL));
// These totals are refreshed from the descriptor in gpuDMABack.
mRegionAccumTotal = 0;
mOverlapChecksTotalRegion = 0;
mStartRegionAccumTotal = 0;
// Found/lost pair buffers are sized once from the user-provided capacities.
mFoundPairsBuf.allocate(mMaxFoundLostPairs * sizeof(PxgBroadPhasePair), PX_FL);
mLostPairsBuf.allocate(mMaxFoundLostPairs * sizeof(PxgBroadPhasePair), PX_FL);
mFoundAggregateBuf.allocate(mMaxAggFoundLostPairs * sizeof(PxgBroadPhasePair), PX_FL);
mLostAggregateBuf.allocate(mMaxAggFoundLostPairs * sizeof(PxgBroadPhasePair), PX_FL);
mFoundActorBuf.allocate(mMaxFoundLostPairs * sizeof(PxgBroadPhasePair), PX_FL);
mLostActorBuf.allocate(mMaxFoundLostPairs * sizeof(PxgBroadPhasePair), PX_FL);
// Host-visible result arrays: start empty but reserve the full capacity up front.
mFoundActorPairs.forceSize_Unsafe(0);
mFoundActorPairs.reserve(mMaxFoundLostPairs);
mLostActorPairs.forceSize_Unsafe(0);
mLostActorPairs.reserve(mMaxFoundLostPairs);
createGpuStreamsAndEvents();
}
// Releases the mapped host descriptors allocated by the constructor, then destroys
// the stream, event and pinned flag. The CUDA context lock is held throughout.
PxgCudaBroadPhaseSap::~PxgCudaBroadPhaseSap()
{
	PxScopedCudaLock _lock_(*mCudaContextManager);

	// Same release order as the allocation order in the constructor.
	void* const mappedBlocks[] = { mBpDesc, mRSDesc, mRSDescWOR };
	for (PxU32 blockIdx = 0; blockIdx < 3; ++blockIdx)
		mHeapMemoryManager->mMappedMemoryAllocators->deallocate(mappedBlocks[blockIdx]);

	releaseGpuStreamsAndEvents();
}
// Destroys this broad phase in place and then returns its storage via PX_FREE_THIS.
// The object must not be touched after this call.
void PxgCudaBroadPhaseSap::release()
{
// Run the destructor explicitly first; the memory itself is released by the macro below.
this->~PxgCudaBroadPhaseSap();
PX_FREE_THIS;
}
// Creates the GPU resources used by every broad-phase update:
//  - mStream:      non-blocking stream created with the most urgent priority the context reports,
//  - mEvent:       event created with timing disabled,
//  - mPinnedEvent: one PxU32 of pinned host memory, polled by gpuDMABack as a completion flag.
// Stream/event creation failures are reported through outputError; the function does not abort.
void PxgCudaBroadPhaseSap::createGpuStreamsAndEvents()
{
	// Start from the default priority (0) so that a failed query cannot leave an
	// indeterminate value to be passed to streamCreateWithPriority.
	int leastPriority = 0, mostPriority = 0;
	if (cuCtxGetStreamPriorityRange(&leastPriority, &mostPriority) != CUDA_SUCCESS)
	{
		// The outputs are unspecified on failure: fall back to the default priority.
		leastPriority = 0;
		mostPriority = 0;
	}

	CUresult result = mCudaContext->streamCreateWithPriority(&mStream, CU_STREAM_NON_BLOCKING, mostPriority);
	if (result != CUDA_SUCCESS)
		outputError<PxErrorCode::eINTERNAL_ERROR>(__LINE__, "GPU Create Stream 0 fail!!\n");

	// Check the event creation right next to the call it belongs to.
	result = mCudaContext->eventCreate(&mEvent, CU_EVENT_DISABLE_TIMING);
	if (result != CUDA_SUCCESS)
		outputError<PxErrorCode::eINTERNAL_ERROR>(__LINE__, "GPU Create Event 0 fail!!\n");

	mPinnedEvent = PX_PINNED_MEMORY_ALLOC(PxU32, *mCudaContextManager, 1);

	// Freshly allocated pinned memory has indeterminate content; give the completion
	// flag a defined "not signalled" value before anything polls it.
	if (mPinnedEvent)
		*mPinnedEvent = 0;
}
// Tears down what createGpuStreamsAndEvents set up: the stream, the pinned
// completion flag and the event. The stream and event handles are cleared afterwards.
void PxgCudaBroadPhaseSap::releaseGpuStreamsAndEvents()
{
	// Stream first...
	mCudaContext->streamDestroy(mStream);
	mStream = NULL;

	// ...then the pinned host word used as the completion flag...
	PX_PINNED_MEMORY_FREE(*mCudaContextManager, mPinnedEvent);

	// ...and finally the event.
	mCudaContext->eventDestroy(mEvent);
	mEvent = NULL;
}
// Prepares the GPU for one broad-phase update.
//
// Reads the created/removed handle counts and box capacity from updateData, updates
// mNumOfBoxes, (re)sizes every working buffer, refreshes the host descriptors through
// updateDescriptor/updateRadixSortDesc, and finally enqueues the host-to-device copies of
// the handle lists and the descriptors on mStream.
//
// updateData   source of the created/removed handle arrays and of the box capacity.
// bpDesc       host broad-phase descriptor; filled by updateDescriptor, then copied to mBPDescBuf.
// rsDescs      host radix-sort descriptors (six entries are copied to mRadixSortDescBuf).
//
// All copies are asynchronous on mStream; outside GPU_BP_DEBUG nothing here waits for them.
void PxgCudaBroadPhaseSap::gpuDMAUp(const Bp::BroadPhaseUpdateData& updateData, PxgBroadPhaseDesc& bpDesc, PxgRadixSortDesc* rsDescs)
{
PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.gpuDMAUp", mContextID);
//mCudaContext->memcpyHtoDAsync(constraintsPerPartitiond, constraintsPerPartitionIter.begin(), sizeof(PxU32) * numConstraintsPerPartition, mStream);
mUpdateData_RemovedHandleSize = updateData.getNumRemovedHandles();
mUpdateData_CreatedHandleSize = updateData.getNumCreatedHandles();
#ifdef SUPPORT_UPDATE_HANDLES_ARRAY_FOR_GPU
mUpdateData_UpdatedHandleSize = updateData.getNumUpdatedHandles();
#endif
//mContactDistances = updateData.getContactDistance();
//mBoxBoundsMinMax = updateData.getAABBs();
//mBoxGroups = updateData.getGroups();
mUpdateData_BoxesCapacity = updateData.getCapacity();
// Box count after this update: previous count plus created minus removed.
mNumOfBoxes = mNumOfBoxes + mUpdateData_CreatedHandleSize - mUpdateData_RemovedHandleSize;
//We need to add on removedHandleSize because these handles are temporarily also in the projection buffer
// Two projections per box.
const PxU32 nbProjections = (mNumOfBoxes + mUpdateData_RemovedHandleSize) * 2;
// Round up to a multiple of 4.
const PxU32 paddedProjections = (nbProjections + 3)&(~3);
mOldIntegerBoundsBuf.allocateCopyOldDataAsync(mUpdateData_BoxesCapacity * sizeof(PxgIntegerAABB), mCudaContext, mStream, PX_FL);
//we need to allocate enough memory (x4) for the radix sort because each thread read 4 elements
// Buffers indexed by i in [0,3).
// NOTE(review): presumably one slot per axis (X/Y/Z) - confirm.
// The allocateCopyOldDataAsync calls take the context and stream; the name suggests existing
// content is carried over on resize, while plain allocate() presumably is not - confirm against the buffer class.
for (PxU32 i = 0; i < 3; ++i)
{
mBoxSapBox1DBuf[i].allocateCopyOldDataAsync(mUpdateData_BoxesCapacity * sizeof(PxgSapBox1D), mCudaContext, mStream, PX_FL);
mNewBoxSapBox1DBuf[i].allocate(mUpdateData_BoxesCapacity * sizeof(PxgSapBox1D), PX_FL);
mBoxPtProjectionsBuf[i].allocateCopyOldDataAsync(paddedProjections * sizeof(int), mCudaContext, mStream, PX_FL);
mBoxProjectionRanksBuf[i].allocate(paddedProjections * sizeof(int), PX_FL);
mTempBoxPtProjectionBuf[i].allocate(paddedProjections * sizeof(int), PX_FL);
mTempBoxPtHandlesBuf[i].allocate(paddedProjections * sizeof(int), PX_FL);
// Two sets (j) of three buffers (i), flattened as index = j * 3 + i.
for (PxU32 j = 0; j < 2; ++j)
{
//mEndPtHistogramBuf[j][i].allocateCopyOldDataAsync(nbProjections * sizeof(int), mStreams.begin(), mHeapMemoryManager);
//mBlockEndPtHistogramBuf[j][i].allocateCopyOldDataAsync(PxgBPKernelGridDim::BP_OUTPUT_ENDPT_HISTOGRAM * sizeof(int), mStreams.begin(), mHeapMemoryManager); // 32 block
//mEndPtHandleBuf[j][i].allocateCopyOldDataAsync(mNumOfBoxes*sizeof(int), mStreams.begin(), mHeapMemoryManager);
//mStartPtHistogramBuf[j][i].allocateCopyOldDataAsync(nbProjections * sizeof(int), mStreams.begin(), mHeapMemoryManager);
//mBlockStartPtHistogramBuf[j][i].allocateCopyOldDataAsync(PxgBPKernelGridDim::BP_OUTPUT_ENDPT_HISTOGRAM * sizeof(int), mStreams.begin(), mHeapMemoryManager); // 32 block
//mStartPtHandleBuf[j][i].allocateCopyOldDataAsync(mNumOfBoxes*sizeof(int), mStreams.begin(), mHeapMemoryManager);
const PxU32 index = j * 3 + i;
mBoxPtHandlesBuf[index].allocateCopyOldDataAsync(paddedProjections * sizeof(int), mCudaContext, mStream, PX_FL);
mEndPtHistogramBuf[index].allocateCopyOldDataAsync(nbProjections * sizeof(int), mCudaContext, mStream, PX_FL);
mBlockEndPtHistogramBuf[index].allocateCopyOldDataAsync(PxgBPKernelGridDim::BP_OUTPUT_ENDPT_HISTOGRAM * sizeof(int), mCudaContext, mStream, PX_FL); // 32 block
mEndPtHandleBuf[index].allocateCopyOldDataAsync(mNumOfBoxes*sizeof(int), mCudaContext, mStream, PX_FL);
mStartPtHistogramBuf[index].allocateCopyOldDataAsync(nbProjections * sizeof(int), mCudaContext, mStream, PX_FL);
mBlockStartPtHistogramBuf[index].allocateCopyOldDataAsync(PxgBPKernelGridDim::BP_OUTPUT_ENDPT_HISTOGRAM * sizeof(int), mCudaContext, mStream, PX_FL); // 32 block
mStartPtHandleBuf[index].allocateCopyOldDataAsync(mNumOfBoxes*sizeof(int), mCudaContext, mStream, PX_FL);
}
mIncrementalComparisons[i].allocate(nbProjections* sizeof(int), PX_FL);
mIncrementalBlockComparisons[i].allocate(PxgBPKernelGridDim::BP_COMPUTE_INCREMENTAL_CMP_COUNTS1 * sizeof(int), PX_FL);
mTotalEndPtHistogramBuf[i].allocate(nbProjections*sizeof(PxU32), PX_FL);
mBlockTotalEndPtHistogramBuf[i].allocate(PxgBPKernelGridDim::BP_OUTPUT_ENDPT_HISTOGRAM*sizeof(PxU32), PX_FL);
}
// Two report-block buffers each for aggregate and actor pairs, 32 PxU32 entries apiece.
for (PxU32 i = 0; i < 2; ++i)
{
mAggregateReportBlockBuf[i].allocate(32 * sizeof(PxU32), PX_FL);
mActorReportBlockBuf[i].allocate(32 * sizeof(PxU32), PX_FL);
}
//each thread read 4 elements so we need to allocate enough memory for it
// 64 entries per projection, padded to a multiple of 4.
// NOTE(review): 64 is presumably the number of regions - confirm.
const PxU32 totalNbProjectionRegions = (nbProjections * 64 + 3)&(~3);
mActiveRegionTotalBuf.allocate(totalNbProjectionRegions * sizeof(int), PX_FL);
mStartRegionsTotalBuf.allocate(totalNbProjectionRegions * sizeof(int), PX_FL);
mOrderedActiveRegionHandlesTotalBuf.allocate(totalNbProjectionRegions * sizeof(int), PX_FL);
mOrderedStartRegionHandlesTotalBuf.allocate(totalNbProjectionRegions * sizeof(int), PX_FL);
mOverlapChecksRegionBuf.allocate(64 * mNumOfBoxes * sizeof(regionOverlapType), PX_FL);
mBlockOverlapChecksRegionBuf.allocate(PxgBPKernelGridDim::BP_OUTPUT_OVERLAPCHECKS_HISTOGRAM * sizeof(regionOverlapType), PX_FL);
mOverlapChecksHandleRegionBuf.allocate(64 * mNumOfBoxes * sizeof(PxgHandleRegion), PX_FL);
mRegionRangeBuf.allocate(mUpdateData_BoxesCapacity * sizeof(PxgIntegerRegion), PX_FL);
mStartRegionAccumBuf.allocate(nbProjections * sizeof(int), PX_FL);
mBlockStartRegionAccumBuf.allocate(PxgBPKernelGridDim::BP_OUTPUT_START_REGION_HISTOGRAM * sizeof(int), PX_FL);
mRegionAccumBuf.allocate(nbProjections * sizeof(int), PX_FL);
mBlockRegionAccumBuf.allocate(PxgBPKernelGridDim::BP_OUTPUT_REGION_HISTOGRAM * sizeof(int), PX_FL);
//allocate enough memory for GPU. All this data is input from the AABB manager this frame
mCreatedHandlesBuf.allocate(mUpdateData_CreatedHandleSize * sizeof(PxU32), PX_FL);
mRemovedHandlesBuf.allocate(mUpdateData_RemovedHandleSize * sizeof(PxU32), PX_FL);
#ifdef SUPPORT_UPDATE_HANDLES_ARRAY_FOR_GPU
mUpdatedHandlesBuf.allocate(mUpdateData_UpdatedHandleSize * sizeof(PxU32), PX_FL);
#endif
mNewIntegerBoundsBuf.allocate(mUpdateData_BoxesCapacity * sizeof(PxgIntegerAABB), PX_FL);
//mBoxFpBoundsBuf and mBoxContactDistancesBuf need to be allocated before particle updateBound kernel
//we move the allocation to gpuDmaUpSharedData. Particle system don't need mBoxGroupsBuf. However, if
//we dma up those three buffers based on the state changed so it will make sense to group those buffer together
//mBoxFpBoundsBuf.allocate(mBoxesCapacity * sizeof(PxBounds3));
//mBoxContactDistancesBuf.allocate(mBoxesCapacity * sizeof(PxReal));
//mBoxGroupsBuf.allocate(mBoxesCapacity * sizeof(PxU32));
// Refresh the host descriptors only now that every buffer has been (re)allocated.
// NOTE(review): presumably they capture the device pointers obtained above - confirm in updateDescriptor.
updateDescriptor(bpDesc);
updateRadixSortDesc(rsDescs);
//DMA the update data to GPU
mCudaContext->memcpyHtoDAsync(mCreatedHandlesBuf.getDevicePtr(), updateData.getCreatedHandles(), sizeof(int) * mUpdateData_CreatedHandleSize, mStream);
mCudaContext->memcpyHtoDAsync(mRemovedHandlesBuf.getDevicePtr(), updateData.getRemovedHandles(), sizeof(int) * mUpdateData_RemovedHandleSize, mStream);
#ifdef SUPPORT_UPDATE_HANDLES_ARRAY_FOR_GPU
mCudaContext->memcpyHtoDAsync(mUpdatedHandlesBuf.getDevicePtr(), updateData.getUpdatedHandles(), sizeof(int) * mUpdateData_UpdatedHandleSize, mStream);
#endif
/*if(updateData.getStateChanged())
{
mCudaContext->memcpyHtoDAsync(mBoxContactDistancesBuf.getDevicePtr(), mContactDistances, sizeof(PxReal)* mBoxesCapacity, mStream);
mCudaContext->memcpyHtoDAsync(mBoxGroupsBuf.getDevicePtr(), mBoxGroups, sizeof(PxU32)* mBoxesCapacity, mStream);
mCudaContext->memcpyHtoDAsync(mBoxFpBoundsBuf.getDevicePtr(), mBoxBoundsMinMax, sizeof(PxBounds3)* mBoxesCapacity, mStream);
}*/
// Upload the descriptors. The "without ranks" radix-sort descriptors come from the
// member mRSDescWOR rather than from the rsDescs argument.
mCudaContext->memcpyHtoDAsync(mBPDescBuf.getDevicePtr(), (void*)&bpDesc, sizeof(PxgBroadPhaseDesc), mStream);
mCudaContext->memcpyHtoDAsync(mRadixSortDescBuf.getDevicePtr(), rsDescs, sizeof(PxgRadixSortDesc)*6, mStream);
mCudaContext->memcpyHtoDAsync(mRadixSortWORDescBuf.getDevicePtr(), mRSDescWOR, sizeof(PxgRadixSortDesc) * 6, mStream);
/*PxCudaStreamFlush(mStreams.begin());*/
#if GPU_BP_DEBUG
// Debug builds only: wait for the uploads and read the descriptor back for inspection.
GPU_DEBUG_STREAM(mStream, "GPU radix sort fail!!\n")
mCudaContext->memcpyDtoH((void*)&bpDesc, mBPDescBuf.getDevicePtr(), sizeof(PxgBroadPhaseDesc));
#endif
}
// Empties the host-side found/lost actor pair arrays. Only the logical size is
// reset to zero; no call here releases the underlying storage.
void PxgCudaBroadPhaseSap::freeBuffers()
{
	mFoundActorPairs.forceSize_Unsafe(0);
	mLostActorPairs.forceSize_Unsafe(0);
}
// Enqueues the three report kernels on mStream, in order: accumulate stage 1,
// accumulate stage 2, then copy reports. Each kernel receives the device address of
// the broad-phase descriptor as its only argument. The host descriptor parameter is unused.
void PxgCudaBroadPhaseSap::runCopyResultsKernel(PxgBroadPhaseDesc& /*desc*/)
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.runCopyResultsKernel", mContextID);

	CUdeviceptr descDevPtr = mBPDescBuf.getDevicePtr();

	// All three launches share the same single-entry parameter list.
	KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(descDevPtr) };

	// Accumulation stages run on a (grid, 4, 1) launch.
	_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_ACCUMULATE_REPORT_STAGE_1,
		PxgBPKernelGridDim::BP_COMPUTE_INCREMENTAL_CMP_COUNTS1, 4, 1,
		PxgBPKernelBlockDim::BP_COMPUTE_INCREMENTAL_CMP_COUNTS1, 1, 1, 0, EPILOG);

	_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_ACCUMULATE_REPORT_STAGE_2,
		PxgBPKernelGridDim::BP_COMPUTE_INCREMENTAL_CMP_COUNTS2, 4, 1,
		PxgBPKernelBlockDim::BP_COMPUTE_INCREMENTAL_CMP_COUNTS2, 1, 1, 0, EPILOG);

	// Final stage: copy the reports.
	_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_COPY_REPORTS,
		PxgBPKernelGridDim::BP_COPY_REPORTS, 1, 1,
		PxgBPKernelBlockDim::BP_COPY_REPORTS, 1, 1, 0, EPILOG);
}
// Reads the broad-phase results back to the host and publishes the pair counts.
//
// Steps:
//  1. enqueue an async device-to-host copy of the descriptor into desc,
//  2. enqueue the BP_SIGNAL_COMPLETE kernel, which is handed the device address of the
//     pinned flag mPinnedEvent, and flush the stream,
//  3. wait on the flag, falling back to a full stream synchronize when spinWait reports failure,
//  4. copy the region totals out of desc and size mFoundActorPairs / mLostActorPairs.
//
// desc   destination of the descriptor copy. It is taken by const reference but is
//        written through a cast; it must stay valid until this function returns.
//
// An overflow of the found/lost pair capacity is reported through the foundation error
// callback. In abort mode both pair arrays are forced empty.
void PxgCudaBroadPhaseSap::gpuDMABack(const PxgBroadPhaseDesc& desc)
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.gpuDMABack", mContextID);
	//mCudaContext->eventRecord(mEvent, mStream);
	//PxCudaStreamFlush(mStreams.begin()); //KS - dispatch work!
	{
		CUdeviceptr bpBuff = mBPDescBuf.getDevicePtr();
		mCudaContext->memcpyDtoHAsync((void*)&desc, bpBuff, sizeof(PxgBroadPhaseDesc), mStream);
		//resultR = mCudaContext->streamSynchronize(mStream);

		// Re-arm the completion flag before asking the GPU to raise it. Without this the
		// flag keeps whatever value it had (uninitialised on the first call, the previous
		// frame's signal afterwards), so the wait below could return before the descriptor
		// copy above has landed.
		// NOTE(review): assumes spinWait polls until the flag is non-zero - confirm in PxgCudaUtils.h.
		*mPinnedEvent = 0;

		void* devicePtr = getMappedDevicePtr(mCudaContext, mPinnedEvent);
		KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(devicePtr) };
		_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_SIGNAL_COMPLETE, 1, 1, 1, 1, 1, 1, 0, EPILOG);
		mCudaContext->streamFlush(mStream);
	}
	{
		PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.Synchronize", mContextID);
		//mCudaContext->streamSynchronize(mStream);
		// Read the flag through a volatile pointer so every poll goes to memory.
		volatile PxU32* eventPtr = mPinnedEvent;
		// If the spin wait gives up, block on the stream instead.
		if (!spinWait(*eventPtr, 0.1f))
			mCudaContext->streamSynchronize(mStream);
	}
	mOverlapChecksTotalRegion = desc.overlapChecksTotalRegion;
	mStartRegionAccumTotal = desc.startRegionAccumTotal;
	mRegionAccumTotal = desc.regionAccumTotal;
	// AD: some explanation about the counts here - just to reiterate:
	//
	// internally we have two lists, found pairs and lost pairs. Both contain actor-actor, actor-aggregate and
	// aggregate-aggregate pairs. In the "report" phase, these two lists are built such that all the pairs involving
	// aggregates are first, and actor pairs are after that.
	//
	// desc.sharedFound/LostPairIndex is the total size of the lists.
	// desc.sharedFound/LostAggPairIndex is the number of aggregates in the list.
	//
	// but it gets more complicated. These counts overflow, but internally we only write until the maxLostFoundPairs
	// value to make sure we're not going out of bounds. So the total value in the descriptor is only really useful
	// if we are below the max, otherwise we need to correct.
	//
	// so in the end, the final number of pairs is PxMin(mMaxLostFoundPairs, desc.sharedFoundPairIndex) - desc.sharedFoundAggPairIndex.
	// This works because the aggregate index is always smaller than the max index.
	PX_ASSERT(desc.sharedFoundPairIndex >= desc.sharedFoundAggPairIndex);
	PX_ASSERT(desc.sharedLostPairIndex >= desc.sharedLostAggPairIndex);
	PxU32 foundLostPairsNeeded = PxMax(desc.sharedFoundPairIndex, desc.sharedLostPairIndex);
#if PX_ENABLE_SIM_STATS
	mFoundLostPairsStats = PxMax(mFoundLostPairsStats, foundLostPairsNeeded);
#else
	PX_CATCH_UNDEFINED_ENABLE_SIM_STATS
#endif
	if (desc.found_lost_pairs_overflow_flags)
	{
		// foundLostPairsNeeded is unsigned, so it is printed with %u.
		PxGetFoundation().error(PxErrorCode::eINVALID_PARAMETER, PX_FL,
			"The application needs to increase PxGpuDynamicsMemoryConfig::foundLostPairsCapacity to %u, otherwise, the simulation will miss interactions\n", foundLostPairsNeeded);
	}
	mFoundActorPairs.forceSize_Unsafe(PxMin(mMaxFoundLostPairs, desc.sharedFoundPairIndex) - desc.sharedFoundAggPairIndex);
	mLostActorPairs.forceSize_Unsafe(PxMin(mMaxFoundLostPairs, desc.sharedLostPairIndex) - desc.sharedLostAggPairIndex);
	// AD: safety in case copyReports did not run due to abort mode
	if (mCudaContext->isInAbortMode())
	{
		mFoundActorPairs.forceSize_Unsafe(0);
		mLostActorPairs.forceSize_Unsafe(0);
	}
}
// Ordering predicate for PxgBroadPhasePair: descending by mVolA, ties broken by
// descending mVolB. Used by sortBuffer so that identical pairs end up adjacent.
struct ReportMore
{
	bool operator()(const PxgBroadPhasePair& left, const PxgBroadPhasePair& right) const
	{
		// Primary key decides whenever it differs.
		if (left.mVolA != right.mVolA)
			return left.mVolA > right.mVolA;

		// Equal primary keys: fall back to the secondary key.
		return left.mVolB > right.mVolB;
	}
};
/*bool hasDuplicates(PxPinnedArray<PxgBroadPhasePair>& iterator)
{
for(PxU32 a = 1; a < iterator.size(); ++a)
{
PX_ASSERT(iterator[a].mVolA != iterator[a-1].mVolA ||
iterator[a].mVolB != iterator[a-1].mVolB);
}
return false;
}*/
// Sorts size pairs of reportBuffer in place using the ReportMore ordering
// (descending mVolA, then descending mVolB).
//
// reportBuffer   pairs to sort; modified in place.
// size           number of pairs in reportBuffer.
//
// Only the "#if 1" branch is compiled. The "#else" branch is a disabled two-pass
// histogram sort kept for reference; it uses members (mHistogramBuffer, mTempPairBuffer,
// mBoxesCapacity) that are not declared in the visible part of this file.
void PxgCudaBroadPhaseSap::sortBuffer(PxgBroadPhasePair* PX_RESTRICT reportBuffer, const PxU32 size)
{
PX_PROFILE_ZONE("PxgCudaBroadPhaseSap::sortBuffer", mContextID);
#if 1
PxSort(reportBuffer, size, ReportMore());
#else
// Disabled alternative: comparison sort for small inputs, histogram sort otherwise.
const PxU32 SmallBufferLimit = 512;
if (size < SmallBufferLimit)
PxSort(reportBuffer, size, ReportMore());
else
{
//Histogram sort...
mHistogramBuffer.forceSize_Unsafe(0);
mHistogramBuffer.reserve(mBoxesCapacity);
mHistogramBuffer.forceSize_Unsafe(mBoxesCapacity);
PxMemZero(mHistogramBuffer.begin(), sizeof(PxU32) * mBoxesCapacity);
mTempPairBuffer.forceSize_Unsafe(0);
mTempPairBuffer.reserve(size);
mTempPairBuffer.forceSize_Unsafe(size);
// Pass 1: count occurrences of each mVolA.
for (PxU32 a = 0; a < size; ++a)
{
++mHistogramBuffer[reportBuffer[a].mVolA];
}
//Compute runsum
PxU32 runsum = 0;
PxU32 boxesCapacity = mBoxesCapacity;
//for (PxU32 a = 0; a < mBoxesCapacity; ++a)
// Walk from the highest index down so larger keys receive the smaller offsets.
while(boxesCapacity--)
{
PxU32 value = mHistogramBuffer[boxesCapacity];
mHistogramBuffer[boxesCapacity] = runsum;
runsum += value;
}
// Scatter into the temporary buffer by mVolA.
for (PxU32 a = 0; a < size; ++a)
{
PxU32 idx = mHistogramBuffer[reportBuffer[a].mVolA]++;
mTempPairBuffer[idx] = reportBuffer[a];
}
// Pass 2: same procedure keyed on mVolB, scattering back into reportBuffer.
PxMemZero(mHistogramBuffer.begin(), sizeof(PxU32) * mBoxesCapacity);
for (PxU32 a = 0; a < size; ++a)
{
++mHistogramBuffer[reportBuffer[a].mVolB];
}
runsum = 0;
boxesCapacity = mBoxesCapacity;
//for (PxU32 a = 0; a < mBoxesCapacity; ++a)
while(boxesCapacity--)
{
PxU32 value = mHistogramBuffer[boxesCapacity];
mHistogramBuffer[boxesCapacity] = runsum;
runsum += value;
}
for (PxU32 a = 0; a < size; ++a)
{
PxU32 idx = mHistogramBuffer[mTempPairBuffer[a].mVolB]++;
reportBuffer[idx] = mTempPairBuffer[a];
}
}
#endif
}
// Removes duplicate (mVolA, mVolB) entries from pairs.
//
// The array is sorted first so that equal pairs become adjacent, then compacted in
// place: each run of equal pairs keeps its first element. The array is finally shrunk
// to the number of unique pairs. An empty array is sorted (a no-op) and otherwise left alone.
void PxgCudaBroadPhaseSap::purgeDuplicates(PxPinnedArray<PxgBroadPhasePair>& pairs)
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.purgeDuplicates", mContextID);

	const PxU32 total = pairs.size();
	sortBuffer(pairs.begin(), total);

	if (total == 0)
		return;

	// Keys of the most recently kept pair.
	PxU32 lastA = pairs[0].mVolA;
	PxU32 lastB = pairs[0].mVolB;

	// Next slot to write a unique pair into; slot 0 is always kept.
	PxU32 writeIdx = 1;

	for (PxU32 readIdx = 1; readIdx < total; ++readIdx)
	{
		const PxU32 curA = pairs[readIdx].mVolA;
		const PxU32 curB = pairs[readIdx].mVolB;

		// Same as the last kept pair: drop it.
		if (curA == lastA && curB == lastB)
			continue;

		// Only move data once a gap has actually opened up.
		if (writeIdx != readIdx)
		{
			pairs[writeIdx].mVolA = curA;
			pairs[writeIdx].mVolB = curB;
		}

		lastA = curA;
		lastB = curB;
		++writeIdx;
	}

	pairs.forceSize_Unsafe(writeIdx);
}
// Sorts the found actor pairs and removes duplicates (see purgeDuplicates).
void PxgCudaBroadPhaseSap::purgeDuplicateFoundPairs()
{
purgeDuplicates(mFoundActorPairs);
}
// Sorts the lost actor pairs and removes duplicates (see purgeDuplicates).
void PxgCudaBroadPhaseSap::purgeDuplicateLostPairs()
{
purgeDuplicates(mLostActorPairs);
}
void PxgCudaBroadPhaseSap::runRadixSort(const PxU32 numOfKeys, CUdeviceptr radixSortDescBuf)
{
PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.runRadixSort", mContextID);
PxU32 startBit = 0;
const PxU32 numPass = 8;
for(PxU32 i=0; i<numPass; ++i)
{
const PxU32 descIndex = (i & 1)*3;
CUdeviceptr rsDesc = radixSortDescBuf + descIndex*sizeof(PxgRadixSortDesc);
KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(rsDesc), CUDA_KERNEL_PARAM(numOfKeys), CUDA_KERNEL_PARAM(startBit) };
_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::RS_MULTIBLOCK_COUNT, PxgRadixSortKernelGridDim::RADIX_SORT, 3, 1, PxgRadixSortKernelBlockDim::RADIX_SORT, 1, 1, 0, EPILOG);
_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::RS_CALCULATERANKS_MULTIBLOCK_COUNT, PxgRadixSortKernelGridDim::RADIX_SORT, 3, 1, PxgRadixSortKernelBlockDim::RADIX_SORT, 1, 1, 0, EPILOG);
startBit+=4;
}
GPU_DEBUG_STREAM(mStream, "GPU radix sort fail!!\n")
}
// Sorts projections and handles "with ranks": initializes the ranks, runs the radix
// sort using mRadixSortDescBuf, then runs the handle-update kernel.
//
// previousNumOfBoxes   number of boxes to sort; nothing is launched when it is zero.
void PxgCudaBroadPhaseSap::sortProjectionAndHandlesWRKernel(PxU32 previousNumOfBoxes)
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.sortProjectionAndHandlesWRKernel", mContextID);

	if (previousNumOfBoxes == 0)
		return;

	// Two projections per box, rounded up to a multiple of 4.
	const PxU32 paddedKeyCount = (previousNumOfBoxes * 2 + 3) & (~3);

	CUdeviceptr descDevPtr = mBPDescBuf.getDevicePtr();
	KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(descDevPtr) };

	_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_INITIALIZE_RANKS,
		PxgBPKernelGridDim::BP_INITIALIZE_RANKS, 1, 1,
		PxgBPKernelBlockDim::BP_INITIALIZE_RANKS, 1, 1, 0, EPILOG);

	runRadixSort(paddedKeyCount, mRadixSortDescBuf.getDevicePtr());

	_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_UDPATE_HANDLES,
		PxgBPKernelGridDim::BP_UDPATE_HANDLES, 1, 1,
		PxgBPKernelBlockDim::BP_UDPATE_HANDLES, 1, 1, 0, EPILOG);
}
//sort projections and handles without ranks
// Runs the radix sort that uses the mRadixSortWORDescBuf descriptors, over the
// previous boxes plus the handles created this update.
//
// previousNumOfBoxes   box count before this update; mUpdateData_CreatedHandleSize is added to it.
//                      Nothing is launched when the sum is zero.
void PxgCudaBroadPhaseSap::sortProjectionAndHandlesWORKernel(PxU32 previousNumOfBoxes)
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.sortProjectionAndHandlesWORKernel", mContextID);

	const PxU32 handleCount = previousNumOfBoxes + mUpdateData_CreatedHandleSize;
	if (!handleCount)
		return;

	// Two projections per handle, rounded up to a multiple of 4.
	const PxU32 paddedKeyCount = (handleCount * 2 + 3) & (~3);

	runRadixSort(paddedKeyCount, mRadixSortWORDescBuf.getDevicePtr());

	GPU_DEBUG_STREAM(mStream, "GPU radix sort fail!!\n")
}
// PT:
// In:
// bpDesc->boxHandles
// Out:
// bpDesc->boxNewSapBox1D or bpDesc->boxSapBox1D
void PxgCudaBroadPhaseSap::initializeSapBoxKernel(const PxU32 numHandles, bool isNew)
{
PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.initializeSapBoxKernel", mContextID);
const PxU32 nbBlocks = ((numHandles*2) + PxgBPKernelBlockDim::BP_INITIALIZE_SAPBOX-1)/ PxgBPKernelBlockDim::BP_INITIALIZE_SAPBOX;
if(nbBlocks)
{
CUdeviceptr bpDescd = mBPDescBuf.getDevicePtr();
KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(bpDescd), CUDA_KERNEL_PARAM(numHandles), CUDA_KERNEL_PARAM(isNew) };
_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_INITIALIZE_SAPBOX, nbBlocks, 1, 1, PxgBPKernelBlockDim::BP_INITIALIZE_SAPBOX, 1, 1, 0, EPILOG);
}
}
void PxgCudaBroadPhaseSap::translateAABBsKernel()
{
PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.translateAABBsKernel", mContextID);
if(mUpdateData_BoxesCapacity == 0)
return;
const PxBounds3* updateData_fpBounds = reinterpret_cast<PxBounds3*>(mBoxFpBoundsBuf.getDevicePtr());
PxgIntegerAABB* newIntegerBounds = reinterpret_cast<PxgIntegerAABB*>(mNewIntegerBoundsBuf.getDevicePtr());
const PxReal* updateData_contactDistances = reinterpret_cast<PxReal*>(mBoxContactDistancesBuf.getDevicePtr());
const PxU32* updateData_envIDs = reinterpret_cast<PxU32*>(mBoxEnvIDsBuf.getDevicePtr());
KERNEL_PARAM_TYPE kernelParams[] = {
CUDA_KERNEL_PARAM(updateData_fpBounds),
CUDA_KERNEL_PARAM(newIntegerBounds),
CUDA_KERNEL_PARAM(updateData_contactDistances),
CUDA_KERNEL_PARAM(updateData_envIDs),
CUDA_KERNEL_PARAM(mUpdateData_BoxesCapacity),
CUDA_KERNEL_PARAM(mDesc.gpuBroadPhaseNbBitsShiftX),
CUDA_KERNEL_PARAM(mDesc.gpuBroadPhaseNbBitsShiftY),
CUDA_KERNEL_PARAM(mDesc.gpuBroadPhaseNbBitsShiftZ),
CUDA_KERNEL_PARAM(mDesc.gpuBroadPhaseNbBitsEnvIDX),
CUDA_KERNEL_PARAM(mDesc.gpuBroadPhaseNbBitsEnvIDY),
CUDA_KERNEL_PARAM(mDesc.gpuBroadPhaseNbBitsEnvIDZ)
};
const PxU32 aabbsPerBlock = PxgBPKernelBlockDim::BP_TRANSLATE_AABBS/8;
const PxU32 nbBlocks = (mUpdateData_BoxesCapacity + aabbsPerBlock-1)/aabbsPerBlock; // PT: do we really need mBoxesCapacity here?
_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_TRANSLATE_AABBS, nbBlocks, 1, 1, PxgBPKernelBlockDim::BP_TRANSLATE_AABBS, 1, 1, 0, EPILOG);
}
// Launches BP_MARK_DELETEDPAIRS when at least one handle was removed this update.
// The launch uses the grid/block dimensions named BP_UPDATE_DELETEDPAIRS.
void PxgCudaBroadPhaseSap::markRemovedPairsKernel()
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.markRemovedPairsKernel", mContextID);

	if (mUpdateData_RemovedHandleSize == 0)
		return;

	CUdeviceptr descDevPtr = mBPDescBuf.getDevicePtr();
	KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(descDevPtr) };

	_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_MARK_DELETEDPAIRS,
		PxgBPKernelGridDim::BP_UPDATE_DELETEDPAIRS, 1, 1,
		PxgBPKernelBlockDim::BP_UPDATE_DELETEDPAIRS, 1, 1, 0, EPILOG);
}
// Launches BP_UPDATE_DELETEDPAIRS when at least one handle was removed this update.
void PxgCudaBroadPhaseSap::markRemovedPairsProjectionsKernel()
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.markRemovedPairsProjectionsKernel", mContextID);

	if (mUpdateData_RemovedHandleSize == 0)
		return;

	CUdeviceptr descDevPtr = mBPDescBuf.getDevicePtr();
	KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(descDevPtr) };

	_launch<GPU_BP_DEBUG>(PROLOG, PxgKernelIds::BP_UPDATE_DELETEDPAIRS,
		PxgBPKernelGridDim::BP_UPDATE_DELETEDPAIRS, 1, 1,
		PxgBPKernelBlockDim::BP_UPDATE_DELETEDPAIRS, 1, 1, 0, EPILOG);
}
// Launches the kernel that handles updated pairs. There is no host-side size check: the launch is unconditional.
// When SUPPORT_UPDATE_HANDLES_ARRAY_FOR_GPU is defined and the host descriptor carries an updated-handles
// array, BP_UPDATE_UPDATEDPAIRS2 is launched; in every other case BP_UPDATE_UPDATEDPAIRS is launched.
void PxgCudaBroadPhaseSap::markUpdatedPairsKernel()
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.markUpdatedPairsKernel", mContextID);

	// PT: TODO: this used to be guarded by "if(mUpdatedHandleSize != 0)" - why was this removed?
	// ==> probably because the GPU code started reading from the AABB manager bitmap directly
	// (which created the "evil coupling" we found before)

	CUdeviceptr descDevicePtr = mBPDescBuf.getDevicePtr();

	// 'kernelParams' is not referenced explicitly below, so presumably the EPILOG macro expands to it:
	// keep this exact name, and keep it an array (confirm against the macro definition).
	KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(descDevicePtr) };

#ifdef SUPPORT_UPDATE_HANDLES_ARRAY_FOR_GPU
	// PT: we need a new kernel to use this as a standalone BP and break the coupling between this class and the GPU AABB manager
	if(mBpDesc->updateData_updatedHandles)
	{
		_launch<GPU_BP_DEBUG>(
			PROLOG,
			PxgKernelIds::BP_UPDATE_UPDATEDPAIRS2,
			PxgBPKernelGridDim::BP_UPDATE_UPDATEDPAIRS2, 1, 1,
			PxgBPKernelBlockDim::BP_UPDATE_UPDATEDPAIRS2, 1, 1,
			0,
			EPILOG);
		return;
	}
#endif

	// Default path: no explicit updated-handles array is available (or support for it is compiled out).
	_launch<GPU_BP_DEBUG>(
		PROLOG,
		PxgKernelIds::BP_UPDATE_UPDATEDPAIRS,
		PxgBPKernelGridDim::BP_UPDATE_UPDATEDPAIRS, 1, 1,
		PxgBPKernelBlockDim::BP_UPDATE_UPDATEDPAIRS, 1, 1,
		0,
		EPILOG);
}
// PT: descriptor fields involved in the BP_UPDATE_CREATEDPAIRS kernel.
// Inputs:
//	bpDesc->numCreatedHandles
//	bpDesc->numPreviousHandles
//	bpDesc->updateData_createdHandles
//	bpDesc->newIntegerBounds
// Outputs:
//	bpDesc->boxProjections		// copy of newIntegerBounds but split between X/Y/Z axes
//	bpDesc->boxHandles			// see createHandle(), will have link to CPU index & some flags
//	bpDesc->oldIntegerBounds	// Kernel will set old bounds of new objects to empty
//
// boxProjections & boxHandles are parallel arrays indexed by the GPU index
//
// Host side: the launch only happens when mUpdateData_CreatedHandleSize is non-zero.
void PxgCudaBroadPhaseSap::markCreatedPairsKernel()
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.markCreatedPairsKernel", mContextID);

	// No handle was created in this update: nothing to launch.
	if(!mUpdateData_CreatedHandleSize)
		return;

	// The only entry in the parameter array is the device address of the broad-phase descriptor buffer.
	CUdeviceptr descDevicePtr = mBPDescBuf.getDevicePtr();

	// 'kernelParams' is not referenced explicitly below, so presumably the EPILOG macro expands to it:
	// keep this exact name, and keep it an array (confirm against the macro definition).
	KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(descDevicePtr) };

	_launch<GPU_BP_DEBUG>(
		PROLOG,
		PxgKernelIds::BP_UPDATE_CREATEDPAIRS,
		PxgBPKernelGridDim::BP_UPDATE_CREATEDPAIRS, 1, 1,
		PxgBPKernelBlockDim::BP_UPDATE_CREATEDPAIRS, 1, 1,
		0,
		EPILOG);
}
// Runs the two end-point histogram kernels back to back, unconditionally:
// BP_COMPUTE_ENDPT_HISTOGRAM first, then BP_OUTPUT_ENDPT_HISTOGRAM, both with the same two parameters.
//
// isIncremental	forwarded unchanged to both kernels as their second parameter. update() passes true
//					for the call that follows sortProjectionAndHandlesWRKernel and false for the call
//					that follows sortProjectionAndHandlesWORKernel.
void PxgCudaBroadPhaseSap::calculateEndPtHistogramKernel(const bool isIncremental)
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.calculateEndPtHistogramKernel", mContextID);

	CUdeviceptr descDevicePtr = mBPDescBuf.getDevicePtr();

	// 'kernelParams' is not referenced explicitly below, so presumably the EPILOG macro expands to it:
	// keep this exact name, and keep it an array (confirm against the macro definition).
	KERNEL_PARAM_TYPE kernelParams[] =
	{
		CUDA_KERNEL_PARAM(descDevicePtr),
		CUDA_KERNEL_PARAM(isIncremental)
	};

	// Both launches use a grid whose second dimension is 3 - presumably one slice per axis (X/Y/Z); confirm in the kernels.
	_launch<GPU_BP_DEBUG>(
		PROLOG,
		PxgKernelIds::BP_COMPUTE_ENDPT_HISTOGRAM,
		PxgBPKernelGridDim::BP_COMPUTE_ENDPT_HISTOGRAM, 3, 1,
		PxgBPKernelBlockDim::BP_COMPUTE_ENDPT_HISTOGRAM, 1, 1,
		0,
		EPILOG);

	_launch<GPU_BP_DEBUG>(
		PROLOG,
		PxgKernelIds::BP_OUTPUT_ENDPT_HISTOGRAM,
		PxgBPKernelGridDim::BP_OUTPUT_ENDPT_HISTOGRAM, 3, 1,
		PxgBPKernelBlockDim::BP_OUTPUT_ENDPT_HISTOGRAM, 1, 1,
		0,
		EPILOG);
}
// Builds the region histograms used for newly created handles. Skipped when no handle was created.
// Steps, all issued in this order:
//	1) zero the active-region and start-region total buffers
//	2) BP_CREATE_REGIONS
//	3) BP_COMPUTE_START_REGION_HISTOGRAM / BP_OUTPUT_START_REGION_HISTOGRAM
//	4) BP_COMPUTE_REGION_HISTOGRAM / BP_OUTPUT_REGION_HISTOGRAM
void PxgCudaBroadPhaseSap::computeRegionHistogramKernel()
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.computeRegionHistogramKernel", mContextID);

	// No handle was created in this update: nothing to do.
	if(!mUpdateData_CreatedHandleSize)
		return;

	// Two projections per box; boxes removed this update are still counted.
	const PxU32 nbProjections = (mNumOfBoxes + mUpdateData_RemovedHandleSize) * 2;

	// 64 entries per projection, rounded up to the next multiple of 4.
	const PxU32 unroundedNbRegions = nbProjections*64;
	const PxU32 totalNbProjectionRegions = (unroundedNbRegions + 3)&(~3);

	//zero regions
	// (the count passed to the "D32" memset is presumably in 32-bit elements, not bytes - confirm against PxCudaContext)
	mCudaContext->memsetD32Async(mActiveRegionTotalBuf.getDevicePtr(), 0, totalNbProjectionRegions, mStream);
	mCudaContext->memsetD32Async(mStartRegionsTotalBuf.getDevicePtr(), 0, totalNbProjectionRegions, mStream);

	CUdeviceptr descDevicePtr = mBPDescBuf.getDevicePtr();

	// 'kernelParams' is not referenced explicitly below, so presumably the EPILOG macro expands to it:
	// keep this exact name, and keep it an array (confirm against the macro definition).
	KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(descDevicePtr) };

	//create regions
	_launch<GPU_BP_DEBUG>(
		PROLOG,
		PxgKernelIds::BP_CREATE_REGIONS,
		PxgBPKernelGridDim::BP_CREATE_REGIONS, 1, 1,
		PxgBPKernelBlockDim::BP_CREATE_REGIONS, 1, 1,
		0,
		EPILOG);

	//compute start region histogram inside a block
	_launch<GPU_BP_DEBUG>(
		PROLOG,
		PxgKernelIds::BP_COMPUTE_START_REGION_HISTOGRAM,
		PxgBPKernelGridDim::BP_COMPUTE_START_REGION_HISTOGRAM, 1, 1,
		PxgBPKernelBlockDim::BP_COMPUTE_START_REGION_HISTOGRAM, 1, 1,
		0,
		EPILOG);

	//compute start region histogram between blocks
	_launch<GPU_BP_DEBUG>(
		PROLOG,
		PxgKernelIds::BP_OUTPUT_START_REGION_HISTOGRAM,
		PxgBPKernelGridDim::BP_OUTPUT_START_REGION_HISTOGRAM, 1, 1,
		PxgBPKernelBlockDim::BP_OUTPUT_START_REGION_HISTOGRAM, 1, 1,
		0,
		EPILOG);

	// Same compute/output pair as above, this time for the (non-start) region histogram.
	_launch<GPU_BP_DEBUG>(
		PROLOG,
		PxgKernelIds::BP_COMPUTE_REGION_HISTOGRAM,
		PxgBPKernelGridDim::BP_COMPUTE_REGION_HISTOGRAM, 1, 1,
		PxgBPKernelBlockDim::BP_COMPUTE_REGION_HISTOGRAM, 1, 1,
		0,
		EPILOG);

	_launch<GPU_BP_DEBUG>(
		PROLOG,
		PxgKernelIds::BP_OUTPUT_REGION_HISTOGRAM,
		PxgBPKernelGridDim::BP_OUTPUT_REGION_HISTOGRAM, 1, 1,
		PxgBPKernelBlockDim::BP_OUTPUT_REGION_HISTOGRAM, 1, 1,
		0,
		EPILOG);
}
// Launches, in order, BP_WRITEOUT_ACTIVE_HISTOGRAM, BP_COMPUTE_ACTIVE_HISTOGRAM and BP_OUTPUT_ACTIVE_HISTOGRAM.
// All three share the same single-entry parameter array. Skipped when no handle was created.
void PxgCudaBroadPhaseSap::computeStartAndActiveHistogramKernel()
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.computeStartAndActiveHistogramKernel", mContextID);

	// No handle was created in this update: nothing to launch.
	if(!mUpdateData_CreatedHandleSize)
		return;

	CUdeviceptr descDevicePtr = mBPDescBuf.getDevicePtr();

	// 'kernelParams' is not referenced explicitly below, so presumably the EPILOG macro expands to it:
	// keep this exact name, and keep it an array (confirm against the macro definition).
	KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(descDevicePtr) };

	_launch<GPU_BP_DEBUG>(
		PROLOG,
		PxgKernelIds::BP_WRITEOUT_ACTIVE_HISTOGRAM,
		PxgBPKernelGridDim::BP_WRITEOUT_ACTIVE_HISTOGRAM, 1, 1,
		PxgBPKernelBlockDim::BP_WRITEOUT_ACTIVE_HISTOGRAM, 1, 1,
		0,
		EPILOG);

	_launch<GPU_BP_DEBUG>(
		PROLOG,
		PxgKernelIds::BP_COMPUTE_ACTIVE_HISTOGRAM,
		PxgBPKernelGridDim::BP_COMPUTE_ACTIVE_HISTOGRAM, 1, 1,
		PxgBPKernelBlockDim::BP_COMPUTE_ACTIVE_HISTOGRAM, 1, 1,
		0,
		EPILOG);

	_launch<GPU_BP_DEBUG>(
		PROLOG,
		PxgKernelIds::BP_OUTPUT_ACTIVE_HISTOGRAM,
		PxgBPKernelGridDim::BP_OUTPUT_ACTIVE_HISTOGRAM, 1, 1,
		PxgBPKernelBlockDim::BP_OUTPUT_ACTIVE_HISTOGRAM, 1, 1,
		0,
		EPILOG);
}
// Launches, in order, BP_COMPUTE_INCREMENTAL_CMP_COUNTS1, BP_COMPUTE_INCREMENTAL_CMP_COUNTS2 and BP_INCREMENTAL_SAP.
// There is no host-side size check here: the three launches are unconditional. update() only calls this
// function when updateData.getGpuStateChanged() is true.
void PxgCudaBroadPhaseSap::performIncrementalSapKernel()
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.performIncrementalSapKernel", mContextID);

	// PT: TODO: this used to be guarded by "if(mUpdatedHandleSize != 0)" - why was this removed?

	CUdeviceptr descDevicePtr = mBPDescBuf.getDevicePtr();

	// 'kernelParams' is not referenced explicitly below, so presumably the EPILOG macro expands to it:
	// keep this exact name, and keep it an array (confirm against the macro definition).
	KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(descDevicePtr) };

	_launch<GPU_BP_DEBUG>(
		PROLOG,
		PxgKernelIds::BP_COMPUTE_INCREMENTAL_CMP_COUNTS1,
		PxgBPKernelGridDim::BP_COMPUTE_INCREMENTAL_CMP_COUNTS1, 1, 1,
		PxgBPKernelBlockDim::BP_COMPUTE_INCREMENTAL_CMP_COUNTS1, 1, 1,
		0,
		EPILOG);

	_launch<GPU_BP_DEBUG>(
		PROLOG,
		PxgKernelIds::BP_COMPUTE_INCREMENTAL_CMP_COUNTS2,
		PxgBPKernelGridDim::BP_COMPUTE_INCREMENTAL_CMP_COUNTS2, 1, 1,
		PxgBPKernelBlockDim::BP_COMPUTE_INCREMENTAL_CMP_COUNTS2, 1, 1,
		0,
		EPILOG);

	// Unlike the two launches above, this grid has a second dimension of 3 - presumably one slice per axis; confirm in the kernel.
	_launch<GPU_BP_DEBUG>(
		PROLOG,
		PxgKernelIds::BP_INCREMENTAL_SAP,
		PxgBPKernelGridDim::BP_INCREMENTAL_SAP, 3, 1,
		PxgBPKernelBlockDim::BP_INCREMENTAL_SAP, 1, 1,
		0,
		EPILOG);
}
// Generates pairs for handles created in this update. When at least one handle was created, launches in order:
// BP_WRITEOUT_OVERLAPCHECKS_HISTOGRAM_NEWBOUNDS, BP_COMPUTE_OVERLAPCHECKS_HISTOGRAM,
// BP_OUTPUT_OVERLAPCHECKS_HISTOGRAM and BP_GENERATE_FOUNDPAIR_NEWBOUNDS.
// The GPU_DEBUG_STREAM check at the end is executed whether or not anything was launched.
void PxgCudaBroadPhaseSap::generateNewPairsKernel()
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.generateNewPairsKernel", mContextID);

	const bool hasCreatedHandles = mUpdateData_CreatedHandleSize != 0;
	if(hasCreatedHandles)
	{
		//Need to generate pairs for created handles...
		CUdeviceptr descDevicePtr = mBPDescBuf.getDevicePtr();

		// 'kernelParams' is not referenced explicitly below, so presumably the EPILOG macro expands to it:
		// keep this exact name, and keep it an array (confirm against the macro definition).
		KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(descDevicePtr) };

		_launch<GPU_BP_DEBUG>(
			PROLOG,
			PxgKernelIds::BP_WRITEOUT_OVERLAPCHECKS_HISTOGRAM_NEWBOUNDS,
			PxgBPKernelGridDim::BP_WRITEOUT_OVERLAPCHECKS_HISTOGRAM_NEWBOUNDS, 1, 1,
			PxgBPKernelBlockDim::BP_WRITEOUT_OVERLAPCHECKS_HISTOGRAM_NEWBOUNDS, 1, 1,
			0,
			EPILOG);

		_launch<GPU_BP_DEBUG>(
			PROLOG,
			PxgKernelIds::BP_COMPUTE_OVERLAPCHECKS_HISTOGRAM,
			PxgBPKernelGridDim::BP_COMPUTE_OVERLAPCHECKS_HISTOGRAM, 1, 1,
			PxgBPKernelBlockDim::BP_COMPUTE_OVERLAPCHECKS_HISTOGRAM, 1, 1,
			0,
			EPILOG);

		_launch<GPU_BP_DEBUG>(
			PROLOG,
			PxgKernelIds::BP_OUTPUT_OVERLAPCHECKS_HISTOGRAM,
			PxgBPKernelGridDim::BP_OUTPUT_OVERLAPCHECKS_HISTOGRAM, 1, 1,
			PxgBPKernelBlockDim::BP_OUTPUT_OVERLAPCHECKS_HISTOGRAM, 1, 1,
			0,
			EPILOG);

		_launch<GPU_BP_DEBUG>(
			PROLOG,
			PxgKernelIds::BP_GENERATE_FOUNDPAIR_NEWBOUNDS,
			PxgBPKernelGridDim::BP_GENERATE_FOUNDPAIR_NEWBOUNDS, 1, 1,
			PxgBPKernelBlockDim::BP_GENERATE_FOUNDPAIR_NEWBOUNDS, 1, 1,
			0,
			EPILOG);
	}

	// Deliberately outside the 'if' and without a trailing semicolon, exactly as the macro has always been used here.
	GPU_DEBUG_STREAM(mStream, "GPU generate new pairs fail!!\n")
}
// Launches the BP_CLEAR_NEWFLAG kernel, but only when mUpdateData_CreatedHandleSize is non-zero.
void PxgCudaBroadPhaseSap::clearNewFlagKernel()
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.clearNewFlagKernel", mContextID);

	// No handle was created in this update: nothing to launch.
	if(!mUpdateData_CreatedHandleSize)
		return;

	// The only entry in the parameter array is the device address of the broad-phase descriptor buffer.
	CUdeviceptr descDevicePtr = mBPDescBuf.getDevicePtr();

	// 'kernelParams' is not referenced explicitly below, so presumably the EPILOG macro expands to it:
	// keep this exact name, and keep it an array (confirm against the macro definition).
	KERNEL_PARAM_TYPE kernelParams[] = { CUDA_KERNEL_PARAM(descDevicePtr) };

	_launch<GPU_BP_DEBUG>(
		PROLOG,
		PxgKernelIds::BP_CLEAR_NEWFLAG,
		PxgBPKernelGridDim::BP_CLEAR_NEWFLAG, 1, 1,
		PxgBPKernelBlockDim::BP_CLEAR_NEWFLAG, 1, 1,
		0,
		EPILOG);
}
// Fills the device pointers of two sets of radix-sort descriptors: the caller-provided 'rsDescs' and the
// member array 'mRSDescWOR'. Each set gets six entries, two per axis:
//	- entry [axis]		reads from the main projection buffers and writes to the temp buffers
//	- entry [axis + 3]	is the mirror image: reads from the temp buffers and writes back to the main ones
// The two sets differ only in the main-side "ranks" buffer: 'rsDescs' uses mBoxProjectionRanksBuf,
// 'mRSDescWOR' uses mBoxPtHandlesBuf. Only pointer fields are written here.
//
// rsDescs	array that must hold at least 6 descriptors (indices 0..5 are written).
void PxgCudaBroadPhaseSap::updateRadixSortDesc(PxgRadixSortDesc* rsDescs)
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.updateRadixSortDesc", mContextID);

	for(PxU32 axis = 0; axis < 3; ++axis)
	{
		const PxU32 mirrored = axis + 3;

		// Main-side buffers for this axis.
		PxU32* const projectionKeys = reinterpret_cast<PxU32*>(mBoxPtProjectionsBuf[axis].getDevicePtr());
		PxU32* const projectionRanks = reinterpret_cast<PxU32*>(mBoxProjectionRanksBuf[axis].getDevicePtr());
		PxU32* const projectionHandles = reinterpret_cast<PxU32*>(mBoxPtHandlesBuf[axis].getDevicePtr());

		// Temp-side buffers for this axis, shared by both descriptor sets.
		PxU32* const tempKeys = reinterpret_cast<PxU32*>(mTempBoxPtProjectionBuf[axis].getDevicePtr());
		PxU32* const tempValues = reinterpret_cast<PxU32*>(mTempBoxPtHandlesBuf[axis].getDevicePtr());

		// Per-axis block-count scratch buffer, shared by all four descriptors of this axis.
		PxU32* const blockCounts = reinterpret_cast<PxU32*>(mRadixCountBuf[axis].getDevicePtr());

		// rsDescs: main (keys + ranks) -> temp
		PxgRadixSortDesc& toTemp = rsDescs[axis];
		toTemp.inputKeys = projectionKeys;
		toTemp.inputRanks = projectionRanks;
		toTemp.outputKeys = tempKeys;
		toTemp.outputRanks = tempValues;
		toTemp.radixBlockCounts = blockCounts;

		// rsDescs: temp -> main (keys + ranks)
		PxgRadixSortDesc& fromTemp = rsDescs[mirrored];
		fromTemp.inputKeys = tempKeys;
		fromTemp.inputRanks = tempValues;
		fromTemp.outputKeys = projectionKeys;
		fromTemp.outputRanks = projectionRanks;
		fromTemp.radixBlockCounts = blockCounts;

		// mRSDescWOR: main (keys + handles) -> temp
		mRSDescWOR[axis].inputKeys = projectionKeys;
		mRSDescWOR[axis].inputRanks = projectionHandles;
		mRSDescWOR[axis].outputKeys = tempKeys;
		mRSDescWOR[axis].outputRanks = tempValues;
		mRSDescWOR[axis].radixBlockCounts = blockCounts;

		// mRSDescWOR: temp -> main (keys + handles)
		mRSDescWOR[mirrored].inputKeys = tempKeys;
		mRSDescWOR[mirrored].inputRanks = tempValues;
		mRSDescWOR[mirrored].outputKeys = projectionKeys;
		mRSDescWOR[mirrored].outputRanks = projectionHandles;
		mRSDescWOR[mirrored].radixBlockCounts = blockCounts;
	}
}
// Rebuilds the host-side broad-phase descriptor from scratch: the whole structure is zeroed, then every
// device pointer, count and limit is re-read from the current member buffers and counters.
//
// desc	descriptor to fill. Its previous content is discarded.
//
// PT: there's some evil coupling between the BP and mAABBManager here. The CUDA code does use
// these buffers (e.g. volumeData) so effectively the CUDA BP cannot be used alone without the Pxg AABB manager.
void PxgCudaBroadPhaseSap::updateDescriptor(PxgBroadPhaseDesc& desc)
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.updateDescriptor", mContextID);

	// PT: added this here
	PxMemZero(&desc, sizeof(desc));

	// ---- created / removed handles ----
	desc.updateData_createdHandles = reinterpret_cast<PxU32*>(mCreatedHandlesBuf.getDevicePtr());
	desc.numCreatedHandles = mUpdateData_CreatedHandleSize;

	desc.updateData_removedHandles = reinterpret_cast<PxU32*>(mRemovedHandlesBuf.getDevicePtr());
	desc.numRemovedHandles = mUpdateData_RemovedHandleSize;

	// ---- data owned by the AABB manager (or its standalone replacement) ----
	// PT: TODO: replace with adapter? I think this won't work without the bitmaps anyway?
	if(mAABBManager)
	{
		// PT: this data is used in:
		// - markUpdatedPairsLaunch (BP_UPDATE_UPDATEDPAIRS)
		desc.aabbMngr_changedHandleMap = reinterpret_cast<PxU32*>(mAABBManager->getChangedAABBMgrHandles());
		desc.aabbMngr_changedHandleBitMapWordCounts = mAABBManager->getChangedAABBMgActorHandleMap().getWordCount();
		desc.aabbMngr_addedHandleMap = reinterpret_cast<PxU32*>(mAABBManager->getAddedHandles());
		desc.aabbMngr_removedHandleMap = reinterpret_cast<PxU32*>(mAABBManager->getRemovedHandles());
		desc.aabbMngr_aggregatedBoundHandles = reinterpret_cast<PxU32*>(mAABBManager->getAggregatedBounds());

		// PT: this data is used in:
		// - doAggPairCollisions (AGG_PAIR_COLLISION)
		// - accumulateReportsStage_1 (BP_ACCUMULATE_REPORT_STAGE_1)
		// - accumulateReportsStage_2 (BP_ACCUMULATE_REPORT_STAGE_2)
		desc.aabbMngr_volumeData = reinterpret_cast<Bp::VolumeData*>(mAABBManager->mVolumDataBuf.getDevicePtr());
	}
#ifdef SUPPORT_UPDATE_HANDLES_ARRAY_FOR_GPU
	else
	{
		// PT: the GPU AABB manager never passes updated handles, the list is always empty! In this codepath
		// (used for standalone BPs) we make it work again with an array of updated handles.
		desc.updateData_updatedHandles = reinterpret_cast<PxU32*>(mUpdatedHandlesBuf.getDevicePtr());
		desc.numUpdatedHandles = mUpdateData_UpdatedHandleSize;
		//printf("%d\n", desc.numUpdatedHandles);
	}
#endif

	// ---- per-box input arrays and integer bounds ----
	desc.oldIntegerBounds = reinterpret_cast<PxgIntegerAABB*>(mOldIntegerBoundsBuf.getDevicePtr());
	desc.newIntegerBounds = reinterpret_cast<PxgIntegerAABB*>(mNewIntegerBoundsBuf.getDevicePtr());

	desc.updateData_fpBounds = reinterpret_cast<PxBounds3*>(mBoxFpBoundsBuf.getDevicePtr());
	desc.updateData_contactDistances = reinterpret_cast<PxReal*>(mBoxContactDistancesBuf.getDevicePtr());
	desc.updateData_groups = reinterpret_cast<PxU32*>(mBoxGroupsBuf.getDevicePtr());
	desc.updateData_envIDs = reinterpret_cast<PxU32*>(mBoxEnvIDsBuf.getDevicePtr());

	// Box count before this update's creations were added, with this update's removals still counted.
	desc.numPreviousHandles = mNumOfBoxes + mUpdateData_RemovedHandleSize - mUpdateData_CreatedHandleSize;
	//desc.numHandles = mNumOfBoxes;

	// ---- found / lost pair report buffers ----
	desc.foundPairReport = reinterpret_cast<PxgBroadPhasePair*>(mFoundPairsBuf.getDevicePtr());
	desc.lostPairReport = reinterpret_cast<PxgBroadPhasePair*>(mLostPairsBuf.getDevicePtr());

	desc.foundAggPairReport = reinterpret_cast<PxgBroadPhasePair*>(mFoundAggregateBuf.getDevicePtr());
	desc.lostAggPairReport = reinterpret_cast<PxgBroadPhasePair*>(mLostAggregateBuf.getDevicePtr());

	desc.foundActorPairReport = reinterpret_cast<PxgBroadPhasePair*>(mFoundActorBuf.getDevicePtr());
	desc.lostActorPairReport = reinterpret_cast<PxgBroadPhasePair*>(mLostActorBuf.getDevicePtr());

	// These two come from host arrays translated by getMappedDevicePtr(), not from a device buffer object.
	desc.foundPairReportMap = reinterpret_cast<PxgBroadPhasePair*>(getMappedDevicePtr(mCudaContext, mFoundActorPairs.begin()));
	desc.lostPairReportMap = reinterpret_cast<PxgBroadPhasePair*>(getMappedDevicePtr(mCudaContext, mLostActorPairs.begin()));

	// ---- per-axis buffers ----
	for(PxU32 axis = 0; axis < 3; ++axis)
	{
		/*const PxU32 offset = i*nbProjections;
		desc.boxProjectionRanks[i] = projRanksGpuPtr + offset;*/

		desc.boxSapBox1D[axis] = reinterpret_cast<PxgSapBox1D*>(mBoxSapBox1DBuf[axis].getDevicePtr());
		desc.boxNewSapBox1D[axis] = reinterpret_cast<PxgSapBox1D*>(mNewBoxSapBox1DBuf[axis].getDevicePtr());

		desc.boxProjectionRanks[axis] = reinterpret_cast<PxU32*>(mBoxProjectionRanksBuf[axis].getDevicePtr());
		desc.boxProjections[axis] = reinterpret_cast<PxU32*>(mBoxPtProjectionsBuf[axis].getDevicePtr());

		desc.totalEndPtHistogram[axis] = reinterpret_cast<PxU32*>(mTotalEndPtHistogramBuf[axis].getDevicePtr());
		desc.blockTotalEndPtHistogram[axis] = reinterpret_cast<PxU32*>(mBlockTotalEndPtHistogramBuf[axis].getDevicePtr());

		// Double-buffered arrays: the member arrays hold 6 buffers, bank 0 in slots [0..2] and bank 1 in slots [3..5].
		for(PxU32 bank = 0; bank < 2; ++bank)
		{
			const PxU32 slot = bank * 3 + axis;

			desc.boxHandles[bank][axis] = reinterpret_cast<PxU32*>(mBoxPtHandlesBuf[slot].getDevicePtr());

			desc.endPtHistogram[bank][axis] = reinterpret_cast<PxU32*>(mEndPtHistogramBuf[slot].getDevicePtr());
			desc.blockEndPtHistogram[bank][axis] = reinterpret_cast<PxU32*>(mBlockEndPtHistogramBuf[slot].getDevicePtr());

			desc.startPtHistogram[bank][axis] = reinterpret_cast<PxU32*>(mStartPtHistogramBuf[slot].getDevicePtr());
			desc.blockStartPtHistogram[bank][axis] = reinterpret_cast<PxU32*>(mBlockStartPtHistogramBuf[slot].getDevicePtr());

			desc.endPointHandles[bank][axis] = reinterpret_cast<PxU32*>(mEndPtHandleBuf[slot].getDevicePtr());
			desc.startPointHandles[bank][axis] = reinterpret_cast<PxU32*>(mStartPtHandleBuf[slot].getDevicePtr());
		}

		desc.incrementalComparisons[axis] = reinterpret_cast<PxU32*>(mIncrementalComparisons[axis].getDevicePtr());
		desc.incrementalBlockComparisons[axis] = reinterpret_cast<PxU32*>(mIncrementalBlockComparisons[axis].getDevicePtr());
	}

	// ---- report blocks (two of each) ----
	for(PxU32 block = 0; block < 2; ++block)
	{
		desc.aggReportBlock[block] = reinterpret_cast<PxU32*>(mAggregateReportBlockBuf[block].getDevicePtr());
		desc.actorReportBlock[block] = reinterpret_cast<PxU32*>(mActorReportBlockBuf[block].getDevicePtr());
	}

	// ---- region buffers ----
	desc.activeRegionsHistogram = reinterpret_cast<PxU32*>(mActiveRegionTotalBuf.getDevicePtr());
	desc.startRegionsHistogram = reinterpret_cast<PxU32*>(mStartRegionsTotalBuf.getDevicePtr());
	desc.orderedActiveRegionHandles = reinterpret_cast<PxU32*>(mOrderedActiveRegionHandlesTotalBuf.getDevicePtr());
	desc.orderedStartRegionHandles = reinterpret_cast<PxU32*>(mOrderedStartRegionHandlesTotalBuf.getDevicePtr());

	desc.blockOverlapChecksRegion = reinterpret_cast<regionOverlapType*>(mBlockOverlapChecksRegionBuf.getDevicePtr());
	desc.overlapChecksRegion = reinterpret_cast<regionOverlapType*>(mOverlapChecksRegionBuf.getDevicePtr());
	desc.overlapChecksHandleRegiones = reinterpret_cast<PxgHandleRegion*>(mOverlapChecksHandleRegionBuf.getDevicePtr());

	desc.regionRange = reinterpret_cast<PxgIntegerRegion*>(mRegionRangeBuf.getDevicePtr());

	desc.startRegionAccum = reinterpret_cast<PxU32*>(mStartRegionAccumBuf.getDevicePtr());
	desc.blockStartRegionAccum = reinterpret_cast<PxU32*>(mBlockStartRegionAccumBuf.getDevicePtr());
	desc.regionAccum = reinterpret_cast<PxU32*>(mRegionAccumBuf.getDevicePtr());
	desc.blockRegionAccum = reinterpret_cast<PxU32*>(mBlockRegionAccumBuf.getDevicePtr());

	// ---- counters, totals and limits ----
	// The shared counters restart from zero; they are still written explicitly even though the structure was zeroed above.
	desc.sharedFoundPairIndex = 0;
	desc.sharedLostPairIndex = 0;
	desc.sharedFoundAggPairIndex = 0;
	desc.sharedLostAggPairIndex = 0;
	desc.startRegionAccumTotal = 0;

	desc.regionAccumTotal = mRegionAccumTotal;
	desc.overlapChecksTotalRegion = mOverlapChecksTotalRegion;

	desc.max_found_lost_pairs = mMaxFoundLostPairs;
	desc.found_lost_pairs_overflow_flags = false;
	desc.max_found_lost_agg_pairs = mMaxAggFoundLostPairs;
}
// Runs one broad-phase update: calls gpuDMAUp() with this frame's update data, then issues the kernel
// wrappers below in a fixed order. The order of the calls matters and must not be changed casually.
//
// Parameters:
// - first parameter (scratch allocator) and third parameter (continuation task) are unnamed and unused here.
// - updateData: source of the created/removed handle counts and of the two "state changed" flags.
//
// Early-out: when there are no created and no removed handles, and neither getStateChanged() nor
// getGpuStateChanged() is set, the function returns right after gpuDMAUp() - unless mForceUpdate is set,
// in which case exactly one more full pass is executed (see below).
void PxgCudaBroadPhaseSap::update(PxcScratchAllocator* /*scratchAllocator*/, const Bp::BroadPhaseUpdateData& updateData, PxBaseTask* /*continuation*/)
{
PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.update", mContextID);
// PT: TODO: this function is now the only place left using getGpuStateChanged() and getStateChanged()
// PT: TODO: could we move this outside of the update call to sever this last connection?
// Reset the value mPinnedEvent points to. NOTE(review): presumably a host-visible completion flag written
// later by the GPU side - confirm where it is set and polled.
*mPinnedEvent = 0;
// Scoped lock object, alive until the function returns (including the early return below).
PxScopedCudaLock _lock_(*mCudaContextManager);
// Captured before gpuDMAUp(), which presumably updates mNumOfBoxes (confirm): the "previous" count is
// what the first sort and the first SAP-box initialization below operate on.
const PxU32 previousNumOfBoxes = mNumOfBoxes;
gpuDMAUp(updateData, *mBpDesc, mRSDesc);
const bool gpuStateChanged = updateData.getGpuStateChanged();
bool forcedUpdate = false;
// Nothing created, nothing removed, and no state change reported on either the CPU or the GPU side.
if((updateData.getNumCreatedHandles() + updateData.getNumRemovedHandles()) == 0 && !updateData.getStateChanged() && !gpuStateChanged)
{
if (mForceUpdate)
{
forcedUpdate = true;
//We force a single update after everything has gone to sleep to force through some
//properties like the double-buffered bounds swap. If we don't do this, then some of the
//GPU aggregate logic can fail.
}
else
return;
}
// After a regular update this re-arms the single forced pass; after the forced pass itself it disarms it,
// so subsequent idle calls take the early return above.
mForceUpdate = !forcedUpdate;
// PT: not all kernels are needed for all cases (added / updated / removed).
// For an easier time analysing the code, it can be good to only trace the code needed for
// "one shot queries", ignoring the bits needed for updated & removed objects.
// Always false here, so both "!oneShotQuery" sections below are always executed.
const bool oneShotQuery = false;
// translate from FP bounds to integer bounds
translateAABBsKernel();
if(!oneShotQuery)
{
//we mark pairs as removed but we didn't change their projections in the previous frame's box handles
markRemovedPairsKernel();
//we mark pairs as updated(update projections) in the current frame's box handles
if(gpuStateChanged)
markUpdatedPairsKernel();
//sort projections in the current frame and produce ranks which is used in the incremental sap to simulate swap.
//Also, we need to use the ranks to update the current frame's box handle
sortProjectionAndHandlesWRKernel(previousNumOfBoxes);
//histogram for projections end points
calculateEndPtHistogramKernel(true);
//perform incremental sap to produce pairs for updated pairs and lost pairs
if(gpuStateChanged)
performIncrementalSapKernel();
}
if(mUpdateData_RemovedHandleSize)
{
//we need to recalculate the sap box because we resort the projections and handles based on the updated pairs
initializeSapBoxKernel(previousNumOfBoxes, false);
//we need to reset the projections for the removed pairs so that they can be shuffled to the end of the array after sort
markRemovedPairsProjectionsKernel();
}
// Second sort + histogram pass, only needed when the set of boxes changed (something created or removed).
if(mUpdateData_CreatedHandleSize || mUpdateData_RemovedHandleSize)
{
markCreatedPairsKernel();
sortProjectionAndHandlesWORKernel(previousNumOfBoxes);
calculateEndPtHistogramKernel(false);
}
// Unconditional, and this time with the current box count rather than the previous one.
initializeSapBoxKernel(mNumOfBoxes, false);
// The next three wrappers each skip their launches when no handle was created.
computeRegionHistogramKernel();
computeStartAndActiveHistogramKernel();
generateNewPairsKernel();
// Exchange the "new" and "old" integer bounds buffers (the double-buffered bounds swap mentioned above).
if(!oneShotQuery)
PxgCudaBuffer::swapBuffer(mNewIntegerBoundsBuf, mOldIntegerBoundsBuf);
clearNewFlagKernel();
runCopyResultsKernel(*mBpDesc);
//mCudaContext->streamFlush(mStream);
}
// PT: called from PxgAABBManager::preBpUpdate_GPU
//
// Sizes the per-box device buffers for updateData.getCapacity() entries and queues the host-to-device
// copies of the per-box input arrays on mStream:
//	- contact distances and groups: always
//	- environment IDs: only when updateData.getEnvIDs() is non-null
//	- floating-point bounds: only when updateData.getStateChanged() is true
// The copies are issued with the "Async" calls, and this function does not wait for them.
// It also records the capacity in mUpdateData_BoxesCapacity.
void PxgCudaBroadPhaseSap::preBroadPhase(const Bp::BroadPhaseUpdateData& updateData)
{
	PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.preBroadPhase", mContextID);

	PxScopedCudaLock _lock_(*mCudaContextManager);

	//mPreviousNumOfBoxes = mNumOfBoxes;
	//gpuDMAUp(updateData, *mBpDesc, mRSDesc);

	// PT: the code below used to be in "gpuDmaUpSharedData"
	const PxU32 nbSlots = updateData.getCapacity();
	mUpdateData_BoxesCapacity = nbSlots;

	// Byte sizes of the four per-box arrays, all derived from the same capacity.
	const PxU32 boundsBytes = nbSlots * sizeof(PxBounds3);
	const PxU32 distanceBytes = nbSlots * sizeof(PxReal);
	const PxU32 groupBytes = nbSlots * sizeof(PxU32);
	const PxU32 envIDBytes = nbSlots * sizeof(PxU32);

	// Environment IDs are optional: a null pointer means there is nothing to allocate or upload for them.
	const void* const envIDs = updateData.getEnvIDs();

	// Allocations first ...
	mBoxContactDistancesBuf.allocate(distanceBytes, PX_FL);
	mBoxGroupsBuf.allocate(groupBytes, PX_FL);
	if(envIDs)
	{
		mBoxEnvIDsBuf.allocate(envIDBytes, PX_FL);
	}

	// ... then the uploads. Bounds go first, and only when the state changed.
	if(updateData.getStateChanged())	// PT: otherwise the call should have been skipped
	{
		mBoxFpBoundsBuf.allocate(boundsBytes, PX_FL);
		mCudaContext->memcpyHtoDAsync(mBoxFpBoundsBuf.getDevicePtr(), updateData.getAABBs(), boundsBytes, mStream);
	}

	mCudaContext->memcpyHtoDAsync(mBoxContactDistancesBuf.getDevicePtr(), updateData.getContactDistance(), distanceBytes, mStream);
	mCudaContext->memcpyHtoDAsync(mBoxGroupsBuf.getDevicePtr(), updateData.getGroups(), groupBytes, mStream);

	if(envIDs)
	{
		mCudaContext->memcpyHtoDAsync(mBoxEnvIDsBuf.getDevicePtr(), envIDs, envIDBytes, mStream);
	}
}
void PxgCudaBroadPhaseSap::fetchBroadPhaseResults()
{
PX_PROFILE_ZONE("PxgCudaBroadPhaseSap.fetchBroadPhaseResults", mContextID);
PxScopedCudaLock _lock_(*mCudaContextManager);
gpuDMABack(*mBpDesc);
//purgeDuplicateFoundPairs();
//purgeDuplicateLostPairs();
// flip double buffer
{
for(PxU32 i=0; i<3; ++i)
{
const PxU32 swapId = i + 3;
PxgCudaBuffer::swapBuffer(mBoxPtHandlesBuf[i], mBoxPtHandlesBuf[swapId]);
PxgCudaBuffer::swapBuffer(mBlockEndPtHistogramBuf[i], mBlockEndPtHistogramBuf[swapId]);
PxgCudaBuffer::swapBuffer(mEndPtHistogramBuf[i], mEndPtHistogramBuf[swapId]);
PxgCudaBuffer::swapBuffer(mEndPtHandleBuf[i], mEndPtHandleBuf[swapId]);
PxgCudaBuffer::swapBuffer(mBlockStartPtHistogramBuf[i], mBlockStartPtHistogramBuf[swapId]);
PxgCudaBuffer::swapBuffer(mStartPtHistogramBuf[i], mStartPtHistogramBuf[swapId]);
PxgCudaBuffer::swapBuffer(mStartPtHandleBuf[i], mStartPtHandleBuf[swapId]);
}
}
}