// Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved. // Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. // Copyright (c) 2001-2004 NovodeX AG. All rights reserved. 
#include "foundation/PxProfiler.h" #include "foundation/PxMemory.h" #include "foundation/PxBitUtils.h" #include "foundation/PxFPU.h" #include "BpBroadPhaseABP.h" #include "BpBroadPhaseShared.h" #include "foundation/PxVecMath.h" #include "PxcScratchAllocator.h" #include "common/PxProfileZone.h" #include "CmRadixSort.h" #include "CmUtils.h" #include "GuBounds.h" #include "foundation/PxThread.h" #include "foundation/PxSync.h" #include "task/PxTask.h" using namespace physx; using namespace aos; using namespace Bp; using namespace Cm; /* PT: to try: - prepare data: sort & compute bounds in parallel? or just MT the last loop? - switch post update & add delayed pairs? - MT computeCreatedDeletedPairs - why do we set the update flag for added/removed objects? - use timestamps instead of bits? */ #define ABP_MT #define CHECKPOINT(x) //#include //#define CHECKPOINT(x) printf(x); //#pragma warning (disable : 4702) #define CODEALIGN16 //_asm align 16 #if PX_INTEL_FAMILY && !defined(PX_SIMD_DISABLED) #define ABP_SIMD_OVERLAP #endif //#define ABP_BATCHING 128 #define ABP_BATCHING 256 //#define USE_ABP_BUCKETS 5000 // PT: don't use buckets below that number... #define USE_ABP_BUCKETS 512 // PT: don't use buckets below that number... //#define USE_ABP_BUCKETS 64 // PT: don't use buckets below that number... 
#ifdef USE_ABP_BUCKETS #define NB_BUCKETS 5 // Regular version: 5 buckets a la bucket pruner (4 + cross bucket) // Alternative version: 4 buckets + dup objects a la MBP regions // #define USE_ALTERNATIVE_VERSION #define ABP_USE_INTEGER_XS2 // Works but questionable speedups #else #define ABP_USE_INTEGER_XS #endif #define NB_SENTINELS 6 //#define RECURSE_LIMIT 20000 typedef PxU32 ABP_Index; static const bool gPrepareOverlapsFlag = true; #ifdef ABP_SIMD_OVERLAP static const bool gUseRegularBPKernel = false; // false to use "version 13" in box pruning series static const bool gUnrollLoop = true; // true to use "version 14" in box pruning series #else // PT: tested on Switch, for some reason the regular version is fastest there static const bool gUseRegularBPKernel = true; // false to use "version 13" in box pruning series static const bool gUnrollLoop = false; // true to use "version 14" in box pruning series //ABP_SIMD_OVERLAP //MBP.Add64KObjects 13982 ( +0.0%) 4757795 ( +0.0%) FAIL //MBP.AddBroadPhaseRegion 0 ( +0.0%) 3213795 ( +0.0%) FAIL //MBP.FinalizeOverlaps64KObjects 507 ( +0.0%) 5650723 ( +0.0%) FAIL //MBP.FindOverlaps64KMixedObjects 59258 ( +0.0%) 5170179 ( +0.0%) FAIL //MBP.FindOverlaps64KObjects 31351 ( +0.0%) 7122019 ( +0.0%) FAIL //MBP.Remove64KObjects 4993 ( +0.0%) 5281683 ( +0.0%) FAIL //MBP.Update64KObjects 13711 ( +0.0%) 5521699 ( +0.0%) FAIL //gUseRegularBPKernel: //MBP.Add64KObjects 14406 ( +0.0%) 4757795 ( +0.0%) FAIL //MBP.AddBroadPhaseRegion 0 ( +0.0%) 3213795 ( +0.0%) FAIL //MBP.FinalizeOverlaps64KObjects 504 ( +0.0%) 5650723 ( +0.0%) FAIL //MBP.FindOverlaps64KMixedObjects 48929 ( +0.0%) 5170179 ( +0.0%) FAIL //MBP.FindOverlaps64KObjects 25636 ( +0.0%) 7122019 ( +0.0%) FAIL //MBP.Remove64KObjects 4878 ( +0.0%) 5281683 ( +0.0%) FAIL //MBP.Update64KObjects 13932 ( +0.0%) 5521699 ( +0.0%) FAIL // false/true //MBP.Add64KObjects 14278 ( +0.0%) 4757795 ( +0.0%) FAIL //MBP.AddBroadPhaseRegion 0 ( +0.0%) 3213795 ( +0.0%) FAIL 
//MBP.FinalizeOverlaps64KObjects 504 ( +0.0%) 5650723 ( +0.0%) FAIL //MBP.FindOverlaps64KMixedObjects 60331 ( +0.0%) 5170179 ( +0.0%) FAIL //MBP.FindOverlaps64KObjects 32064 ( +0.0%) 7122019 ( +0.0%) FAIL //MBP.Remove64KObjects 4930 ( +0.0%) 5281683 ( +0.0%) FAIL //MBP.Update64KObjects 13673 ( +0.0%) 5521699 ( +0.0%) FAIL // false/false //MBP.Add64KObjects 13960 ( +0.0%) 4757795 ( +0.0%) FAIL //MBP.AddBroadPhaseRegion 0 ( +0.0%) 3213795 ( +0.0%) FAIL //MBP.FinalizeOverlaps64KObjects 503 ( +0.0%) 5650723 ( +0.0%) FAIL //MBP.FindOverlaps64KMixedObjects 48549 ( +0.0%) 5170179 ( +0.0%) FAIL //MBP.FindOverlaps64KObjects 25598 ( +0.0%) 7122019 ( +0.0%) FAIL //MBP.Remove64KObjects 4883 ( +0.0%) 5281683 ( +0.0%) FAIL //MBP.Update64KObjects 13667 ( +0.0%) 5521699 ( +0.0%) FAIL #endif #ifdef ABP_USE_INTEGER_XS typedef PxU32 PosXType; #define SentinelValue 0xffffffff #else typedef float PosXType; #define SentinelValue FLT_MAX #endif #ifdef ABP_USE_INTEGER_XS2 typedef PxU32 PosXType2; #define SentinelValue2 0xffffffff #else #ifdef ABP_USE_INTEGER_XS typedef PxU32 PosXType2; #define SentinelValue2 0xffffffff #else typedef float PosXType2; #define SentinelValue2 FLT_MAX #endif #endif namespace internalABP { struct SIMD_AABB4 : public PxUserAllocated { PX_FORCE_INLINE void initFrom2(const PxBounds3& box) { #ifdef ABP_USE_INTEGER_XS mMinX = encodeFloat(PX_IR(box.minimum.x)); mMaxX = encodeFloat(PX_IR(box.maximum.x)); mMinY = box.minimum.y; mMinZ = box.minimum.z; mMaxY = box.maximum.y; mMaxZ = box.maximum.z; #else mMinX = box.minimum.x; mMinY = box.minimum.y; mMinZ = box.minimum.z; mMaxX = box.maximum.x; mMaxY = box.maximum.y; mMaxZ = box.maximum.z; #endif } PX_FORCE_INLINE void operator = (const SIMD_AABB4& box) { mMinX = box.mMinX; mMinY = box.mMinY; mMinZ = box.mMinZ; mMaxX = box.mMaxX; mMaxY = box.mMaxY; mMaxZ = box.mMaxZ; } PX_FORCE_INLINE void initSentinel() { mMinX = SentinelValue; } PX_FORCE_INLINE bool isSentinel() const { return mMinX == SentinelValue; } #ifdef 
USE_ABP_BUCKETS // PT: to be able to compute bounds easily PosXType mMinX; float mMinY; float mMinZ; PosXType mMaxX; float mMaxY; float mMaxZ; #else PosXType mMinX; PosXType mMaxX; float mMinY; float mMinZ; float mMaxY; float mMaxZ; #endif }; #define USE_SHARED_CLASSES #ifdef USE_SHARED_CLASSES struct SIMD_AABB_X4 : public AABB_Xi { PX_FORCE_INLINE void initFrom(const SIMD_AABB4& box) { #ifdef ABP_USE_INTEGER_XS2 initFromFloats(&box.mMinX, &box.mMaxX); #else mMinX = box.mMinX; mMaxX = box.mMaxX; #endif } }; PX_ALIGN_PREFIX(16) #ifdef ABP_SIMD_OVERLAP struct SIMD_AABB_YZ4 : AABB_YZn { PX_FORCE_INLINE void initFrom(const SIMD_AABB4& box) { #ifdef ABP_SIMD_OVERLAP mMinY = -box.mMinY; mMinZ = -box.mMinZ; #else mMinY = box.mMinY; mMinZ = box.mMinZ; #endif mMaxY = box.mMaxY; mMaxZ = box.mMaxZ; } } #else struct SIMD_AABB_YZ4 : AABB_YZr { PX_FORCE_INLINE void initFrom(const SIMD_AABB4& box) { mMinY = box.mMinY; mMinZ = box.mMinZ; mMaxY = box.mMaxY; mMaxZ = box.mMaxZ; } } #endif PX_ALIGN_SUFFIX(16); #else struct SIMD_AABB_X4 : public PxUserAllocated { PX_FORCE_INLINE void initFromFloats(const void* PX_RESTRICT minX, const void* PX_RESTRICT maxX) { mMinX = encodeFloat(*reinterpret_cast(minX)); mMaxX = encodeFloat(*reinterpret_cast(maxX)); } PX_FORCE_INLINE void initFrom(const SIMD_AABB4& box) { #ifdef ABP_USE_INTEGER_XS2 initFromFloats(&box.mMinX, &box.mMaxX); #else mMinX = box.mMinX; mMaxX = box.mMaxX; #endif } PX_FORCE_INLINE void initFromPxVec4(const PxVec4& min, const PxVec4& max) { #ifdef ABP_USE_INTEGER_XS2 initFromFloats(&min.x, &max.x); #else #ifdef ABP_USE_INTEGER_XS initFromFloats(&min.x, &max.x); #else mMinX = min.x; mMaxX = max.x; #endif #endif } PX_FORCE_INLINE void operator = (const SIMD_AABB_X4& box) { mMinX = box.mMinX; mMaxX = box.mMaxX; } PX_FORCE_INLINE void initSentinel() { mMinX = SentinelValue2; } PX_FORCE_INLINE bool isSentinel() const { return mMinX == SentinelValue2; } PosXType2 mMinX; PosXType2 mMaxX; }; struct SIMD_AABB_YZ4 : public PxUserAllocated 
{ PX_FORCE_INLINE void initFrom(const SIMD_AABB4& box) { #ifdef ABP_SIMD_OVERLAP mMinY = -box.mMinY; mMinZ = -box.mMinZ; #else mMinY = box.mMinY; mMinZ = box.mMinZ; #endif mMaxY = box.mMaxY; mMaxZ = box.mMaxZ; } PX_FORCE_INLINE void initFromPxVec4(const PxVec4& min, const PxVec4& max) { #ifdef ABP_SIMD_OVERLAP mMinY = -min.y; mMinZ = -min.z; #else mMinY = min.y; mMinZ = min.z; #endif mMaxY = max.y; mMaxZ = max.z; } PX_FORCE_INLINE void operator = (const SIMD_AABB_YZ4& box) { V4StoreA(V4LoadA(&box.mMinY), &mMinY); } float mMinY; float mMinZ; float mMaxY; float mMaxZ; }; #endif #define MBP_ALLOC(x) PX_ALLOC(x, "MBP") #define MBP_ALLOC_TMP(x) PX_ALLOC(x, "MBP_TMP") #define MBP_FREE(x) PX_FREE(x) #define INVALID_ID 0xffffffff /////////////////////////////////////////////////////////////////////////////// #define DEFAULT_NB_ENTRIES 128 class ABP_MM { public: ABP_MM() : mScratchAllocator(NULL) {} ~ABP_MM() {} void* frameAlloc(PxU32 size); void frameFree(void* address); PxcScratchAllocator* mScratchAllocator; }; void* ABP_MM::frameAlloc(PxU32 size) { if(mScratchAllocator) return mScratchAllocator->alloc(size, true); return PX_ALLOC(size, "frameAlloc"); } void ABP_MM::frameFree(void* address) { if(mScratchAllocator) mScratchAllocator->free(address); else PX_FREE(address); } template static T* resizeBoxesT(PxU32 oldNbBoxes, PxU32 newNbBoxes, T* boxes) { T* newBoxes = reinterpret_cast(MBP_ALLOC(sizeof(T)*newNbBoxes)); if(oldNbBoxes) PxMemCopy(newBoxes, boxes, oldNbBoxes*sizeof(T)); MBP_FREE(boxes); return newBoxes; } class Boxes { public: Boxes(); ~Boxes(); PX_FORCE_INLINE void init(const Boxes& boxes){ mSize = boxes.mSize; mCapacity = boxes.mCapacity; } PX_FORCE_INLINE PxU32 getSize() const { return mSize; } PX_FORCE_INLINE PxU32 getCapacity() const { return mCapacity; } PX_FORCE_INLINE bool isFull() const { return mSize==mCapacity; } PX_FORCE_INLINE void reset() { mSize = mCapacity = 0; } PX_FORCE_INLINE PxU32 popBack() { return --mSize; } // protected: PxU32 mSize; PxU32 
mCapacity; }; Boxes::Boxes() : mSize (0), mCapacity (0) { } Boxes::~Boxes() { reset(); } class StraightBoxes : public Boxes { public: StraightBoxes(); ~StraightBoxes(); void init(PxU32 size, PxU32 capacity, SIMD_AABB4* boxes); void reset(); PxU32 resize(); PxU32 resize(PxU32 incoming); bool allocate(PxU32 nb); PX_FORCE_INLINE const SIMD_AABB4* getBoxes() const { return mBoxes; } PX_FORCE_INLINE SIMD_AABB4* getBoxes() { return mBoxes; } PX_FORCE_INLINE void setBounds(PxU32 index, const SIMD_AABB4& box) { PX_ASSERT(index(boxes.getBoxes_X()); mBoxes_YZ = const_cast(boxes.getBoxes_YZ()); } PxU32 SplitBoxes::resize() { const PxU32 capacity = mCapacity; const PxU32 size = mSize; // const PxU32 newCapacity = capacity ? capacity + DEFAULT_NB_ENTRIES : DEFAULT_NB_ENTRIES; // const PxU32 newCapacity = capacity ? capacity*2 : DEFAULT_NB_ENTRIES; const PxU32 newCapacity = capacity ? capacity*2 : DEFAULT_NB_ENTRIES; mBoxes_X = resizeBoxesT(size, newCapacity, mBoxes_X); mBoxes_YZ = resizeBoxesT(size, newCapacity, mBoxes_YZ); mCapacity = newCapacity; return newCapacity; } PxU32 SplitBoxes::resize(PxU32 incoming) { const PxU32 capacity = mCapacity; const PxU32 size = mSize; const PxU32 minCapacity = size + incoming; if(minCapacity(MBP_ALLOC(sizeof(SIMD_AABB_X4)*(nb+NB_SENTINELS))); mBoxes_YZ = reinterpret_cast(MBP_ALLOC(sizeof(SIMD_AABB_YZ4)*nb)); PX_ASSERT(!(size_t(mBoxes_YZ) & 15)); mSize = mCapacity = nb; return true; } typedef SplitBoxes StaticBoxes; typedef SplitBoxes DynamicBoxes; /////////////////////////////////////////////////////////////////////////////// struct ABP_Object : public PxUserAllocated { PX_FORCE_INLINE ABP_Object() : mIndex(INVALID_ID) { #if PX_DEBUG mUpdated = false; #endif } private: PxU32 mIndex; // Out-to-in, maps user handle to internal array. mIndex indexes either the static or dynamic array. // PT: the type won't be available for removed objects so we have to store it there. That uses 2 bits. 
// Then the "data" will need one more bit for marking sleeping objects so that leaves 28bits for the actual index. PX_FORCE_INLINE void setData(PxU32 index, FilterType::Enum type) { // mIndex = index; index <<= 2; index |= type; mIndex = index; } public: // PT: TODO: rename "index" to data everywhere PX_FORCE_INLINE void setActiveIndex(PxU32 index, FilterType::Enum type) { const PxU32 boxData = (index+index); setData(boxData, type); } PX_FORCE_INLINE void setSleepingIndex(PxU32 index, FilterType::Enum type) { const PxU32 boxData = (index+index)|1; PX_ASSERT(getType()==type); setData(boxData, type); } PX_FORCE_INLINE FilterType::Enum getType() const { return FilterType::Enum(mIndex&3); } PX_FORCE_INLINE PxU32 getData() const { return mIndex>>2; } PX_FORCE_INLINE void invalidateIndex() { mIndex = INVALID_ID; } PX_FORCE_INLINE bool isValid() const { return mIndex != INVALID_ID; } #if PX_DEBUG bool mUpdated; #endif }; typedef ABP_Object ABPEntry; /////////////////////////////////////////////////////////////////////////////// //#define BIT_ARRAY_STACK 512 static PX_FORCE_INLINE PxU32 bitsToDwords(PxU32 nbBits) { return (nbBits>>5) + ((nbBits&31) ? 1 : 0); } // Use that one instead of an array of bools. Takes less ram, nearly as fast [no bounds checkings and so on]. 
class BitArray { public: BitArray(); BitArray(PxU32 nbBits); ~BitArray(); bool init(PxU32 nbBits); void empty(); void resize(PxU32 nbBits); PX_FORCE_INLINE void checkResize(PxU32 bitNumber) { const PxU32 index = bitNumber>>5; if(index>=mSize) resize(bitNumber); } PX_FORCE_INLINE void setBitChecked(PxU32 bitNumber) { const PxU32 index = bitNumber>>5; if(index>=mSize) resize(bitNumber); mBits[index] |= 1<<(bitNumber&31); } PX_FORCE_INLINE void clearBitChecked(PxU32 bitNumber) { const PxU32 index = bitNumber>>5; if(index>=mSize) resize(bitNumber); mBits[index] &= ~(1<<(bitNumber&31)); } // Data management PX_FORCE_INLINE void setBit(PxU32 bitNumber) { mBits[bitNumber>>5] |= 1<<(bitNumber&31); } PX_FORCE_INLINE void clearBit(PxU32 bitNumber) { mBits[bitNumber>>5] &= ~(1<<(bitNumber&31)); } PX_FORCE_INLINE void toggleBit(PxU32 bitNumber) { mBits[bitNumber>>5] ^= 1<<(bitNumber&31); } PX_FORCE_INLINE void clearAll() { PxMemZero(mBits, mSize*4); } PX_FORCE_INLINE void setAll() { PxMemSet(mBits, 0xff, mSize*4); } // Data access PX_FORCE_INLINE PxIntBool isSet(PxU32 bitNumber) const { return PxIntBool(mBits[bitNumber>>5] & (1<<(bitNumber&31))); } PX_FORCE_INLINE PxIntBool isSetChecked(PxU32 bitNumber) const { const PxU32 index = bitNumber>>5; if(index>=mSize) return 0; return PxIntBool(mBits[index] & (1<<(bitNumber&31))); } PX_FORCE_INLINE const PxU32* getBits() const { return mBits; } PX_FORCE_INLINE PxU32 getSize() const { return mSize; } protected: PxU32* mBits; //!< Array of bits PxU32 mSize; //!< Size of the array in dwords #ifdef BIT_ARRAY_STACK PxU32 mStack[BIT_ARRAY_STACK]; #endif }; /////////////////////////////////////////////////////////////////////////////// BitArray::BitArray() : mBits(NULL), mSize(0) { } BitArray::BitArray(PxU32 nbBits) : mBits(NULL), mSize(0) { init(nbBits); } BitArray::~BitArray() { empty(); } void BitArray::empty() { #ifdef BIT_ARRAY_STACK if(mBits!=mStack) #endif MBP_FREE(mBits); mBits = NULL; mSize = 0; } bool BitArray::init(PxU32 nbBits) 
{ mSize = bitsToDwords(nbBits); // Get ram for n bits #ifdef BIT_ARRAY_STACK if(mBits!=mStack) #endif MBP_FREE(mBits); #ifdef BIT_ARRAY_STACK if(mSize>BIT_ARRAY_STACK) #endif mBits = reinterpret_cast(MBP_ALLOC(sizeof(PxU32)*mSize)); #ifdef BIT_ARRAY_STACK else mBits = mStack; #endif // Set all bits to 0 clearAll(); return true; } void BitArray::resize(PxU32 nbBits) { const PxU32 newSize = bitsToDwords(nbBits+128); PxU32* newBits = NULL; #ifdef BIT_ARRAY_STACK if(newSize>BIT_ARRAY_STACK) #endif { // Old buffer was stack or allocated, new buffer is allocated newBits = reinterpret_cast(MBP_ALLOC(sizeof(PxU32)*newSize)); if(mSize) PxMemCopy(newBits, mBits, sizeof(PxU32)*mSize); } #ifdef BIT_ARRAY_STACK else { newBits = mStack; if(mSize>BIT_ARRAY_STACK) { // Old buffer was allocated, new buffer is stack => copy to stack, shrink CopyMemory(newBits, mBits, sizeof(PxU32)*BIT_ARRAY_STACK); } else { // Old buffer was stack, new buffer is stack => keep working on the same stack buffer, nothing to do } } #endif const PxU32 remain = newSize - mSize; if(remain) PxMemZero(newBits + mSize, remain*sizeof(PxU32)); #ifdef BIT_ARRAY_STACK if(mBits!=mStack) #endif MBP_FREE(mBits); mBits = newBits; mSize = newSize; } /////////////////////////////////////////////////////////////////////////////// static ABP_Index* resizeMapping(PxU32 oldNbBoxes, PxU32 newNbBoxes, ABP_Index* mapping) { ABP_Index* newMapping = reinterpret_cast(MBP_ALLOC(sizeof(ABP_Index)*newNbBoxes)); if(oldNbBoxes) PxMemCopy(newMapping, mapping, oldNbBoxes*sizeof(ABP_Index)); MBP_FREE(mapping); return newMapping; } struct ABP_Object; #ifdef ABP_MT struct DelayedPair { PxU32 mID0; PxU32 mID1; PxU32 mHash; }; #endif class ABP_PairManager : public PairManagerData { public: ABP_PairManager(); ~ABP_PairManager(); InternalPair* addPair (PxU32 id0, PxU32 id1); void computeCreatedDeletedPairs (PxArray& createdPairs, PxArray& deletedPairs, const BitArray& updated, const BitArray& removed); #ifdef ABP_MT void addDelayedPair 
(PxArray& delayedPairs, const ABP_Index* mInToOut0, const ABP_Index* mInToOut1, PxU32 index0, PxU32 index1) const; void addDelayedPairs (const PxArray& delayedPairs); void addDelayedPairs2(PxArray& createdPairs, const PxArray& delayedPairs); void resizeForNewPairs(PxU32 nbDelayedPairs); #endif const Bp::FilterGroup::Enum* mGroups; const ABP_Index* mInToOut0; const ABP_Index* mInToOut1; const bool* mLUT; }; /////////////////////////////////////////////////////////////////////////// struct ABP_SharedData { PX_FORCE_INLINE ABP_SharedData() : mABP_Objects (NULL), mABP_Objects_Capacity (0) { } void resize(BpHandle userID); PX_FORCE_INLINE void checkResize(PxU32 maxID) { if(mABP_Objects_CapacitycurrentCapacity) { const PxU32 minCapacity = PxMax(newSize, 1024u); const PxU32 newCapacity = PxMax(minCapacity, currentCapacity*2); PX_ASSERT(newCapacity>=newSize); mMaxNbUpdated = newCapacity; remap = resizeMapping(currentSize, newCapacity, mInToOut_Updated); } else { remap = mInToOut_Updated; } mInToOut_Updated = remap; mNbUpdated = newSize; // PT: we only copy the new handles for now. The bounds will be computed later in "prepareData". // PT: TODO: do we even need to copy them? Can't we just reuse the source ptr directly? { PX_ASSERT(currentSize+nb<=mMaxNbUpdated); remap += currentSize; PxU32 nbToGo = nb; while(nbToGo--) { const BpHandle userID = *userIDs++; PX_ASSERT(!isNewOrUpdated(userID)); *remap++ = markAsNewOrUpdated(userID); if(sharedData) sharedData->mUpdatedObjects.setBit(userID); } } } // PT: TODO: inline this again void BoxManager::removeObject(ABPEntry& object, BpHandle userID) { PX_UNUSED(userID); const PxU32 boxData = object.getData(); const PxU32 boxIndex = boxData>>1; if(boxData&1) { // Sleeping object. PX_ASSERT(boxIndex>1; if(boxData&1) { // PT: benchmark for this codepath: MBP.UpdateSleeping // Sleeping object. 
We must reactivate it, i.e: // - remove it from the array of sleeping objects // - add it to the array of active/updated objects // First we remove: { PX_ASSERT(boxIndex also tweaking the sleeping boxes might break the "merge sleeping" array code PX_ASSERT(mNbRemovedSleeping<=mNbSleeping); if(mNbRemovedSleeping==mNbSleeping) { // PT: remove everything mSleepingBoxes.reset(); PX_FREE(mInToOut_Sleeping); mNbSleeping = mNbRemovedSleeping = 0; return; } const PxU32 expectedTotal = mNbSleeping - mNbRemovedSleeping; PxU32 nbRemovedFound = 0; PxU32 nbSleepingLeft = 0; const PxU32 sleepCapacity = mSleepingBoxes.getCapacity(); if(expectedTotal>=sleepCapacity/2) { // PT: remove holes, keep same data buffers SIMD_AABB_X4* boxesX = mSleepingBoxes.getBoxes_X(); SIMD_AABB_YZ4* boxesYZ = mSleepingBoxes.getBoxes_YZ(); ABP_Index* remap = mInToOut_Sleeping; for(PxU32 i=0;i(MBP_ALLOC(sizeof(SIMD_AABB_X4)*(expectedTotal+NB_SENTINELS))); SIMD_AABB_YZ4* dstBoxesYZ = reinterpret_cast(MBP_ALLOC(sizeof(SIMD_AABB_YZ4)*(expectedTotal+NB_SENTINELS))); initSentinels(dstBoxesX, expectedTotal); BpHandle* PX_RESTRICT dstRemap = reinterpret_cast(PX_ALLOC(expectedTotal*sizeof(BpHandle), "tmp")); const SIMD_AABB_X4* PX_RESTRICT srcDataX = mSleepingBoxes.getBoxes_X(); const SIMD_AABB_YZ4* PX_RESTRICT srcDataYZ = mSleepingBoxes.getBoxes_YZ(); const ABP_Index* PX_RESTRICT srcRemap = mInToOut_Sleeping; for(PxU32 i=0;i(memoryManager.frameAlloc(size*sizeof(PxU32))); newOrUpdatedIDs = tempBuffer; sleepingIndices = tempBuffer; } if(isNewOrUpdated(index)) { // PT: new or updated object if(!keys) keys = reinterpret_cast(PX_ALLOC(size*sizeof(float), "tmp")); // PT: in this version we compute the key on-the-fly, i.e. it will be computed twice overall. We could make this // faster by merging bounds and distances inside the AABB manager. 
const BpHandle userID = removeNewOrUpdatedMark(index); keys[nbUpdated] = bounds[userID].minimum.x - distances[userID]; newOrUpdatedIDs[size - 1 - nbUpdated] = userID; #if PX_DEBUG SIMD_AABB4 aabb; computeMBPBounds_Check(aabb, bounds, distances, userID); PX_ASSERT(aabb.mMinX==keys[nbUpdated]); #endif nbUpdated++; } else { // PT: sleeping object sleepingIndices[nbSleeping++] = i; } } } PX_ASSERT(nbRemoved + nbUpdated + nbSleeping == size); PX_UNUSED(nbRemoved); // PT: we must process the sleeping objects first, because the bounds of new sleeping objects are located in the existing updated buffers. // PT: TODO: *HOWEVER* we could sort things right now and then reuse the "keys" buffer? if(nbSleeping) { // PT: must merge these guys to current sleeping array // They should already be in sorted order and we should already have the boxes. #if PX_ENABLE_ASSERTS const SIMD_AABB_YZ4* boxesYZ = mUpdatedBoxes.getBoxes_YZ(); float prevKey = -FLT_MAX; for(PxU32 ii=0;ii=prevKey); prevKey = key; SIMD_AABB4 aabb; computeMBPBounds_Check(aabb, bounds, distances, userID); PX_ASSERT(aabb.mMinX==key); #ifdef ABP_SIMD_OVERLAP PX_ASSERT(boxesYZ[i].mMinY==-aabb.mMinY); PX_ASSERT(boxesYZ[i].mMinZ==-aabb.mMinZ); #else PX_ASSERT(boxesYZ[i].mMinY==aabb.mMinY); PX_ASSERT(boxesYZ[i].mMinZ==aabb.mMinZ); #endif PX_ASSERT(boxesYZ[i].mMaxY==aabb.mMaxY); PX_ASSERT(boxesYZ[i].mMaxZ==aabb.mMaxZ); } #endif if(mNbSleeping) { // PT: benchmark for this codepath: MBP.MergeSleeping CHECKPOINT("Merging sleeping objects\n"); // PT: here, we need to merge two arrays of sleeping objects together: // - the ones already contained inside mSleepingBoxes // - the new sleeping objects currently contained in mUpdatedBoxes // Both of them should already be sorted. 
// PT: TODO: super subtle stuff going on there, to revisit // PT: TODO: revisit names PxU32 offsetSorted = 0; const PxU32 nbSorted = nbSleeping; const SIMD_AABB_X4* PX_RESTRICT sortedDataX = mUpdatedBoxes.getBoxes_X(); const SIMD_AABB_YZ4* PX_RESTRICT sortedDataYZ = mUpdatedBoxes.getBoxes_YZ(); const ABP_Index* PX_RESTRICT sortedRemap = mInToOut_Updated; PxU32 offsetNonSorted = 0; const PxU32 nbToSort = mNbSleeping; const SIMD_AABB_X4* PX_RESTRICT toSortDataX = mSleepingBoxes.getBoxes_X(); const SIMD_AABB_YZ4* PX_RESTRICT toSortDataYZ = mSleepingBoxes.getBoxes_YZ(); const ABP_Index* PX_RESTRICT toSortRemap = mInToOut_Sleeping; PX_ASSERT(mNbRemovedSleeping<=mNbSleeping); #if PX_ENABLE_ASSERTS { PxU32 nbRemovedFound=0; for(PxU32 i=0;i(MBP_ALLOC(sizeof(SIMD_AABB_X4)*(nbTotal+NB_SENTINELS))); SIMD_AABB_YZ4* dstBoxesYZ = reinterpret_cast(MBP_ALLOC(sizeof(SIMD_AABB_YZ4)*(nbTotal+NB_SENTINELS))); initSentinels(dstBoxesX, nbTotal); BpHandle* PX_RESTRICT dstRemap = reinterpret_cast(PX_ALLOC(nbTotal*sizeof(BpHandle), "tmp")); PxU32 i=0; PxU32 nbToGo = nbSorted + nbToSort; while(nbToGo--) { PxU32 boxIndex; { if(nextCandidateNonSorted(PX_ALLOC(nbSleeping*sizeof(BpHandle), "tmp")); PX_FREE(mInToOut_Sleeping); mInToOut_Sleeping = inToOut_Sleeping; } else { inToOut_Sleeping = mInToOut_Sleeping; } const SIMD_AABB_X4* srcBoxesX = mUpdatedBoxes.getBoxes_X(); const SIMD_AABB_YZ4* srcBoxesYZ = mUpdatedBoxes.getBoxes_YZ(); SIMD_AABB_X4* dstBoxesX = mSleepingBoxes.getBoxes_X(); SIMD_AABB_YZ4* dstBoxesYZ = mSleepingBoxes.getBoxes_YZ(); initSentinels(dstBoxesX, nbSleeping); for(PxU32 ii=0;ii(memoryManager.frameAlloc(sizeof(PxU32)*nbUpdated)); PxU32* ranks1 = reinterpret_cast(memoryManager.frameAlloc(sizeof(PxU32)*nbUpdated)); StackRadixSort(rs, ranks0, ranks1); const PxU32* sorted; { PX_PROFILE_ZONE("Sort", contextID); sorted = rs.Sort(keys, nbUpdated).GetRanks(); } // PT: // - shuffle the remap table, store it in sorted order (we can probably use the "recyclable" array here again) // - 
compute bounds on-the-fly, store them in sorted order // PT: TODO: the "keys" array can be much bigger than stricly necessary here BpHandle* inToOut_Updated_Sorted; if(mUpdatedBoxes.allocate(nbUpdated)) { inToOut_Updated_Sorted = reinterpret_cast(keys); PX_FREE(mInToOut_Updated); mInToOut_Updated = inToOut_Updated_Sorted; } else { PX_FREE(keys); inToOut_Updated_Sorted = mInToOut_Updated; } SIMD_AABB_X4* PX_RESTRICT dstBoxesX = mUpdatedBoxes.getBoxes_X(); initSentinels(dstBoxesX, nbUpdated); #ifdef USE_ABP_BUCKETS Vec4V minV = V4Load(FLT_MAX); Vec4V maxV = V4Load(-FLT_MAX); #endif // PT: TODO: parallel? Everything indexed by i should be fine, things indexed by userID might have some false sharing for(PxU32 i=0;i mDelayedPairs; const ABP_Index* mInToOut0; const ABP_Index* mInToOut1; //char mBuffer[256]; }; } static PX_FORCE_INLINE void outputPair(PairManagerMT& pairManager, PxU32 index0, PxU32 index1) { pairManager.mSharedPM->addDelayedPair(pairManager.mDelayedPairs, pairManager.mInToOut0, pairManager.mInToOut1, index0, index1); } #endif #ifdef ABP_MT2 #define NB_BIP_TASKS 15 enum ABP_TaskID { ABP_TASK_0, ABP_TASK_1, }; class ABP_InternalTask : public PxLightCpuTask { public: ABP_InternalTask(ABP_TaskID id) : mBP(NULL), mID(id) {} virtual const char* getName() const PX_OVERRIDE { return "ABP_InternalTask"; } virtual void run() PX_OVERRIDE; virtual bool isHighPriority() const PX_OVERRIDE { return true; } BroadPhaseABP* mBP; ABP_TaskID mID; }; class ABP_CompleteBoxPruningStartTask; class ABP_CompleteBoxPruningTask : public PxLightCpuTask { public: ABP_CompleteBoxPruningTask() : mStartTask(NULL), mType(0), mID(0) { } virtual const char* getName() const PX_OVERRIDE { return "ABP_CompleteBoxPruningTask"; } virtual void run() PX_OVERRIDE; virtual bool isHighPriority() const PX_OVERRIDE { return true; } ABP_CompleteBoxPruningStartTask* mStartTask; PxU16 mType; PxU16 mID; PxU32 mCounter; const SIMD_AABB_X4* mBoxListX; const SIMD_AABB_YZ4* mBoxListYZ; const PxU32* mRemap; 
PxU32 mCounter4; const SIMD_AABB_X4* mBoxListX4; const SIMD_AABB_YZ4* mBoxListYZ4; const PxU32* mRemap4; PairManagerMT mPairs; PX_FORCE_INLINE bool isThereWorkToDo() const { if(!mCounter) return false; if(mType) return mCounter4!=0; return true; } }; class ABP_CompleteBoxPruningEndTask : public PxLightCpuTask { public: ABP_CompleteBoxPruningEndTask() : mStartTask(NULL) {} virtual const char* getName() const PX_OVERRIDE { return "ABP_CompleteBoxPruningEndTask"; } virtual void run() PX_OVERRIDE; virtual bool isHighPriority() const PX_OVERRIDE { return true; } ABP_CompleteBoxPruningStartTask* mStartTask; }; class ABP_CompleteBoxPruningStartTask : public PxLightCpuTask { public: ABP_CompleteBoxPruningStartTask(); virtual const char* getName() const PX_OVERRIDE { return "ABP_CompleteBoxPruningStartTask"; } void setup( //ABP_MM& memoryManager, const PxBounds3& updatedBounds, ABP_PairManager* PX_RESTRICT pairManager, PxU32 nb, const SIMD_AABB_X4* PX_RESTRICT listX, const SIMD_AABB_YZ4* PX_RESTRICT listYZ, const ABP_Index* PX_RESTRICT inputRemap, PxU64 contextID); void addDelayedPairs(); void addDelayedPairs2(PxArray& createdPairs); virtual void run() PX_OVERRIDE; virtual bool isHighPriority() const PX_OVERRIDE { return true; } const SIMD_AABB_X4* mListX; const SIMD_AABB_YZ4* mListYZ; const ABP_Index* mInputRemap; ABP_PairManager* mPairManager; PxU32* mRemap; SIMD_AABB_X4* mBoxListXBuffer; SIMD_AABB_YZ4* mBoxListYZBuffer; PxU32 mCounters[NB_BUCKETS]; SIMD_AABB_X4* mBoxListX[NB_BUCKETS]; SIMD_AABB_YZ4* mBoxListYZ[NB_BUCKETS]; PxU32* mRemapBase[NB_BUCKETS]; PxBounds3 mBounds; PxU32 mNb; ABP_CompleteBoxPruningTask mTasks[9]; ABP_CompleteBoxPruningEndTask mEndTask; }; #endif typedef BoxManager DynamicManager; typedef BoxManager StaticManager; class ABP : public PxUserAllocated { PX_NOCOPY(ABP) public: ABP(PxU64 contextID); ~ABP(); void preallocate(PxU32 nbObjects, PxU32 maxNbOverlaps); void reset(); void freeBuffers(); void addStaticObjects(const BpHandle* userIDs, PxU32 nb, 
PxU32 maxID); void addDynamicObjects(const BpHandle* userIDs, PxU32 nb, PxU32 maxID); void addKinematicObjects(const BpHandle* userIDs, PxU32 nb, PxU32 maxID); void removeObject(BpHandle userID); void updateObject(BpHandle userID); void findOverlaps(PxBaseTask* continuation, const Bp::FilterGroup::Enum* PX_RESTRICT groups, const bool* PX_RESTRICT lut); PxU32 finalize(PxArray& createdPairs, PxArray& deletedPairs); void shiftOrigin(const PxVec3& shift, const PxBounds3* boundsArray, const PxReal* contactDistances); void setTransientData(const PxBounds3* bounds, const PxReal* contactDistance); void Region_prepareOverlaps(); ABP_MM mMM; BoxManager mSBM; DynamicManager mDBM; RadixSortBuffered mRS; DynamicManager mKBM; ABP_SharedData mShared; ABP_PairManager mPairManager; const PxU64 mContextID; #ifdef ABP_MT2 ABP_InternalTask mTask0; ABP_InternalTask mTask1; ABP_CompleteBoxPruningStartTask mCompleteBoxPruningTask0; ABP_CompleteBoxPruningStartTask mCompleteBoxPruningTask1; ABP_CompleteBoxPruningTask mBipTasks[NB_BIP_TASKS]; void addDelayedPairs(); void addDelayedPairs2(PxArray& createdPairs); #endif }; #ifdef ABP_SIMD_OVERLAP #define ABP_OVERLAP_TEST(x) SIMD_OVERLAP_TEST(x) #else #define ABP_OVERLAP_TEST(x) if(intersect2D(box0, x)) #endif /////////////////////////////////////////////////////////////////////////////// ABP_PairManager::ABP_PairManager() : mGroups (NULL), mInToOut0 (NULL), mInToOut1 (NULL), mLUT (NULL) { } /////////////////////////////////////////////////////////////////////////////// ABP_PairManager::~ABP_PairManager() { } /////////////////////////////////////////////////////////////////////////////// InternalPair* ABP_PairManager::addPair(PxU32 index0, PxU32 index1) { const PxU32 id0 = mInToOut0[index0]; const PxU32 id1 = mInToOut1[index1]; PX_ASSERT(id0!=id1); PX_ASSERT(id0!=INVALID_ID); PX_ASSERT(id1!=INVALID_ID); PX_ASSERT(mGroups); { if(!groupFiltering(mGroups[id0], mGroups[id1], mLUT)) return NULL; } return addPairInternal(id0, id1); } #ifdef ABP_MT 
// MT codepath: instead of inserting a new pair directly into the shared hash table,
// record it in a per-task "delayed pairs" array to be merged later (single-threaded).
// Persistent pairs are still flagged in place via setUpdated().
// NOTE(review): PxArray template arguments appear lost in extraction (likely PxArray<DelayedPair>) — verify against upstream.
void ABP_PairManager::addDelayedPair(PxArray& delayedPairs, const ABP_Index* inToOut0, const ABP_Index* inToOut1, PxU32 index0, PxU32 index1) const
{
	/*const*/ PxU32 id0 = inToOut0[index0];
	/*const*/ PxU32 id1 = inToOut1[index1];
	PX_ASSERT(id0!=id1);
	PX_ASSERT(id0!=INVALID_ID);
	PX_ASSERT(id1!=INVALID_ID);
	PX_ASSERT(mGroups);
	{
		if(!groupFiltering(mGroups[id0], mGroups[id1], mLUT))
			return;
	}

	if(1)
	{
		// Order the ids
		sort(id0, id1);

		const PxU32 fullHashValue = hash(id0, id1);
		PxU32 hashValue = fullHashValue & mMask;

		{
			InternalPair* /*PX_RESTRICT*/ p = findPair(id0, id1, hashValue);
			if(p)
			{
				p->setUpdated();	// ### PT: potential false sharing here
				//return p;	// Persistent pair
				return;	// Persistent pair
			}
		}

		{
			/*// This is a new pair
			if(mNbActivePairs >= mHashSize)
				hashValue = growPairs(fullHashValue);
			const PxU32 pairIndex = mNbActivePairs++;
			InternalPair* PX_RESTRICT p = &mActivePairs[pairIndex];
			p->setNewPair(id0, id1);
			mNext[pairIndex] = mHashTable[hashValue];
			mHashTable[hashValue] = pairIndex;
			return p;*/
			// New pair: queue it for the later single-threaded merge, caching the full hash.
			DelayedPair* newPair = Cm::reserveContainerMemory(delayedPairs, 1);
			newPair->mID0 = id0;
			newPair->mID1 = id1;
			newPair->mHash = fullHashValue;
		}
	}
}

// Grows the hash table ahead of a batch insertion of nbDelayedPairs new pairs, so the
// batched add loops can assume no rehash is needed mid-insert.
void ABP_PairManager::resizeForNewPairs(PxU32 nbDelayedPairs)
{
	PxU32 currentNbPairs = mNbActivePairs;
	const PxU32 newNbPairs = currentNbPairs + nbDelayedPairs;
	const PxU32 newHashSize = PxNextPowerOfTwo(newNbPairs + 1);
	if(newHashSize == mHashSize)
		return;

	// Get more entries
	mHashSize = newHashSize;
	mMask = newHashSize - 1;

	//reallocPairs();
	{
		MBP_FREE(mHashTable);
		// NOTE(review): reinterpret_cast template arguments appear lost in extraction throughout this chunk — verify against upstream.
		mHashTable = reinterpret_cast(MBP_ALLOC(mHashSize*sizeof(PxU32)));
		//storeDwords(mHashTable, mHashSize, INVALID_ID);
		if(0)
		{
			PxU32 nb = mHashSize;
			PxU32* dest = mHashTable;
			while(nb--)
				*dest++ = INVALID_ID;
		}
		else
			PxMemSet(mHashTable, 0xff, mHashSize*sizeof(PxU32));

		// Get some bytes for new entries
		InternalPair* newPairs = reinterpret_cast(MBP_ALLOC(mHashSize * sizeof(InternalPair)));
		PX_ASSERT(newPairs);
		PxU32* newNext = reinterpret_cast(MBP_ALLOC(mHashSize * sizeof(PxU32)));
		PX_ASSERT(newNext);

		// Copy old data if needed
		if(currentNbPairs)
			PxMemCopy(newPairs, mActivePairs, currentNbPairs*sizeof(InternalPair));

		// ### check it's actually needed... probably only for pairs whose hash value was cut by the and
		// yeah, since hash(id0, id1) is a constant
		// However it might not be needed to recompute them => only less efficient but still ok
		// PT: TODO: in heavy scenes like Avalanche100K the number of pairs gets close to a million, and this loop becomes very expensive. Revisit.
		// NOTE(review): source text appears truncated by extraction between the loop header below and
		// "& delayedPairs)": the rehash loop bound, the tail of resizeForNewPairs() and the signature of
		// ABP_PairManager::addDelayedPairs() seem to be missing — verify against upstream.
		for(PxU32 i=0;i& delayedPairs)
	{
		// Reference (slow) path: insert delayed pairs one by one, growing on demand.
		if(0)
		{
			PxU32 nbDelayedPairs = delayedPairs.size();
			const DelayedPair* pairs = delayedPairs.begin();
			while(nbDelayedPairs--)
			{
				const DelayedPair& dp = *pairs++;
				const PxU32 fullHashValue = dp.mHash;
				PxU32 hashValue = fullHashValue & mMask;

				if(mNbActivePairs >= mHashSize)
					hashValue = growPairs(fullHashValue);
				const PxU32 pairIndex = mNbActivePairs++;
				InternalPair* PX_RESTRICT p = &mActivePairs[pairIndex];
				p->setNewPair(dp.mID0, dp.mID1);
				mNext[pairIndex] = mHashTable[hashValue];
				mHashTable[hashValue] = pairIndex;
			}
		}
		else
		{
			// Fast path: assumes resizeForNewPairs() was called beforehand, so no growth happens here.
			PxU32 nbDelayedPairs = delayedPairs.size();
			PxU32 currentNbPairs = mNbActivePairs;
			//resizeForNewPairs(nbDelayedPairs);
			{
				const PxU32 mask = mMask;
				PxU32* PX_RESTRICT hashTable = mHashTable;
				PxU32* PX_RESTRICT next = mNext;
				InternalPair* PX_RESTRICT internalPairs = mActivePairs;
				const DelayedPair* PX_RESTRICT pairs = delayedPairs.begin();
				while(nbDelayedPairs--)
				{
					const DelayedPair& dp = *pairs++;
					const PxU32 fullHashValue = dp.mHash;
					const PxU32 hashValue = fullHashValue & mask;
					PX_ASSERT(currentNbPairs < mHashSize);
					const PxU32 pairIndex = currentNbPairs++;
					internalPairs[pairIndex].setNewPair(dp.mID0, dp.mID1);
					next[pairIndex] = hashTable[hashValue];
					hashTable[hashValue] = pairIndex;
				}
				mNbActivePairs = currentNbPairs;
			}
		}
	}

// Same batched insertion as above, but also emits each new pair directly into createdPairs
// (setNewPair2 variant), so no later diff pass is needed for these.
// NOTE(review): PxArray template arguments appear lost in extraction — verify against upstream.
void ABP_PairManager::addDelayedPairs2(PxArray& createdPairs, const PxArray& delayedPairs)
{
	PxU32 nbDelayedPairs = delayedPairs.size();
	PxU32 currentNbPairs = mNbActivePairs;
	//resizeForNewPairs(nbDelayedPairs);

	BroadPhasePair* newPair = Cm::reserveContainerMemory(createdPairs, nbDelayedPairs);
	{
		const PxU32 mask = mMask;
		PxU32* PX_RESTRICT hashTable = mHashTable;
		PxU32* PX_RESTRICT next = mNext;
		InternalPair* PX_RESTRICT internalPairs = mActivePairs;
		const DelayedPair* PX_RESTRICT pairs = delayedPairs.begin();
		while(nbDelayedPairs--)
		{
			const DelayedPair& dp = *pairs++;
			const PxU32 fullHashValue = dp.mHash;
			const PxU32 hashValue = fullHashValue & mask;
			PX_ASSERT(currentNbPairs < mHashSize);
			const PxU32 pairIndex = currentNbPairs++;
			internalPairs[pairIndex].setNewPair2(dp.mID0, dp.mID1);
			{
				newPair->mVolA = dp.mID0;
				newPair->mVolB = dp.mID1;
				newPair++;
			}
			next[pairIndex] = hashTable[hashValue];
			hashTable[hashValue] = pairIndex;
		}
		mNbActivePairs = currentNbPairs;
	}
}
#endif

///////////////////////////////////////////////////////////////////////////////

// SIMD 2D (YZ) overlap tests. Two variants are provided per platform family.
#if PX_INTEL_FAMILY
	#define SIMD_OVERLAP_TEST_14a(box)	_mm_movemask_ps(_mm_cmpngt_ps(b, _mm_load_ps(box)))==15

	#define SIMD_OVERLAP_INIT_9c(box)	\
		__m128 b = _mm_shuffle_ps(_mm_load_ps(&box.mMinY), _mm_load_ps(&box.mMinY), 78);\
		const float Coeff = -1.0f;\
		b = _mm_mul_ps(b, _mm_load1_ps(&Coeff));

	#define SIMD_OVERLAP_TEST_9c(box)	\
		const __m128 a = _mm_load_ps(&box.mMinY);	\
		const __m128 d = _mm_cmpge_ps(a, b);	\
		if(_mm_movemask_ps(d)==15)
#else
	#define SIMD_OVERLAP_TEST_14a(box)	BAllEqFFFF(V4IsGrtr(b, V4LoadA(box)))

	#define SIMD_OVERLAP_INIT_9c(box)	\
		Vec4V b = V4PermZWXY(V4LoadA(&box.mMinY));	\
		b = V4Mul(b, V4Load(-1.0f));

	#define SIMD_OVERLAP_TEST_9c(box)	\
		const Vec4V a = V4LoadA(&box.mMinY);	\
		const Vec4V d = V4IsGrtrOrEq(a, b);	\
		if(BAllEqTTTT(d))
#endif

#ifdef ABP_SIMD_OVERLAP
	#define SIMD_OVERLAP_PRELOAD_BOX0	SIMD_OVERLAP_INIT_9c(box0)
	#define SIMD_OVERLAP_TEST(x)		SIMD_OVERLAP_TEST_9c(x)
#else
	#define SIMD_OVERLAP_PRELOAD_BOX0
#endif

#ifndef ABP_SIMD_OVERLAP
// Scalar 2D YZ overlap test, used when the SIMD path is disabled.
static PX_FORCE_INLINE int intersect2D(const SIMD_AABB_YZ4& a, const SIMD_AABB_YZ4& b)
{
/*	if( b.mMaxY < a.mMinY || a.mMaxY < b.mMinY
	||	b.mMaxZ < a.mMinZ || a.mMaxZ < b.mMinZ )
		return 0;
	return 1;*/

	// Branchless form of the commented-out code above: gather the four separating-axis
	// tests and combine with bitwise OR.
	const bool b0 = b.mMaxY < a.mMinY;
	const bool b1 = a.mMaxY < b.mMinY;
	const bool b2 = b.mMaxZ < a.mMinZ;
	const bool b3 = a.mMaxZ < b.mMinZ;
//	const bool b4 = b0 || b1 || b2 || b3;
	const bool b4 = b0 | b1 | b2 | b3;
	return !b4;
}
#endif

// Forwards a found overlap (pair of internal indices) to the pair manager.
static PX_FORCE_INLINE void outputPair(ABP_PairManager& pairManager, PxU32 index0, PxU32 index1)
{
	pairManager.addPair(index0, index1);
}

// Bipartite sweep kernel: for each box of list 0, scans list 1 along sorted X and tests
// YZ overlap. Uses goto-based loop unrolling when gUnrollLoop is set.
// NOTE(review): the template parameter list and several spans inside this function appear
// to have been lost in extraction (e.g. the while-loop condition and per-iteration setup
// between "runningIndex1" and the CurrentBoxListYZ cast) — verify against upstream.
template
static void boxPruningKernel(	PxU32 nb0, PxU32 nb1,
								const SIMD_AABB_X4* PX_RESTRICT boxes0_X, const SIMD_AABB_X4* PX_RESTRICT boxes1_X,
								const SIMD_AABB_YZ4* PX_RESTRICT boxes0_YZ, const SIMD_AABB_YZ4* PX_RESTRICT boxes1_YZ,
								const ABP_Index* PX_RESTRICT inToOut0, const ABP_Index* PX_RESTRICT inToOut1,
								ABP_PairManagerT* PX_RESTRICT pairManager)
{
	pairManager->mInToOut0 = inToOut0;
	pairManager->mInToOut1 = inToOut1;

	PxU32 index0 = 0;
	PxU32 runningIndex1 = 0;

	while(runningIndex1(&boxes1_YZ[runningIndex1]);
	const char* const CurrentBoxListX = reinterpret_cast(&boxes1_X[runningIndex1]);

	if(!gUnrollLoop)
	{
		// Simple (non-unrolled) scan: advance while candidate min X is within box0's max X.
		while(*reinterpret_cast(CurrentBoxListX + Offset)<=maxLimit)
		{
			const float* box = reinterpret_cast(CurrentBoxListYZ + Offset*2);
#ifdef ABP_SIMD_OVERLAP
			if(SIMD_OVERLAP_TEST_14a(box))
#else
			if(intersect2D(box0, *reinterpret_cast(box)))
#endif
			{
				const PxU32 Index1 = PxU32(CurrentBoxListX + Offset - reinterpret_cast(boxes1_X))>>3;
				outputPair(*pairManager, index0, Index1);
			}
			Offset += 8;
		}
	}
	else
	{
#define BIP_VERSION4
#ifdef BIP_VERSION4
	// 4x unrolled scan with goto-based overlap reporting.
	#ifdef ABP_SIMD_OVERLAP
		#define BLOCK4(x, label)	{const float* box = reinterpret_cast(CurrentBoxListYZ + Offset*2 + x*2);	\
									if(SIMD_OVERLAP_TEST_14a(box))	\
										goto label;	}
	#else
		#define BLOCK4(x, label)	{const float* box = reinterpret_cast(CurrentBoxListYZ + Offset*2 + x*2);	\
									if(intersect2D(box0, *reinterpret_cast(box)))	\
										goto label;	}
	#endif
		goto StartLoop4;
		CODEALIGN16
FoundOverlap3:
		Offset += 8;
		CODEALIGN16
FoundOverlap2:
		Offset += 8;
		CODEALIGN16
FoundOverlap1:
		Offset += 8;
		CODEALIGN16
FoundOverlap0:
		Offset += 8;
		CODEALIGN16
FoundOverlap:
		{
			const PxU32 Index1 = PxU32(CurrentBoxListX + Offset - 8 - reinterpret_cast(boxes1_X))>>3;
			outputPair(*pairManager, index0, Index1);
		}
		CODEALIGN16
StartLoop4:
		while(*reinterpret_cast(CurrentBoxListX + Offset + 8*5)<=maxLimit)
		{
			BLOCK4(0, FoundOverlap0)
			BLOCK4(8, FoundOverlap1)
			BLOCK4(16, FoundOverlap2)
			BLOCK4(24, FoundOverlap3)
			Offset += 40;
			BLOCK4(-8, FoundOverlap)
		}
#undef BLOCK4
#endif
		// Safe (remainder) loop: handles the tail the unrolled loop could not.
	#ifdef ABP_SIMD_OVERLAP
		#define BLOCK	if(*reinterpret_cast(CurrentBoxListX + Offset)<=maxLimit)	\
						{if(SIMD_OVERLAP_TEST_14a(reinterpret_cast(CurrentBoxListYZ + Offset*2)))	\
							goto OverlapFound;	\
						Offset += 8;
	#else
		#define BLOCK	if(*reinterpret_cast(CurrentBoxListX + Offset)<=maxLimit)	\
						{if(intersect2D(box0, *reinterpret_cast(CurrentBoxListYZ + Offset*2)))	\
							goto OverlapFound;	\
						Offset += 8;
	#endif
		goto LoopStart;
		CODEALIGN16
OverlapFound:
		{
			const PxU32 Index1 = PxU32(CurrentBoxListX + Offset - reinterpret_cast(boxes1_X))>>3;
			outputPair(*pairManager, index0, Index1);
		}
		Offset += 8;
		CODEALIGN16
LoopStart:
		BLOCK
		BLOCK
		BLOCK
		}
	}
	goto LoopStart;
}
#undef BLOCK
	}
	}
	index0++;
	}
}

// Bipartite pruning between two sorted box lists: runs the kernel both ways so every
// overlapping pair is found exactly once. Both lists must be terminated by sentinels.
// NOTE(review): template parameter lists appear lost in extraction — verify against upstream.
template
static /*PX_FORCE_INLINE*/ void doBipartiteBoxPruning_Leaf(	ABP_PairManagerT* PX_RESTRICT pairManager, PxU32 nb0, PxU32 nb1,
															const SIMD_AABB_X4* PX_RESTRICT boxes0_X, const SIMD_AABB_X4* PX_RESTRICT boxes1_X,
															const SIMD_AABB_YZ4* PX_RESTRICT boxes0_YZ, const SIMD_AABB_YZ4* PX_RESTRICT boxes1_YZ,
															const ABP_Index* PX_RESTRICT remap0, const ABP_Index* PX_RESTRICT remap1)
{
	PX_ASSERT(boxes0_X[nb0].isSentinel());
	PX_ASSERT(boxes1_X[nb1].isSentinel());
	boxPruningKernel<0>(nb0, nb1, boxes0_X, boxes1_X, boxes0_YZ, boxes1_YZ, remap0, remap1, pairManager);
	boxPruningKernel<1>(nb1, nb0, boxes1_X, boxes0_X, boxes1_YZ, boxes0_YZ, remap1, remap0, pairManager);
}

// Convenience overload taking SplitBoxes containers instead of raw pointers.
template
static PX_FORCE_INLINE void doBipartiteBoxPruning_Leaf(ABP_PairManagerT* PX_RESTRICT pairManager, PxU32 nb0, PxU32 nb1,
														const SplitBoxes& boxes0, const SplitBoxes& boxes1,
														const ABP_Index* PX_RESTRICT remap0, const ABP_Index* PX_RESTRICT remap1)
{
	doBipartiteBoxPruning_Leaf(pairManager, nb0, nb1,
								boxes0.getBoxes_X(), boxes1.getBoxes_X(),
								boxes0.getBoxes_YZ(), boxes1.getBoxes_YZ(),
								remap0, remap1);
}

// Complete (self) pruning of a single sorted box list, same sweep structure as the
// bipartite kernel above.
// NOTE(review): the template parameter list and the while-loop condition / per-iteration
// setup appear lost in extraction — verify against upstream.
template
static void doCompleteBoxPruning_Leaf(	ABP_PairManagerT* PX_RESTRICT pairManager, PxU32 nb,
										const SIMD_AABB_X4* PX_RESTRICT boxes_X, const SIMD_AABB_YZ4* PX_RESTRICT boxes_YZ,
										const ABP_Index* PX_RESTRICT remap)
{
	pairManager->mInToOut0 = remap;
	pairManager->mInToOut1 = remap;

	PxU32 index0 = 0;
	PxU32 runningIndex = 0;

	while(runningIndex(&boxes_YZ[runningIndex]);
	const char* const CurrentBoxListX = reinterpret_cast(&boxes_X[runningIndex]);

	if(!gUnrollLoop)
	{
		while(*reinterpret_cast(CurrentBoxListX + Offset)<=maxLimit)
		{
			const float* box = reinterpret_cast(CurrentBoxListYZ + Offset*2);
#ifdef ABP_SIMD_OVERLAP
			if(SIMD_OVERLAP_TEST_14a(box))
#else
			if(intersect2D(box0, *reinterpret_cast(box)))
#endif
			{
				const PxU32 Index = PxU32(CurrentBoxListX + Offset - reinterpret_cast(boxes_X))>>3;
				outputPair(*pairManager, index0, Index);
			}
			Offset += 8;
		}
	}
	else
	{
#define VERSION4c
#ifdef VERSION4c
#define VERSION3	// Enable this as our safe loop
	#ifdef ABP_SIMD_OVERLAP
		#define BLOCK4(x, label)	{const float* box = reinterpret_cast(CurrentBoxListYZ + Offset*2 + x*2);	\
									if(SIMD_OVERLAP_TEST_14a(box))	\
										goto label;	}
	#else
		#define BLOCK4(x, label)	{const SIMD_AABB_YZ4* box = reinterpret_cast(CurrentBoxListYZ + Offset*2 + x*2);	\
									if(intersect2D(box0, *box))	\
										goto label;	}
	#endif
		goto StartLoop4;
		CODEALIGN16
FoundOverlap3:
		Offset += 8;
		CODEALIGN16
FoundOverlap2:
		Offset += 8;
		CODEALIGN16
FoundOverlap1:
		Offset += 8;
		CODEALIGN16
FoundOverlap0:
		Offset += 8;
		CODEALIGN16
FoundOverlap:
		{
			const PxU32 Index = PxU32(CurrentBoxListX + Offset - 8 - reinterpret_cast(boxes_X))>>3;
			outputPair(*pairManager, index0, Index);
		}
		CODEALIGN16
StartLoop4:
		while(*reinterpret_cast(CurrentBoxListX + Offset + 8*5)<=maxLimit)
		{
			BLOCK4(0, FoundOverlap0)
			BLOCK4(8, FoundOverlap1)
			BLOCK4(16, FoundOverlap2)
			BLOCK4(24, FoundOverlap3)
			Offset += 40;
			BLOCK4(-8, FoundOverlap)
		}
#endif
#define VERSION3
#ifdef VERSION3
	#ifdef ABP_SIMD_OVERLAP
		#define BLOCK	if(*reinterpret_cast(CurrentBoxListX + Offset)<=maxLimit)	\
						{if(SIMD_OVERLAP_TEST_14a(reinterpret_cast(CurrentBoxListYZ + Offset*2)))	\
							goto BeforeLoop;	\
						Offset += 8;
	#else
		#define BLOCK	if(*reinterpret_cast(CurrentBoxListX + Offset)<=maxLimit)	\
						{if(intersect2D(box0, *reinterpret_cast(CurrentBoxListYZ + Offset*2)))	\
							goto BeforeLoop;	\
						Offset += 8;
	#endif
		goto StartLoop;
		CODEALIGN16
BeforeLoop:
		{
			const PxU32 Index = PxU32(CurrentBoxListX + Offset - reinterpret_cast(boxes_X))>>3;
			outputPair(*pairManager, index0, Index);
			Offset += 8;
		}
		CODEALIGN16
StartLoop:
		BLOCK
		BLOCK
		BLOCK
		BLOCK
		BLOCK
		}
	}
	}
	}
	goto StartLoop;
}
#endif
	}
	}
	index0++;
	}
}

#ifdef USE_ABP_BUCKETS
// Lookup table mapping the 4-bit classification code to one of 5 buckets (255 = invalid).
static const PxU8 gCodes[] = {	4, 4, 4, 255, 4, 3,
								2, 255, 4, 1, 0, 255,
								255, 255, 255, 255	};

// Classifies a box into one of 5 buckets (4 quadrants + "crossing" bucket) relative to
// the (limitY, limitZ) split point, using the table above instead of branches.
static PX_FORCE_INLINE PxU8 classifyBoxNew(const SIMD_AABB_YZ4& boxYZ, const float limitY, const float limitZ)
{
#ifdef ABP_SIMD_OVERLAP
	// PT: mins have been negated for SIMD tests
	const bool upperPart = (-boxYZ.mMinZ) > limitZ;
	const bool rightPart = (-boxYZ.mMinY) > limitY;
#else
	const bool upperPart = boxYZ.mMinZ > limitZ;
	const bool rightPart = boxYZ.mMinY > limitY;
#endif
	const bool lowerPart = boxYZ.mMaxZ < limitZ;
	const bool leftPart = boxYZ.mMaxY < limitY;

	// Table-based box classification avoids many branches
	const PxU32 Code = PxU32(rightPart)|(PxU32(leftPart)<<1)|(PxU32(upperPart)<<2)|(PxU32(lowerPart)<<3);
	PX_ASSERT(gCodes[Code]!=255);
	return gCodes[Code];
}

#ifdef RECURSE_LIMIT
// Recursive bucketed complete-pruning: splits boxes into buckets around the merged
// bounds' center and recurses (continues past this chunk's view).
static void CompleteBoxPruning_Recursive(	ABP_MM& memoryManager, ABP_PairManager* PX_RESTRICT pairManager, PxU32 nb,
											const SIMD_AABB_X4* PX_RESTRICT listX, const SIMD_AABB_YZ4* PX_RESTRICT listYZ,
											const ABP_Index* PX_RESTRICT remap, const ABPEntry* PX_RESTRICT objects)
{
//	printf("CompleteBoxPruning_Recursive %d\n", nb);
	if(!nb)
// (continuation of CompleteBoxPruning_Recursive: empty input → nothing to do)
	return;

	// Compute the merged YZ bounds of all boxes to derive the bucket split point.
	/*__declspec(align(16))*/ float mergedMin[4];
	/*__declspec(align(16))*/ float mergedMax[4];
	{
		//#ifdef SAFE_VERSION
		Vec4V maxV = V4LoadA(&listYZ[0].mMinY);
		// NOTE(review): source text appears truncated by extraction below (merge loop bound, bounds
		// stores, buffer allocation casts) — verify against upstream.
		for(PxU32 i=1;i(memoryManager.frameAlloc(sizeof(SIMD_AABB_X4)*(nb+NB_SENTINELS*NB_BUCKETS)));
		SIMD_AABB_YZ4* BoxListYZBuffer = reinterpret_cast(memoryManager.frameAlloc(sizeof(SIMD_AABB_YZ4)*nb));

		PxU32 Counters[NB_BUCKETS];
		// NOTE(review): truncated again — counters init and remap allocation appear merged.
		for(PxU32 i=0;i(memoryManager.frameAlloc(sizeof(PxU32)*nb));
		PxU8* Indices = reinterpret_cast(memoryManager.frameAlloc(sizeof(PxU8)*nb));
		// NOTE(review): truncated again — the classification/recursion body and the head of
		// ABP_CompleteBoxPruningEndTask::run() appear to be missing; the PX_FREE calls below
		// are that end task releasing the start task's buffers.
		for(PxU32 i=0;imRemap);
	PX_FREE(mStartTask->mBoxListYZBuffer);
	PX_FREE(mStartTask->mBoxListXBuffer);
}

ABP_CompleteBoxPruningStartTask::ABP_CompleteBoxPruningStartTask() :
	mListX			(NULL),
	mListYZ			(NULL),
	mInputRemap		(NULL),
	mPairManager	(NULL),
	mRemap			(NULL),
	mBoxListXBuffer	(NULL),
	mBoxListYZBuffer(NULL),
	mNb				(0)
{
}

// Captures the inputs and allocates the bucketed output buffers, then wires up the
// child tasks (mTasks) and the cleanup task (mEndTask) to this start task.
void ABP_CompleteBoxPruningStartTask::setup(
	//ABP_MM& memoryManager,
	const PxBounds3& updatedBounds, ABP_PairManager* PX_RESTRICT pairManager,
	PxU32 nb, const SIMD_AABB_X4* PX_RESTRICT listX, const SIMD_AABB_YZ4* PX_RESTRICT listYZ,
	const ABP_Index* PX_RESTRICT inputRemap, PxU64 contextID)
{
	mListX = listX;
	mListYZ = listYZ;
	mInputRemap = inputRemap;
	mPairManager = pairManager;
	mBounds = updatedBounds;
	mContextID = contextID;
	mNb = nb;

	// PT: TODO: revisit allocs
	//mBoxListXBuffer = reinterpret_cast(memoryManager.frameAlloc(sizeof(SIMD_AABB_X4)*(nb+NB_SENTINELS*NB_BUCKETS)));
	//mBoxListYZBuffer = reinterpret_cast(memoryManager.frameAlloc(sizeof(SIMD_AABB_YZ4)*nb));
	mBoxListXBuffer = reinterpret_cast(PX_ALLOC(sizeof(SIMD_AABB_X4)*(nb+NB_SENTINELS*NB_BUCKETS), "mBoxListXBuffer"));
	mBoxListYZBuffer = reinterpret_cast(PX_ALLOC(sizeof(SIMD_AABB_YZ4)*nb, "mBoxListYZBuffer"));
	//mRemap = reinterpret_cast(memoryManager.frameAlloc(sizeof(PxU32)*nb));
	mRemap = reinterpret_cast(PX_ALLOC(sizeof(PxU32)*nb, "mRemap"));

	mEndTask.mStartTask = this;
	for(PxU32 i=0;i<9;i++)
		mTasks[i].mStartTask = this;
}

// Buckets the updated boxes around the merged-bounds center (classifyBoxNew), filling the
// per-bucket lists that the 9 worker tasks will prune.
void ABP_CompleteBoxPruningStartTask::run()
{
//	printf("Running ABP_CompleteBoxPruningStartTask\n");
	const SIMD_AABB_X4* PX_RESTRICT listX = mListX;
	const SIMD_AABB_YZ4* PX_RESTRICT listYZ = mListYZ;
	const ABP_Index* PX_RESTRICT remap = mInputRemap;
	const PxU32 nb = mNb;
	PxU32* PX_RESTRICT Remap = mRemap;
	SIMD_AABB_X4* PX_RESTRICT BoxListXBuffer = mBoxListXBuffer;
	SIMD_AABB_YZ4* PX_RESTRICT BoxListYZBuffer = mBoxListYZBuffer;
	PxU32* PX_RESTRICT Counters = mCounters;
	SIMD_AABB_X4** PX_RESTRICT BoxListX = mBoxListX;
	SIMD_AABB_YZ4** PX_RESTRICT BoxListYZ = mBoxListYZ;
	PxU32** PX_RESTRICT RemapBase = mRemapBase;
	{
		PX_PROFILE_ZONE("ABP_CompleteBoxPruningStartTask - Run", mContextID);

		// PT: TODO: revisit allocs
		//BoxListXBuffer = reinterpret_cast(memoryManager.frameAlloc(sizeof(SIMD_AABB_X4)*(nb+NB_SENTINELS*NB_BUCKETS)));
		//BoxListYZBuffer = reinterpret_cast(memoryManager.frameAlloc(sizeof(SIMD_AABB_YZ4)*nb));

		const PxVec3& mergedMin = mBounds.minimum;
		const PxVec3& mergedMax = mBounds.maximum;
		const float limitY = (mergedMax[1] + mergedMin[1]) * 0.5f;
		const float limitZ = (mergedMax[2] + mergedMin[2]) * 0.5f;
		// NOTE(review): source text appears truncated by extraction below (counters init / remap
		// allocation merged into one statement) — verify against upstream.
		for(PxU32 i=0;i(memoryManager.frameAlloc(sizeof(PxU32)*nb));
		// PT: TODO: revisit allocs
		//PxU8* Indices = reinterpret_cast(memoryManager.frameAlloc(sizeof(PxU8)*nb));
		PxU8* Indices = reinterpret_cast(PX_ALLOC(sizeof(PxU8)*nb, "Indices"));
		{
			PX_PROFILE_ZONE("BoxPruning - ClassifyBoxes", mContextID);
			// NOTE(review): truncated again — the classification loop body, the rest of run(), and
			// the head of ABP_CompleteBoxPruningStartTask::addDelayedPairs() appear to be missing.
			for(PxU32 i=0;iresizeForNewPairs(nbDelayedPairs);
		}
		for(PxU32 k=0; k<9; k++)
			mPairManager->addDelayedPairs(mTasks[k].mPairs.mDelayedPairs);
	}
}

// Merges the child tasks' delayed pairs into the shared pair manager AND reports them
// directly as created pairs (addDelayedPairs2 variant).
// NOTE(review): PxArray template arguments appear lost in extraction — verify against upstream.
void ABP_CompleteBoxPruningStartTask::addDelayedPairs2(PxArray& createdPairs)
{
	PX_PROFILE_ZONE("ABP_CompleteBoxPruningStartTask - add delayed pairs", mContextID);

	PxU32 nbDelayedPairs = 0;
	for(PxU32 k=0; k<9; k++)
		nbDelayedPairs += mTasks[k].mPairs.mDelayedPairs.size();

	if(nbDelayedPairs)
	{
		{
			PX_PROFILE_ZONE("BroadPhaseABP - resizeForNewPairs", mContextID);
			mPairManager->resizeForNewPairs(nbDelayedPairs);
		}
		for(PxU32 k=0; k<9; k++)
			mPairManager->addDelayedPairs2(createdPairs, mTasks[k].mPairs.mDelayedPairs);
	}
}
#endif

#ifndef USE_ALTERNATIVE_VERSION
// Bucketed complete-pruning entry point. When a continuation task is given (MT build),
// delegates the whole job to the start/end task chain; otherwise performs the bucketing
// synchronously using frame allocations.
static void CompleteBoxPruning_Version16(
#ifdef ABP_MT2
											ABP_CompleteBoxPruningStartTask& completeBoxPruningTask,
#endif
											ABP_MM& memoryManager,
											const PxBounds3& updatedBounds, ABP_PairManager* PX_RESTRICT pairManager, PxU32 nb,
											const SIMD_AABB_X4* PX_RESTRICT listX, const SIMD_AABB_YZ4* PX_RESTRICT listYZ,
											const ABP_Index* PX_RESTRICT remap, PxBaseTask* continuation, PxU64 contextID)
{
	PX_UNUSED(contextID);
	PX_UNUSED(continuation);

	if(!nb)
		return;

#ifdef ABP_MT2
	if(continuation)
	{
		// MT path: start task -> 9 workers -> end task -> continuation.
		completeBoxPruningTask.setup(updatedBounds, pairManager, nb, listX, listYZ, remap, contextID);
		completeBoxPruningTask.mEndTask.setContinuation(continuation);
		completeBoxPruningTask.setContinuation(&completeBoxPruningTask.mEndTask);
		completeBoxPruningTask.mEndTask.removeReference();
		completeBoxPruningTask.removeReference();
		return;
	}
#endif
	PxU32* Remap;
	SIMD_AABB_X4* BoxListXBuffer;
	SIMD_AABB_YZ4* BoxListYZBuffer;
	PxU32 Counters[NB_BUCKETS];
	SIMD_AABB_X4* BoxListX[NB_BUCKETS];
	SIMD_AABB_YZ4* BoxListYZ[NB_BUCKETS];
	PxU32* RemapBase[NB_BUCKETS];
	{
		PX_PROFILE_ZONE("BoxPruning - PrepareData", contextID);

		// PT: TODO: revisit allocs
		BoxListXBuffer = reinterpret_cast(memoryManager.frameAlloc(sizeof(SIMD_AABB_X4)*(nb+NB_SENTINELS*NB_BUCKETS)));
		BoxListYZBuffer = reinterpret_cast(memoryManager.frameAlloc(sizeof(SIMD_AABB_YZ4)*nb));

		const PxVec3& mergedMin = updatedBounds.minimum;
		const PxVec3& mergedMax = updatedBounds.maximum;
		const float limitY = (mergedMax[1] + mergedMin[1]) * 0.5f;
		const float limitZ = (mergedMax[2] + mergedMin[2]) * 0.5f;
		// NOTE(review): source text appears truncated by extraction below — verify against upstream.
		for(PxU32 i=0;i(memoryManager.frameAlloc(sizeof(PxU32)*nb));
		PxU8* Indices = reinterpret_cast(memoryManager.frameAlloc(sizeof(PxU8)*nb));
		{
			PX_PROFILE_ZONE("BoxPruning - ClassifyBoxes", contextID);
			// NOTE(review): truncated — the classification loop, the remainder of Version16 and
			// the head of doCompleteBoxPruning_ (including its bucket-count threshold test) appear
			// to be missing before the call below.
			for(PxU32 i=0;iUSE_ABP_BUCKETS)
		CompleteBoxPruning_Version16(
#ifdef ABP_MT2
			completeBoxPruningTask,
#endif
			memoryManager,
// (continuation of doCompleteBoxPruning_: arguments to CompleteBoxPruning_Version16)
			mDBM.getUpdatedBounds(), pairManager, nbUpdated, updatedDynamicBoxes_X, updatedDynamicBoxes_YZ, mDBM.getRemap_Updated(), continuation, contextID);
	else
#endif
	{
#ifdef ABP_MT2
		if(continuation)
		{
			// MT path: small list — run the complete pruning as a single worker task.
			bipTask1.mCounter = nbUpdated;
			bipTask1.mBoxListX = updatedDynamicBoxes_X;
			bipTask1.mBoxListYZ = updatedDynamicBoxes_YZ;
			bipTask1.mRemap = mDBM.getRemap_Updated();
			bipTask1.mType = 0;
			bipTask1.mPairs.mSharedPM = pairManager;
			//bipTask1.mPairs.mDelayedPairs.reserve(10000);
			if(bipTask1.isThereWorkToDo())
			{
				bipTask1.mID = 0;
				bipTask1.setContinuation(continuation);
				bipTask1.removeReference();
			}
		}
		else
#endif
			doCompleteBoxPruning_Leaf(pairManager, nbUpdated, updatedDynamicBoxes_X, updatedDynamicBoxes_YZ, mDBM.getRemap_Updated());
	}
	}
}

// Prepares per-manager data for the overlap queries; the shared radix sorter is reset afterwards.
void ABP::Region_prepareOverlaps()
{
	PX_PROFILE_ZONE("ABP - Region_prepareOverlaps", mContextID);

	if(	!mDBM.isThereWorkToDo()
		&& !mKBM.isThereWorkToDo()
		&& !mSBM.isThereWorkToDo()
		)
		return;

	if(mSBM.isThereWorkToDo())
		mSBM.prepareData(mRS, mShared.mABP_Objects, mShared.mABP_Objects_Capacity, mMM, mContextID);

	mDBM.prepareData(mRS, mShared.mABP_Objects, mShared.mABP_Objects_Capacity, mMM, mContextID);
	mKBM.prepareData(mRS, mShared.mABP_Objects, mShared.mABP_Objects_Capacity, mMM, mContextID);

	mRS.reset();
}

// Finds static-vs-dynamic and dynamic-vs-dynamic overlaps
static void findAllOverlaps(
#ifdef ABP_MT2
							ABP_CompleteBoxPruningStartTask& completeBoxPruningTask,
							ABP_CompleteBoxPruningTask& bipTask0, ABP_CompleteBoxPruningTask& bipTask1,
							ABP_CompleteBoxPruningTask& bipTask2, ABP_CompleteBoxPruningTask& bipTask3,
							ABP_CompleteBoxPruningTask& bipTask4,
#endif
							ABP_MM& memoryManager, ABP_PairManager& pairManager,
							const StaticManager& mSBM, const DynamicManager& mDBM,
							bool doComplete, bool doBipartite, PxBaseTask* continuation, PxU64 contextID)
{
	const PxU32 nbUpdatedBoxesDynamic = mDBM.getNbUpdatedBoxes();

	// PT: find dynamics-vs-dynamics overlaps
	if(doComplete)
		doCompleteBoxPruning_(
#ifdef ABP_MT2
			completeBoxPruningTask, bipTask3, bipTask4,
#endif
			memoryManager, &pairManager, mDBM, continuation, contextID);

	// PT: find dynamics-vs-statics overlaps
	if(doBipartite)
	{
		const PxU32 nbUpdatedBoxesStatic = mSBM.getNbUpdatedBoxes();
		const PxU32 nbNonUpdatedBoxesStatic = mSBM.getNbNonUpdatedBoxes();
		const PxU32 nbNonUpdatedBoxesDynamic = mDBM.getNbNonUpdatedBoxes();

		// PT: in previous versions we did active-dynamics-vs-all-statics here.
		if(nbUpdatedBoxesDynamic)
		{
			if(nbUpdatedBoxesStatic)
			{
				// PT: active static vs active dynamic
#ifdef ABP_MT2
				if(continuation)
				{
					bipTask0.mCounter = nbUpdatedBoxesDynamic;
					bipTask0.mBoxListX = mDBM.getUpdatedBoxes().getBoxes_X();
					bipTask0.mBoxListYZ = mDBM.getUpdatedBoxes().getBoxes_YZ();
					bipTask0.mRemap = mDBM.getRemap_Updated();
					bipTask0.mType = 1;
					bipTask0.mCounter4 = nbUpdatedBoxesStatic;
					bipTask0.mBoxListX4 = mSBM.getUpdatedBoxes().getBoxes_X();
					bipTask0.mBoxListYZ4 = mSBM.getUpdatedBoxes().getBoxes_YZ();
					bipTask0.mRemap4 = mSBM.getRemap_Updated();
					bipTask0.mPairs.mSharedPM = &pairManager;
					//bipTask0.mPairs.mDelayedPairs.reserve(10000);
					if(bipTask0.isThereWorkToDo())
					{
						bipTask0.mID = 0;
						bipTask0.setContinuation(continuation);
						bipTask0.removeReference();
					}
				}
				else
#endif
					doBipartiteBoxPruning_Leaf(&pairManager, nbUpdatedBoxesDynamic, nbUpdatedBoxesStatic,
												mDBM.getUpdatedBoxes(), mSBM.getUpdatedBoxes(),
												mDBM.getRemap_Updated(), mSBM.getRemap_Updated());
			}

			if(nbNonUpdatedBoxesStatic)
			{
				// PT: sleeping static vs active dynamic
#ifdef ABP_MT2
				if(continuation)
				{
					bipTask1.mCounter = nbUpdatedBoxesDynamic;
					bipTask1.mBoxListX = mDBM.getUpdatedBoxes().getBoxes_X();
					bipTask1.mBoxListYZ = mDBM.getUpdatedBoxes().getBoxes_YZ();
					bipTask1.mRemap = mDBM.getRemap_Updated();
					bipTask1.mType = 1;
					bipTask1.mCounter4 = nbNonUpdatedBoxesStatic;
					bipTask1.mBoxListX4 = mSBM.getSleepingBoxes().getBoxes_X();
					bipTask1.mBoxListYZ4 = mSBM.getSleepingBoxes().getBoxes_YZ();
					bipTask1.mRemap4 = mSBM.getRemap_Sleeping();
					bipTask1.mPairs.mSharedPM = &pairManager;
					//bipTask1.mPairs.mDelayedPairs.reserve(10000);
					if(bipTask1.isThereWorkToDo())
					{
						bipTask1.mID = 0;
						bipTask1.setContinuation(continuation);
						bipTask1.removeReference();
					}
				}
				else
#endif
					doBipartiteBoxPruning_Leaf(&pairManager, nbUpdatedBoxesDynamic, nbNonUpdatedBoxesStatic,
												mDBM.getUpdatedBoxes(), mSBM.getSleepingBoxes(),
												mDBM.getRemap_Updated(), mSBM.getRemap_Sleeping());
			}
		}

		if(nbUpdatedBoxesStatic && nbNonUpdatedBoxesDynamic)
		{
			// PT: active static vs sleeping dynamic
#ifdef ABP_MT2
			if(continuation)
			{
				bipTask2.mCounter = nbNonUpdatedBoxesDynamic;
				bipTask2.mBoxListX = mDBM.getSleepingBoxes().getBoxes_X();
				bipTask2.mBoxListYZ = mDBM.getSleepingBoxes().getBoxes_YZ();
				bipTask2.mRemap = mDBM.getRemap_Sleeping();
				bipTask2.mType = 1;
				bipTask2.mCounter4 = nbUpdatedBoxesStatic;
				bipTask2.mBoxListX4 = mSBM.getUpdatedBoxes().getBoxes_X();
				bipTask2.mBoxListYZ4 = mSBM.getUpdatedBoxes().getBoxes_YZ();
				bipTask2.mRemap4 = mSBM.getRemap_Updated();
				bipTask2.mPairs.mSharedPM = &pairManager;
				//bipTask2.mPairs.mDelayedPairs.reserve(10000);
				if(bipTask2.isThereWorkToDo())
				{
					bipTask2.mID = 0;
					bipTask2.setContinuation(continuation);
					bipTask2.removeReference();
				}
			}
			else
#endif
				doBipartiteBoxPruning_Leaf(&pairManager, nbNonUpdatedBoxesDynamic, nbUpdatedBoxesStatic,
											mDBM.getSleepingBoxes(), mSBM.getUpdatedBoxes(),
											mDBM.getRemap_Sleeping(), mSBM.getRemap_Updated());
		}
	}
}

///////////////////////////////////////////////////////////////////////////

ABP::ABP(PxU64 contextID) :
	mSBM		(FilterType::STATIC),
	mDBM		(FilterType::DYNAMIC),
	mKBM		(FilterType::KINEMATIC),
	mContextID	(contextID)
#ifdef ABP_MT2
	,mTask0		(ABP_TASK_0)
	,mTask1		(ABP_TASK_1)
#endif
{
#ifdef ABP_MT2
	mTask0.setContextId(mContextID);
	mTask1.setContextId(mContextID);
	mCompleteBoxPruningTask0.setContextId(mContextID);
	mCompleteBoxPruningTask1.setContextId(mContextID);
	for(PxU32 k=0; k<9; k++)
	{
		mCompleteBoxPruningTask0.mTasks[k].setContextId(mContextID);
		mCompleteBoxPruningTask1.mTasks[k].setContextId(mContextID);
	}
	// NOTE(review): source text appears truncated by extraction below (loop over mBipTasks, ctor
	// tail, and several following definitions up to the body of ABP::removeObject seem to be
	// missing) — verify against upstream.
	for(PxU32 k=0; kremoveObject(object, userID);
// (continuation of ABP::removeObject: invalidate the object's slot after removal)
	object.invalidateIndex();
#if PX_DEBUG
	object.mUpdated = false;
#endif
}

// Marks an object's bounds as updated for this frame and forwards the update to the
// manager that owns it.
void ABP::updateObject(BpHandle userID)
{
	mShared.mUpdatedObjects.setBitChecked(userID);

	// NOTE(review): source text appears truncated by extraction below (assert bound and the code
	// locating the owning manager seem to be missing) — verify against upstream.
	PX_ASSERT(userIDupdateObject(object, userID);
}

// PT: TODO: replace bits with timestamps?
// Diffs the pair manager's state against the previous frame to produce created/deleted pairs.
// NOTE(review): PxArray template arguments appear lost in extraction — verify against upstream.
void ABP_PairManager::computeCreatedDeletedPairs(PxArray& createdPairs, PxArray& deletedPairs, const BitArray& updated, const BitArray& removed)
{
	// PT: parse all currently active pairs. The goal here is to generate the found/lost pairs, compared to previous frame.
	// PT: TODO: MT?
	PxU32 i=0;
	PxU32 nbActivePairs = mNbActivePairs;
	// NOTE(review): source text appears truncated by extraction below (while-loop condition and the
	// beginning of the 'new pair' branch seem to be missing) — verify against upstream.
	while(imVolA = id0;
		newPair->mVolB = id1;
		// PT: TODO: replace this with bitmaps?
		p.clearNew();
		p.clearUpdated();
		i++;
	}
	else if(p.isUpdated())
	{
		// Persistent pair
		// PT: this pair already existed in the structure, and has been found again this frame. Since
		// MBP reports "all pairs" each frame (as opposed to SAP), this happens quite often, for each
		// active persistent pair.
		p.clearUpdated();
		i++;
	}
	else
	{
		// Lost pair
		// PT: if the pair is not new and not 'updated', it might be a lost (separated) pair. But this
		// is not always the case since we now handle "sleeping" objects directly within MBP. A pair
		// of sleeping objects does not generate an 'addPair' call, so it ends up in this codepath.
		// Nonetheless the sleeping pair should not be deleted. We can only delete pairs involving
		// objects that have been actually moved during the frame. This is the only case in which
		// a pair can indeed become 'lost'.
		const PxU32 id0 = p.getId0();
		const PxU32 id1 = p.getId1();
		PX_ASSERT(id0!=INVALID_ID);
		PX_ASSERT(id1!=INVALID_ID);

		// PT: if none of the involved objects have been updated, the pair is just sleeping: keep it and skip it.
		if(updated.isSetChecked(id0) || updated.isSetChecked(id1))
		{
			// PT: by design (for better or worse) we do not report pairs to the client when
			// one of the involved objects has been deleted. The pair must still be deleted
			// from the MBP structure though.
			if(!removed.isSetChecked(id0) && !removed.isSetChecked(id1))
			{
				// PT: doing the group-based filtering here is useless. The pair should not have
				// been added in the first place.
				//deletedPairs.pushBack(BroadPhasePair(id0, id1));
				BroadPhasePair* lostPair = Cm::reserveContainerMemory(deletedPairs, 1);
				lostPair->mVolA = id0;
				lostPair->mVolB = id1;
			}

			const PxU32 hashValue = hash(id0, id1) & mMask;
			removePair(id0, id1, hashValue, i);
			nbActivePairs--;
		}
		else i++;
	}
	}
	shrinkMemory();
}

// Main overlap pass: runs complete/bipartite pruning between the static, dynamic and
// kinematic managers, honoring the kinematic filtering entries from the LUT.
void ABP::findOverlaps(PxBaseTask* continuation, const Bp::FilterGroup::Enum* PX_RESTRICT groups, const bool* PX_RESTRICT lut)
{
	PX_PROFILE_ZONE("ABP - findOverlaps", mContextID);

	mPairManager.mGroups = groups;
	mPairManager.mLUT = lut;

	if(!gPrepareOverlapsFlag)
		Region_prepareOverlaps();

	bool doKineKine = true;
	bool doStaticKine = true;
	{
		doStaticKine = lut[Bp::FilterType::KINEMATIC*Bp::FilterType::COUNT + Bp::FilterType::STATIC];
		doKineKine = lut[Bp::FilterType::KINEMATIC*Bp::FilterType::COUNT + Bp::FilterType::KINEMATIC];
	}

	// Static-vs-dynamic (bipartite) and dynamic-vs-dynamic (complete)
	findAllOverlaps(
#ifdef ABP_MT2
		mCompleteBoxPruningTask0, mBipTasks[0], mBipTasks[1], mBipTasks[2], mBipTasks[3], mBipTasks[4],
#endif
		mMM, mPairManager, mSBM, mDBM, true, true, continuation, mContextID);

	// Static-vs-kinematics (bipartite) and kinematics-vs-kinematics (complete)
	findAllOverlaps(
#ifdef ABP_MT2
		mCompleteBoxPruningTask1, mBipTasks[5], mBipTasks[6], mBipTasks[7], mBipTasks[8], mBipTasks[9],
#endif
		mMM, mPairManager, mSBM, mKBM, doKineKine, doStaticKine, continuation, mContextID);

	if(1)
	{
		// Kinematics-vs-dynamics (bipartite only)
		findAllOverlaps(
#ifdef ABP_MT2
			mCompleteBoxPruningTask1, mBipTasks[10], mBipTasks[11], mBipTasks[12], mBipTasks[13], mBipTasks[14],
#endif
			mMM, mPairManager, mKBM, mDBM, false, true, continuation, mContextID);
	}
	else
	{
		// Reference single-threaded version of the kinematics-vs-dynamics pass.
		const PxU32 nbUpdatedDynamics = mDBM.getNbUpdatedBoxes();
		const PxU32 nbNonUpdatedDynamics = mDBM.getNbNonUpdatedBoxes();
		const PxU32 nbUpdatedKinematics = mKBM.getNbUpdatedBoxes();
		const PxU32 nbNonUpdatedKinematics = mKBM.getNbNonUpdatedBoxes();

		if(nbUpdatedDynamics)
		{
			// Active dynamics vs active kinematics
			if(nbUpdatedKinematics)
			{
				doBipartiteBoxPruning_Leaf(&mPairManager, nbUpdatedDynamics, nbUpdatedKinematics,
											mDBM.getUpdatedBoxes(), mKBM.getUpdatedBoxes(),
											mDBM.getRemap_Updated(), mKBM.getRemap_Updated());
			}

			// Active dynamics vs inactive kinematics
			if(nbNonUpdatedKinematics)
			{
				doBipartiteBoxPruning_Leaf(&mPairManager, nbUpdatedDynamics, nbNonUpdatedKinematics,
											mDBM.getUpdatedBoxes(), mKBM.getSleepingBoxes(),
											mDBM.getRemap_Updated(), mKBM.getRemap_Sleeping());
			}
		}

		if(nbUpdatedKinematics && nbNonUpdatedDynamics)
		{
			// Inactive dynamics vs active kinematics
			doBipartiteBoxPruning_Leaf(&mPairManager, nbNonUpdatedDynamics, nbUpdatedKinematics,
										mDBM.getSleepingBoxes(), mKBM.getUpdatedBoxes(),
										mDBM.getRemap_Sleeping(), mKBM.getRemap_Updated());
		}
	}
}

// Produces the final created/deleted pair lists for the frame, clears the per-frame
// 'updated' bits and returns the number of currently active pairs.
// NOTE(review): PxArray template arguments appear lost in extraction — verify against upstream.
PxU32 ABP::finalize(PxArray& createdPairs, PxArray& deletedPairs)
{
	PX_PROFILE_ZONE("ABP - finalize", mContextID);
	{
		PX_PROFILE_ZONE("computeCreatedDeletedPairs", mContextID);
		mPairManager.computeCreatedDeletedPairs(createdPairs, deletedPairs, mShared.mUpdatedObjects, mShared.mRemovedObjects);
	}
	mShared.mUpdatedObjects.clearAll();
	return mPairManager.mNbActivePairs;
}

#ifdef ABP_MT2
// Merges all delayed pairs produced by the MT tasks into the shared pair manager.
void ABP::addDelayedPairs()
{
	PX_PROFILE_ZONE("ABP - addDelayedPairs", mContextID);
	mCompleteBoxPruningTask0.addDelayedPairs();
	mCompleteBoxPruningTask1.addDelayedPairs();

	PxU32 nbDelayedPairs = 0;
	// NOTE(review): source text appears truncated by extraction below (bip-task loop, the tail of
	// this function and the signature of ABP::addDelayedPairs2 seem to be missing) — verify against upstream.
	for(PxU32 k=0; k& createdPairs)
	{
		PX_PROFILE_ZONE("ABP - addDelayedPairs", mContextID);
		mCompleteBoxPruningTask0.addDelayedPairs2(createdPairs);
		mCompleteBoxPruningTask1.addDelayedPairs2(createdPairs);

		PxU32 nbDelayedPairs = 0;
		// NOTE(review): truncated again — the bip-task merge loop, the #endif, and the head of the
		// BroadPhaseABP constructor (including mABP allocation) appear to be missing.
		for(PxU32 k=0; kpreallocate(nbObjects, maxNbBroadPhaseOverlaps);
		mCreated.reserve(DEFAULT_CREATED_DELETED_PAIRS_CAPACITY);
		mDeleted.reserve(DEFAULT_CREATED_DELETED_PAIRS_CAPACITY);
	}

	BroadPhaseABP::~BroadPhaseABP()
	{
		PX_DELETE(mABP);
	}

	void BroadPhaseABP::update(PxcScratchAllocator* scratchAllocator,
const BroadPhaseUpdateData& updateData, PxBaseTask* continuation)
// (continuation of the BroadPhaseABP::update signature begun just above this chunk)
// Main per-frame entry point: caches the update data, then either spawns the two
// internal tasks (MT path) or runs remove/add/update + findOverlaps + finalize inline.
{
	PX_PROFILE_ZONE("BroadPhaseABP - update", mContextID);
	PX_CHECK_AND_RETURN(scratchAllocator, "BroadPhaseABP::update - scratchAllocator must be non-NULL \n");
	{
		PX_PROFILE_ZONE("BroadPhaseABP - setup", mContextID);
		mABP->mMM.mScratchAllocator = scratchAllocator;
		mABP->setTransientData(updateData.getAABBs(), updateData.getContactDistance());
		const PxU32 newCapacity = updateData.getCapacity();
		mABP->mShared.checkResize(newCapacity);
#if PX_CHECKED
		// PT: WARNING: this must be done after the allocateMappingArray call
		if(!BroadPhaseUpdateData::isValid(updateData, *this, false, mContextID))
		{
			PX_CHECK_MSG(false, "Illegal BroadPhaseUpdateData \n");
			return;
		}
#endif
		// Cache the per-frame handle arrays/counters for the task code paths below.
		mGroups = updateData.getGroups();
		mFilter = &updateData.getFilter();
		mNbAdded = updateData.getNumCreatedHandles();
		mNbUpdated = updateData.getNumUpdatedHandles();
		mNbRemoved = updateData.getNumRemovedHandles();
		mCreatedHandles = updateData.getCreatedHandles();
		mUpdatedHandles = updateData.getUpdatedHandles();
		mRemovedHandles = updateData.getRemovedHandles();
	}

	// PT: run single-threaded if forced to do so
	if(!mEnableMT)
		continuation = NULL;

#ifdef ABP_MT2
	if(continuation)
	{
		// Chain the internal tasks: task0 -> task1 -> user continuation.
		mABP->mTask1.mBP = this;
		mABP->mTask1.setContinuation(continuation);
		mABP->mTask0.mBP = this;
		mABP->mTask0.setContinuation(&mABP->mTask1);
		mABP->mTask1.removeReference();
		mABP->mTask0.removeReference();
	}
	else
#endif
	{
		// Single-threaded fallback: same three phases the tasks would run.
		{
			PX_PROFILE_ZONE("BroadPhaseABP - setUpdateData", mContextID);
			removeObjects();
			addObjects();
			updateObjects();
			PX_ASSERT(!mCreated.size());
			PX_ASSERT(!mDeleted.size());
			if(gPrepareOverlapsFlag)
				mABP->Region_prepareOverlaps();
		}
		{
			PX_PROFILE_ZONE("BroadPhaseABP - update", mContextID);
			mABP->findOverlaps(continuation, mGroups, mFilter->getLUT());
		}
		{
			PX_PROFILE_ZONE("BroadPhaseABP - postUpdate", mContextID);
			mABP->finalize(mCreated, mDeleted);
		}
	}
}

#ifdef ABP_MT2
// Worker entry point for the two internal tasks created in BroadPhaseABP::update.
void ABP_InternalTask::run()
{
	PX_SIMD_GUARD

	internalABP::ABP* abp = mBP->mABP;
if(mID==ABP_TASK_0)
	{
		// Task 0: apply object changes, clear the per-task delayed-pair buffers,
		// then kick off the (possibly parallel) overlap queries.
		{
			PX_PROFILE_ZONE("ABP_InternalTask - setUpdateData", mContextID);
			mBP->removeObjects();
			mBP->addObjects();
			mBP->updateObjects();
			PX_ASSERT(!mBP->mCreated.size());
			PX_ASSERT(!mBP->mDeleted.size());
			if(gPrepareOverlapsFlag)
				abp->Region_prepareOverlaps();
		}
		{
			PX_PROFILE_ZONE("ABP_InternalTask - update", mContextID);
			for(PxU32 k=0;k<9;k++)
			{
				abp->mCompleteBoxPruningTask0.mTasks[k].mPairs.mDelayedPairs.resetOrClear();
				abp->mCompleteBoxPruningTask1.mTasks[k].mPairs.mDelayedPairs.resetOrClear();
			}
			// NOTE(review): this loop's bound was lost in extraction ('<...;k++) abp->'
			// eaten) — fragment reproduced verbatim; verify against the original file.
			for(PxU32 k=0;kmBipTasks[k].mPairs.mDelayedPairs.resetOrClear();
			abp->findOverlaps(getContinuation(), mBP->mGroups, mBP->mFilter->getLUT());
		}
	}
	else if(mID==ABP_TASK_1)
	{
		// Task 1: runs after task 0 — produce the final pair lists, then append the
		// delayed pairs gathered by the parallel tasks.
		//abp->addDelayedPairs();
		//abp->finalize(mBP->mCreated, mBP->mDeleted);
		abp->finalize(mBP->mCreated, mBP->mDeleted);
		abp->addDelayedPairs2(mBP->mCreated);
	}
}
#endif

// Forwards this frame's removed handles to the internal ABP object.
void BroadPhaseABP::removeObjects()
{
	PX_PROFILE_ZONE("BroadPhaseABP - removeObjects", mContextID);
	PxU32 nbRemoved = mNbRemoved;
	const BpHandle* removed = mRemovedHandles;
	if(!nbRemoved || !removed)
		return;
	while(nbRemoved--)
	{
		const BpHandle index = *removed++;
		// NOTE(review): comparison lost in extraction (likely index+1 < mABP->mShared.mABP_Objects_Capacity) — verify.
		PX_ASSERT(index+1mShared.mABP_Objects_Capacity);	// PT: we allocated one more box on purpose
		mABP->removeObject(index);
	}
}

// Forwards this frame's updated handles to the internal ABP object.
void BroadPhaseABP::updateObjects()
{
	PX_PROFILE_ZONE("BroadPhaseABP - updateObjects", mContextID);
	PxU32 nbUpdated = mNbUpdated;
	const BpHandle* updated = mUpdatedHandles;
	if(!nbUpdated || !updated)
		return;
	while(nbUpdated--)
	{
		const BpHandle index = *updated++;
		// NOTE(review): comparison lost in extraction (same pattern as removeObjects) — verify.
		PX_ASSERT(index+1mShared.mABP_Objects_Capacity);	// PT: we allocated one more box on purpose
		mABP->updateObject(index);
	}
}

// Forwards this frame's created handles to the internal ABP object, batching them
// by filter type (static / kinematic / dynamic+aggregate) in groups of ABP_BATCHING.
void BroadPhaseABP::addObjects()
{
	PX_PROFILE_ZONE("BroadPhaseABP - addObjects", mContextID);
	PxU32 nbAdded = mNbAdded;
	const BpHandle* created = mCreatedHandles;
	if(!nbAdded || !created)
		return;
	const Bp::FilterGroup::Enum* PX_RESTRICT groups = mGroups;
	// Small fixed-size accumulator: flushes to the matching addXXXObjects() call
	// whenever ABP_BATCHING indices have been gathered.
	struct Batch
	{
		PX_FORCE_INLINE Batch() : mNb(0), mMaxIndex(0) {}
		PxU32 mNb;
		PxU32 mMaxIndex;
		BpHandle
mIndices[ABP_BATCHING];

		// Adds one index to the batch; flushes the full batch to the ABP when it
		// reaches ABP_BATCHING entries, tracking the max index seen for preallocation.
		PX_FORCE_INLINE void add(const BpHandle index, internalABP::ABP* PX_RESTRICT abp, FilterType::Enum type)
		{
			PxU32 nb = mNb;
			mMaxIndex = PxMax(mMaxIndex, index);
			mIndices[nb++] = index;
			if(nb==ABP_BATCHING)
			{
				mNb = 0;
				// PT: TODO: we could use a function ptr here
				if(type==FilterType::STATIC)
					abp->addStaticObjects(mIndices, ABP_BATCHING, mMaxIndex);
				else if(type==FilterType::KINEMATIC)
					abp->addKinematicObjects(mIndices, ABP_BATCHING, mMaxIndex);
				else
				{
					PX_ASSERT(type==FilterType::DYNAMIC || type==FilterType::AGGREGATE);
					abp->addDynamicObjects(mIndices, ABP_BATCHING, mMaxIndex);
				}
				mMaxIndex = 0;
			}
			else
				mNb = nb;
		}
	};

	// One batch per destination; aggregates share the dynamics batch.
	Batch statics;
	Batch dynamics;
	Batch kinematics;
	Batch* batches[FilterType::COUNT] = {NULL};
	batches[FilterType::STATIC] = &statics;
	batches[FilterType::DYNAMIC] = &dynamics;
	batches[FilterType::AGGREGATE] = &dynamics;
	batches[FilterType::KINEMATIC] = &kinematics;

	while(nbAdded--)
	{
		const BpHandle index = *created++;
		// NOTE(review): comparison lost in extraction (likely index+1 < capacity) — verify.
		PX_ASSERT(index+1mShared.mABP_Objects_Capacity);	// PT: we allocated one more box on purpose
		FilterType::Enum type = FilterType::Enum(groups[index] & BP_FILTERING_TYPE_MASK);
		// Unmapped filter types fall back to the dynamic batch.
		if(!batches[type])
			type = FilterType::DYNAMIC;
		batches[type]->add(index, mABP, type);
	}

	// Flush the partially-filled batches.
	if(statics.mNb)
		mABP->addStaticObjects(statics.mIndices, statics.mNb, statics.mMaxIndex);
	if(kinematics.mNb)
		mABP->addKinematicObjects(kinematics.mIndices, kinematics.mNb, kinematics.mMaxIndex);
	if(dynamics.mNb)
		mABP->addDynamicObjects(dynamics.mIndices, dynamics.mNb, dynamics.mMaxIndex);
}

// Returns the pairs created during the last update; count is written to nbCreatedPairs.
const BroadPhasePair* BroadPhaseABP::getCreatedPairs(PxU32& nbCreatedPairs) const
{
	nbCreatedPairs = mCreated.size();
	return mCreated.begin();
}

// Returns the pairs deleted during the last update; count is written to nbDeletedPairs.
const BroadPhasePair* BroadPhaseABP::getDeletedPairs(PxU32& nbDeletedPairs) const
{
	nbDeletedPairs = mDeleted.size();
	return mDeleted.begin();
}

// Shrinks an over-grown pair buffer back to the default capacity, otherwise just clears it.
// NOTE(review): PxArray template argument lost in extraction (presumably
// PxArray<BroadPhasePair>) — verify against the original file.
static void freeBuffer(PxArray& buffer)
{
	const PxU32 size = buffer.size();
	if(size>DEFAULT_CREATED_DELETED_PAIRS_CAPACITY)
	{
		// reset() releases the memory; re-reserve the default capacity below.
		buffer.reset();
buffer.reserve(DEFAULT_CREATED_DELETED_PAIRS_CAPACITY);
	}
	else
	{
		// clear() keeps the existing capacity for reuse next frame.
		buffer.clear();
	}
}

// Releases transient per-frame memory held by the internal ABP and the pair buffers.
void BroadPhaseABP::freeBuffers()
{
	PX_PROFILE_ZONE("BroadPhaseABP - freeBuffers", mContextID);
	mABP->freeBuffers();
	freeBuffer(mCreated);
	freeBuffer(mDeleted);
}

#if PX_CHECKED
// Debug-only validation of incoming update data against the current object state.
bool BroadPhaseABP::isValid(const BroadPhaseUpdateData& updateData) const
{
	const PxU32 nbObjects = mABP->mShared.mABP_Objects_Capacity;
	PX_UNUSED(nbObjects);
	const ABP_Object* PX_RESTRICT objects = mABP->mShared.mABP_Objects;
	const BpHandle* created = updateData.getCreatedHandles();
	if(created)
	{
		PxU32 nbToGo = updateData.getNumCreatedHandles();
		while(nbToGo--)
		{
			const BpHandle index = *created++;
			// NOTE(review): large span lost in extraction here — the rest of isValid's
			// body, the #endif, and the shiftOrigin wrapper's signature are missing
			// (everything between "PX_ASSERT(index" and "shiftOrigin(shift, ...)").
			// The fragment below is reproduced verbatim; reconstruct from the original file.
			PX_ASSERT(indexshiftOrigin(shift, boundsArray, contactDistances); }