Files
XCEngine/engine/third_party/physx/source/geomutils/src/GuBucketPruner.cpp

2729 lines
86 KiB
C++
Raw Normal View History

// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "foundation/PxMemory.h"
#include "foundation/PxBitUtils.h"
#include "GuBucketPruner.h"
#include "GuInternal.h"
#include "CmVisualization.h"
#include "CmRadixSort.h"
using namespace physx;
using namespace aos;
using namespace Gu;
#define INVALID_HANDLE 0xffffffff
/*
TODO:
- if Core is always available, mSortedObjects could be replaced with just indices to mCoreObjects => less memory.
- UTS:
- test that queries against empty boxes all return false
- invalidate after 16 removes
- check shiftOrigin stuff (esp what happens to emptied boxes)
- isn't there a very hard-to-find bug waiting to happen in there,
when the shift touches the empty box and overrides mdata0/mdata1 with "wrong" values that break the sort?
- revisit updateObject/removeObject
- optimize/cache computation of free global bounds before clipRay
- remove temp memory buffers (sorted arrays)
- take care of code duplication
- better code to generate SIMD 0x7fffffff
- refactor SIMD tests
- optimize:
- better split values
- optimize update (bitmap, less data copy, etc)
- use ray limits in traversal code too?
- the SIMD XBOX code operates on Min/Max rather than C/E. Change format?
- or just try the alternative ray-box code (as on PC) ==> pretty much exactly the same speed
*/
//#define VERIFY_SORT
//#define BRUTE_FORCE_LIMIT 32
#define LOCAL_SIZE 256 // Size of various local arrays. Dynamic allocations occur if exceeded.
#define USE_SIMD // Use SIMD code or not (sanity performance check)
#define NODE_SORT // Enable/disable node sorting
#define NODE_SORT_MIN_COUNT 16 // Limit above which node sorting is performed
#if PX_INTEL_FAMILY
#if COMPILE_VECTOR_INTRINSICS
#define CAN_USE_MOVEMASK
#endif
#endif
#define ALIGN16(size) ((unsigned(size)+15) & unsigned(~15))
#ifdef _DEBUG
#define AlignedLoad V4LoadU
#define AlignedStore V4StoreU
#else
#define AlignedLoad V4LoadA
#define AlignedStore V4StoreA
#endif
// SAT-based ray-box overlap test has accuracy issues for long rays, so we clip them against the global AABB to limit these issues.
static void clipRay(const PxVec3& rayOrig, const PxVec3& rayDir, float& maxDist, const PxVec3& boxMin, const PxVec3& boxMax)
{
const PxVec3 boxCenter = (boxMax + boxMin)*0.5f;
const PxVec3 boxExtents = (boxMax - boxMin)*0.5f;
const float dpc = boxCenter.dot(rayDir);
const float extentsMagnitude = boxExtents.magnitude();
const float dpMin = dpc - extentsMagnitude;
const float dpMax = dpc + extentsMagnitude;
const float dpO = rayOrig.dot(rayDir);
const float boxLength = extentsMagnitude * 2.0f;
const float distToBox = PxMin(PxAbs(dpMin - dpO), PxAbs(dpMax - dpO));
maxDist = distToBox + boxLength * 2.0f;
}
// Initializes all 5 bucket bounds to empty (inside-out) boxes.
BucketPrunerNode::BucketPrunerNode()
{
	PxU32 i = 0;
	while(i<5)
		mBucketBox[i++].setEmpty();
}
// Lookup table shared by the scalar and SSE box classifiers (see classifyBox /
// classifyBox_x86). The low 4 bits of the index pack the four half-space tests
// (right | left<<1 | lower<<2 | upper<<3), bit 4 flags the "cross bucket" case.
// The stored value is the destination bucket index in [0;4], where 4 means the
// box crosses the splitting planes (or stays in the cross bucket).
static const PxU8 gCodes[] = { 4, 4, 4, 4, 4, 3, 2, 2,
4, 1, 0, 0, 4, 1, 0, 0,
4, 1, 0, 0, 2, 1, 0, 0,
3, 1, 0, 0, 2, 1, 0, 0};
#ifdef CAN_USE_MOVEMASK
/*static PX_FORCE_INLINE PxU32 classifyBox_x86(const BucketBox& box, const PxVec4& limits, const bool useY, const bool isCrossBucket)
{
const Vec4V extents = AlignedLoad(&box.mExtents.x);
const Vec4V center = AlignedLoad(&box.mCenter.x);
const Vec4V plus = V4Add(extents, center);
const Vec4V minus = V4Sub(extents, center);
Vec4V tmp;
if(useY) // PT: this is a constant so branch prediction works here
tmp = _mm_shuffle_ps(plus, minus, _MM_SHUFFLE(0,1,0,1));
else
tmp = _mm_shuffle_ps(plus, minus, _MM_SHUFFLE(0,2,0,2));
const Vec4V comp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,2,1,3)); // oh well, nm
const PxU32 Code = (PxU32)_mm_movemask_ps(V4IsGrtr(V4LoadA(&limits.x), comp));
return gCodes[Code | PxU32(isCrossBucket)<<4];
}*/
// SSE version of the box classifier: performs the four half-space tests in one
// compare + movemask. 'limits' holds (-limitX, limitX, -limitYZ, limitYZ) — see
// the call site in classifyBoxes(). The negated box min (minus) lets the single
// V4IsGrtr produce both the "> limit" and "< limit" tests at once. The resulting
// 4-bit mask matches the scalar classifyBox() code (checked by the PX_DEBUG
// assert at the call site) and indexes the same gCodes table.
static PX_FORCE_INLINE PxU32 classifyBox_x86(const Vec4V boxMin, const Vec4V boxMax, const PxVec4& limits, const bool useY, const bool isCrossBucket)
{
const Vec4V plus = boxMax;
const Vec4V minus = V4Neg(boxMin);
Vec4V tmp;
if(useY) // PT: this is a constant so branch prediction works here
tmp = _mm_shuffle_ps(plus, minus, _MM_SHUFFLE(0,1,0,1));
else
tmp = _mm_shuffle_ps(plus, minus, _MM_SHUFFLE(0,2,0,2));
const Vec4V comp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,2,1,3)); // oh well, nm
const PxU32 Code = PxU32(_mm_movemask_ps(V4IsGrtr(V4LoadA(&limits.x), comp)));
return gCodes[Code | PxU32(isCrossBucket)<<4];
}
#endif
#ifdef CAN_USE_MOVEMASK
#if PX_DEBUG
#define USE_CLASSIFY_BOX
#endif
#else
#define USE_CLASSIFY_BOX
#endif
#ifdef USE_CLASSIFY_BOX
// Scalar reference classifier: returns the child bucket index (0-4) for a box,
// by testing its min/max against the X and Y/Z splitting planes and looking up
// the result in the shared gCodes table.
static PX_FORCE_INLINE PxU32 classifyBox(const BucketBox& box, const float limitX, const float limitYZ, const PxU32 yz, const bool isCrossBucket)
{
	const float minX = box.mCenter.x - box.mExtents.x;
	const float maxX = box.mCenter.x + box.mExtents.x;
	const float minYZ = box.mCenter[yz] - box.mExtents[yz];
	const float maxYZ = box.mCenter[yz] + box.mExtents[yz];
	// Table-based box classification avoids many branches
	PxU32 code = 0;
	if(minX>limitX)		// fully right of the X split
		code |= 1;
	if(maxX<limitX)		// fully left of the X split
		code |= 2;
	if(minYZ>limitYZ)	// fully below the Y/Z split
		code |= 4;
	if(maxYZ<limitYZ)	// fully above the Y/Z split
		code |= 8;
	return gCodes[isCrossBucket ? code + 16 : code];
}
#endif
// Classifies 'nb' boxes (already sorted along 'sortAxis') into this node's 5
// buckets, against the two splitting planes at limitX / limitYZ. Outputs:
// - mCounters / mOffsets: per-bucket object count & start offset
// - mBucketBox: merged bounds of each bucket
// - sortedBoxes / sortedObjects / sortedTransforms: input data grouped by
//   bucket, preserving the input (sorted) order within each bucket.
void BucketPrunerNode::classifyBoxes( float limitX, float limitYZ,
PxU32 nb, BucketBox* PX_RESTRICT boxes, const PrunerPayload* PX_RESTRICT objects,
const PxTransform* PX_RESTRICT transforms,
BucketBox* PX_RESTRICT sortedBoxes, PrunerPayload* PX_RESTRICT sortedObjects,
PxTransform* PX_RESTRICT sortedTransforms,
bool isCrossBucket, PxU32 sortAxis)
{
// Second classification axis: Z when sorting along Y, Y otherwise
const PxU32 yz = PxU32(sortAxis == 1 ? 2 : 1);
#if PX_DEBUG
// Verify the precondition: input boxes must arrive sorted along the sort axis
{
float prev = boxes[0].mDebugMin;
for(PxU32 i=1;i<nb;i++)
{
const float current = boxes[i].mDebugMin;
PX_ASSERT(current>=prev);
prev = current;
}
}
#endif
// Local (stack-based) min/max bucket bounds
PX_ALIGN(16, PxVec4) bucketBoxMin[5];
PX_ALIGN(16, PxVec4) bucketBoxMax[5];
{
const PxBounds3 empty = PxBounds3::empty();
for(PxU32 i=0;i<5;i++)
{
mCounters[i] = 0;
bucketBoxMin[i] = PxVec4(empty.minimum, 0.0f);
bucketBoxMax[i] = PxVec4(empty.maximum, 0.0f);
}
}
{
#ifdef CAN_USE_MOVEMASK
// DS: order doesn't play nice with x86 shuffles :-|
// Layout expected by classifyBox_x86: (-limitX, limitX, -limitYZ, limitYZ)
PX_ALIGN(16, PxVec4) limits(-limitX, limitX, -limitYZ, limitYZ);
const bool useY = yz==1;
#endif
// Determine in which bucket each object falls, update bucket bounds
for(PxU32 i=0;i<nb;i++)
{
const Vec4V boxCenterV = AlignedLoad(&boxes[i].mCenter.x);
const Vec4V boxExtentsV = AlignedLoad(&boxes[i].mExtents.x);
const Vec4V boxMinV = V4Sub(boxCenterV, boxExtentsV);
const Vec4V boxMaxV = V4Add(boxCenterV, boxExtentsV);
#ifdef CAN_USE_MOVEMASK
// const PxU32 index = classifyBox_x86(boxes[i], limits, useY, isCrossBucket);
const PxU32 index = classifyBox_x86(boxMinV, boxMaxV, limits, useY, isCrossBucket);
#if PX_DEBUG
// In debug builds, check that the SIMD classifier agrees with the scalar one
const PxU32 index_ = classifyBox(boxes[i], limitX, limitYZ, yz, isCrossBucket);
PX_ASSERT(index == index_);
#endif
#else
const PxU32 index = classifyBox(boxes[i], limitX, limitYZ, yz, isCrossBucket);
#endif
// Merge boxes
{
const Vec4V mergedMinV = V4Min(V4LoadA(&bucketBoxMin[index].x), boxMinV);
const Vec4V mergedMaxV = V4Max(V4LoadA(&bucketBoxMax[index].x), boxMaxV);
V4StoreA(mergedMinV, &bucketBoxMin[index].x);
V4StoreA(mergedMaxV, &bucketBoxMax[index].x);
}
boxes[i].mData0 = index; // Store bucket index for current box in this temporary location
mCounters[index]++;
}
}
{
// Regenerate offsets
mOffsets[0]=0;
for(PxU32 i=0;i<4;i++)
mOffsets[i+1] = mOffsets[i] + mCounters[i];
}
{
// Group boxes with same bucket index together
// (stable pass: relative order within each bucket is preserved, keeping
// each bucket's range sorted along the sort axis)
for(PxU32 i=0;i<nb;i++)
{
const PxU32 bucketOffset = mOffsets[boxes[i].mData0]++; // Bucket index for current box was stored in mData0 by previous loop
// The 2 following lines are the same as:
// sortedBoxes[bucketOffset] = boxes[i];
AlignedStore(AlignedLoad(&boxes[i].mCenter.x), &sortedBoxes[bucketOffset].mCenter.x);
AlignedStore(AlignedLoad(&boxes[i].mExtents.x), &sortedBoxes[bucketOffset].mExtents.x);
#if PX_DEBUG
sortedBoxes[bucketOffset].mDebugMin = boxes[i].mDebugMin;
#endif
sortedObjects[bucketOffset] = objects[i];
sortedTransforms[bucketOffset] = transforms[i];
}
}
{
// Regenerate offsets
// (the previous loop consumed mOffsets as running cursors, so rebuild them)
mOffsets[0]=0;
for(PxU32 i=0;i<4;i++)
mOffsets[i+1] = mOffsets[i] + mCounters[i];
}
{
// Convert local (stack-based) min/max bucket bounds to persistent center/extents format
const float Half = 0.5f;
const FloatV HalfV = FLoad(Half);
PX_ALIGN(16, PxVec4) bucketCenter;
PX_ALIGN(16, PxVec4) bucketExtents;
for(PxU32 i=0;i<5;i++)
{
// The following lines are the same as:
// mBucketBox[i].mCenter = bucketBox[i].getCenter();
// mBucketBox[i].mExtents = bucketBox[i].getExtents();
const Vec4V bucketBoxMinV = V4LoadA(&bucketBoxMin[i].x);
const Vec4V bucketBoxMaxV = V4LoadA(&bucketBoxMax[i].x);
const Vec4V bucketBoxCenterV = V4Scale(V4Add(bucketBoxMaxV, bucketBoxMinV), HalfV);
const Vec4V bucketBoxExtentsV = V4Scale(V4Sub(bucketBoxMaxV, bucketBoxMinV), HalfV);
V4StoreA(bucketBoxCenterV, &bucketCenter.x);
V4StoreA(bucketBoxExtentsV, &bucketExtents.x);
mBucketBox[i].mCenter = PxVec3(bucketCenter.x, bucketCenter.y, bucketCenter.z);
mBucketBox[i].mExtents = PxVec3(bucketExtents.x, bucketExtents.y, bucketExtents.z);
}
}
#if PX_DEBUG
// Post-condition: each bucket's range of sorted boxes is still sorted
for(PxU32 j=0;j<5;j++)
{
const PxU32 count = mCounters[j];
if(count)
{
const BucketBox* base = sortedBoxes + mOffsets[j];
float prev = base[0].mDebugMin;
for(PxU32 i=1;i<count;i++)
{
const float current = base[i].mDebugMin;
PX_ASSERT(current>=prev);
prev = current;
}
}
}
#endif
}
///////////////////////////////////////////////////////////////////////////////
// For each of the 5 buckets of 'bucket', classifies its objects into the
// corresponding child node's buckets. The "sortedXXXInBucket" buffers are used
// as scratch storage, and the classified data is then copied back in place.
static void processChildBuckets(PxU32 nbAllocated,
BucketBox* sortedBoxesInBucket, PrunerPayload* sortedObjectsInBucket,
PxTransform* sortedTransformsInBucket,
const BucketPrunerNode& bucket, BucketPrunerNode* PX_RESTRICT childBucket,
BucketBox* PX_RESTRICT baseBucketsBoxes, PrunerPayload* PX_RESTRICT baseBucketsObjects,
PxTransform* baseBucketTransforms,
PxU32 sortAxis)
{
	PX_UNUSED(nbAllocated);
	const PxU32 yz = PxU32(sortAxis == 1 ? 2 : 1);	// Second split axis: Z if sorting along Y, else Y
	for(PxU32 childIndex=0;childIndex<5;childIndex++)
	{
		const PxU32 count = bucket.mCounters[childIndex];
		if(!count)
		{
			// Empty bucket => just reset the child node's counters
			childBucket[childIndex].initCounters();
			continue;
		}
		PX_ASSERT(count<=nbAllocated);
		const PxU32 start = bucket.mOffsets[childIndex];
		BucketBox* childBoxes = baseBucketsBoxes + start;
		PrunerPayload* childObjects = baseBucketsObjects + start;
		PxTransform* childTransforms = baseBucketTransforms + start;
		// Split positions come from the center of the parent bucket's bounds
		const float limitX = bucket.mBucketBox[childIndex].mCenter.x;
		const float limitYZ = bucket.mBucketBox[childIndex].mCenter[yz];
		const bool isCrossBucket = childIndex==4;	// Bucket 4 holds boxes crossing the split planes
		childBucket[childIndex].classifyBoxes(limitX, limitYZ, count, childBoxes, childObjects,
			childTransforms,
			sortedBoxesInBucket, sortedObjectsInBucket,
			sortedTransformsInBucket,
			isCrossBucket, sortAxis);
		// Copy the classified data back over the source range
		PxMemCopy(childBoxes, sortedBoxesInBucket, sizeof(BucketBox)*count);
		PxMemCopy(childObjects, sortedObjectsInBucket, sizeof(PrunerPayload)*count);
		PxMemCopy(childTransforms, sortedTransformsInBucket, sizeof(PxTransform)*count);
	}
}
///////////////////////////////////////////////////////////////////////////////
// Maps the IEEE-754 bit pattern of a float to an unsigned integer whose ordering
// matches the float ordering, so encoded values can be compared as plain PxU32.
static PX_FORCE_INLINE PxU32 encodeFloat(PxU32 newPos)
{
	//we may need to check on -0 and 0
	//But it should make no practical difference.
	// Negative floats: complement, so larger magnitudes map to smaller keys.
	// Positive floats: set the sign bit, moving them above all negative keys.
	return (newPos & PX_SIGN_BITMASK) ? ~newPos : (newPos | PX_SIGN_BITMASK);
}
// Computes the [rayMin;rayMax] interval covered by the ray along the sort axis.
static PX_FORCE_INLINE void computeRayLimits(float& rayMin, float& rayMax, const PxVec3& rayOrig, const PxVec3& rayDir, float maxDist, PxU32 sortAxis)
{
	const float start = rayOrig[sortAxis];
	const float end = start + rayDir[sortAxis] * maxDist;
	rayMin = PxMin(start, end);
	rayMax = PxMax(start, end);
}
// Computes the [rayMin;rayMax] interval covered by the ray along the sort axis,
// inflated by the given extents (for swept/fat-ray queries).
static PX_FORCE_INLINE void computeRayLimits(float& rayMin, float& rayMax, const PxVec3& rayOrig, const PxVec3& rayDir, float maxDist, const PxVec3& inflate, PxU32 sortAxis)
{
	const float start = rayOrig[sortAxis];
	const float end = start + rayDir[sortAxis] * maxDist;
	const float pad = inflate[sortAxis];
	rayMin = PxMin(start, end) - pad;
	rayMax = PxMax(start, end) + pad;
}
// Stores integer-encoded versions of the box's min & max values along 'axis' in
// mData0/mData1 (see encodeFloat). This captures the sort order in integer form,
// so the box bounds can later be emptied without breaking sort-based early-outs.
static PX_FORCE_INLINE void encodeBoxMinMax(BucketBox& box, const PxU32 axis)
{
const float min = box.mCenter[axis] - box.mExtents[axis];
const float max = box.mCenter[axis] + box.mExtents[axis];
// Type-punning via reinterpret_cast to access the raw IEEE-754 bit patterns
const PxU32* binaryMin = reinterpret_cast<const PxU32*>(&min);
const PxU32* binaryMax = reinterpret_cast<const PxU32*>(&max);
box.mData0 = encodeFloat(binaryMin[0]);
box.mData1 = encodeFloat(binaryMax[0]);
}
///////////////////////////////////////////////////////////////////////////////
// Constructs an empty pruner. When externalMemory is true, the core arrays are
// expected to be provided via setExternalMemory() and are not owned (and thus
// never freed) by this object.
BucketPrunerCore::BucketPrunerCore(bool externalMemory) :
mCoreNbObjects (0),
mCoreCapacity (0),
mCoreBoxes (NULL),
mCoreObjects (NULL),
mCoreTransforms (NULL),
mCoreRemap (NULL),
mSortedWorldBoxes (NULL),
mSortedObjects (NULL),
mSortedTransforms (NULL),
#ifdef FREE_PRUNER_SIZE
mNbFree (0),
#endif
mSortedNb (0),
mSortedCapacity (0),
mSortAxis (0),
mDirty (false),
mOwnMemory (!externalMemory)
{
// Start with empty global bounds and zeroed counters on all three node levels
mGlobalBox.setEmpty();
mLevel1.initCounters();
for(PxU32 i=0;i<5;i++)
mLevel2[i].initCounters();
for(PxU32 j=0;j<5;j++)
for(PxU32 i=0;i<5;i++)
mLevel3[j][i].initCounters();
}
BucketPrunerCore::~BucketPrunerCore()
{
// All buffers and the map are released in release()
release();
}
// Releases all memory and resets the pruner to an empty, dirty state.
// Core arrays are only freed when owned (mOwnMemory); the sorted arrays and
// the payload map are always released/cleared.
void BucketPrunerCore::release()
{
mDirty = true;
mCoreNbObjects = 0;
mCoreCapacity = 0;
if(mOwnMemory)
{
PX_FREE(mCoreBoxes);
PX_FREE(mCoreObjects);
PX_FREE(mCoreTransforms);
PX_FREE(mCoreRemap);
}
PX_FREE(mSortedWorldBoxes);
PX_FREE(mSortedObjects);
PX_FREE(mSortedTransforms);
mSortedNb = 0;
mSortedCapacity = 0;
#ifdef FREE_PRUNER_SIZE
mNbFree = 0;
#endif
#ifdef USE_REGULAR_HASH_MAP
mMap.clear();
#else
mMap.purge();
#endif
}
// Binds externally-owned core arrays to this pruner. Only legal when the pruner
// was constructed with externalMemory=true (see PX_ASSERT below). mCoreRemap is
// set to NULL in this mode — no remap buffer is provided by the caller.
void BucketPrunerCore::setExternalMemory(PxU32 nbObjects, PxBounds3* boxes, PrunerPayload* objects, PxTransform* transforms)
{
PX_ASSERT(!mOwnMemory);
mCoreNbObjects = nbObjects;
mCoreBoxes = boxes;
mCoreObjects = objects;
mCoreTransforms = transforms;
mCoreRemap = NULL;
}
// Makes sure the sorted arrays can hold 'nb' entries. Memory is reallocated only
// when nb exceeds the current capacity or drops below half of it (hysteresis to
// avoid churn on small size oscillations); capacity grows in powers of two.
// Note: previous contents are NOT preserved — callers are expected to refill.
void BucketPrunerCore::allocateSortedMemory(PxU32 nb)
{
mSortedNb = nb;
if(nb<=mSortedCapacity && (nb>=mSortedCapacity/2))
return;
const PxU32 capacity = PxNextPowerOfTwo(nb);
mSortedCapacity = capacity;
PxU32 bytesNeededForBoxes = capacity*sizeof(BucketBox);
bytesNeededForBoxes = ALIGN16(bytesNeededForBoxes);
PxU32 bytesNeededForObjects = capacity*sizeof(PrunerPayload);
bytesNeededForObjects = ALIGN16(bytesNeededForObjects);
// PT: TODO: I don't remember what this alignment is for, maybe we don't need it
PxU32 bytesNeededForTransforms = capacity*sizeof(PxTransform);
bytesNeededForTransforms = ALIGN16(bytesNeededForTransforms);
PX_FREE(mSortedObjects);
PX_FREE(mSortedWorldBoxes);
PX_FREE(mSortedTransforms);
mSortedWorldBoxes = reinterpret_cast<BucketBox*>(PX_ALLOC(bytesNeededForBoxes, "BucketPruner"));
mSortedObjects = reinterpret_cast<PrunerPayload*>(PX_ALLOC(bytesNeededForObjects, "BucketPruner"));
mSortedTransforms = reinterpret_cast<PxTransform*>(PX_ALLOC(bytesNeededForTransforms, "BucketPruner"));
// The SIMD loads/stores on these arrays require 16-byte alignment
PX_ASSERT(!(size_t(mSortedWorldBoxes)&15));
PX_ASSERT(!(size_t(mSortedObjects)&15));
PX_ASSERT(!(size_t(mSortedTransforms)&15));
}
///////////////////////////////////////////////////////////////////////////////
void BucketPrunerCore::resizeCore()
{
const PxU32 capacity = mCoreCapacity ? mCoreCapacity*2 : 32;
mCoreCapacity = capacity;
const PxU32 bytesNeededForBoxes = capacity*sizeof(PxBounds3);
const PxU32 bytesNeededForObjects = capacity*sizeof(PrunerPayload);
const PxU32 bytesNeededForTransforms = capacity*sizeof(PxTransform);
const PxU32 bytesNeededForRemap = capacity*sizeof(PxU32);
PxBounds3* newCoreBoxes = reinterpret_cast<PxBounds3*>(PX_ALLOC(bytesNeededForBoxes, "BucketPruner"));
PrunerPayload* newCoreObjects = reinterpret_cast<PrunerPayload*>(PX_ALLOC(bytesNeededForObjects, "BucketPruner"));
PxTransform* newCoreTransforms = reinterpret_cast<PxTransform*>(PX_ALLOC(bytesNeededForTransforms, "BucketPruner"));
PxU32* newCoreRemap = reinterpret_cast<PxU32*>(PX_ALLOC(bytesNeededForRemap, "BucketPruner"));
if(mCoreBoxes)
{
PxMemCopy(newCoreBoxes, mCoreBoxes, mCoreNbObjects*sizeof(PxBounds3));
PX_FREE(mCoreBoxes);
}
if(mCoreObjects)
{
PxMemCopy(newCoreObjects, mCoreObjects, mCoreNbObjects*sizeof(PrunerPayload));
PX_FREE(mCoreObjects);
}
if(mCoreTransforms)
{
PxMemCopy(newCoreTransforms, mCoreTransforms, mCoreNbObjects*sizeof(PxTransform));
PX_FREE(mCoreTransforms);
}
if(mCoreRemap)
{
PxMemCopy(newCoreRemap, mCoreRemap, mCoreNbObjects*sizeof(PxU32));
PX_FREE(mCoreRemap);
}
mCoreBoxes = newCoreBoxes;
mCoreObjects = newCoreObjects;
mCoreTransforms = newCoreTransforms;
mCoreRemap = newCoreRemap;
}
// Appends one object to the core arrays (growing them when full) and registers
// it in the payload->index map. mCoreRemap is set to 0xffffffff — presumably
// meaning "no slot in the sorted arrays yet" until the next rebuild.
PX_FORCE_INLINE void BucketPrunerCore::addObjectInternal(const PrunerPayload& object, const PxBounds3& worldAABB, const PxTransform& transform, PxU32 timeStamp)
{
if(mCoreNbObjects==mCoreCapacity)
resizeCore();
const PxU32 index = mCoreNbObjects++;
mCoreObjects[index] = object;
mCoreBoxes[index] = worldAABB; // PT: TODO: check assembly here
mCoreTransforms[index] = transform; // PT: TODO: check assembly here
mCoreRemap[index] = 0xffffffff;
// Objects are only inserted into the map once they're part of the main/core arrays.
#ifdef USE_REGULAR_HASH_MAP
bool ok = mMap.insert(object, BucketPrunerPair(index, timeStamp));
#else
BucketPrunerPair* ok = mMap.addPair(object, index, timeStamp);
#endif
PX_UNUSED(ok);
PX_ASSERT(ok);
}
// Adds an object to the pruner. Always returns true. While the structure is
// valid (!mDirty), new objects go to a small "free" array (queried brute-force)
// so each addition does not invalidate the built structure; once that array is
// full, its content is transferred to the core arrays and the structure is
// marked dirty, to be rebuilt later.
bool BucketPrunerCore::addObject(const PrunerPayload& object, const PxBounds3& worldAABB, const PxTransform& transform, PxU32 timeStamp)
{
/*
We should probably use a bigger payload/userData struct here, which would also contains the external handle.
(EDIT: we can't even do that, because of the setExternalMemory function)
When asked to update/remove an object it would be O(n) to find the proper object in the mSortedObjects array.
-
For removing it we can simply empty the corresponding box, and the object will never be returned from queries.
Maybe this isn't even true, since boxes are sorted along one axis. So marking a box as empty could break the code relying on a sorted order.
An alternative is to mark the external handle as invalid, and ignore the object when a hit is found.
(EDIT: the sorting is now tested via data0/data1 anyway so we could mark the box as empty without breaking this)
-
For updating an object we would need to keep the (sub) array sorted (not the whole thing, only the array within a bucket).
We don't know the range (what part of the array maps to our bucket) but we may have the bucket ID somewhere? If we'd have this
we could parse the array left/right and resort just the right boxes. If we don't have this we may be able to "quickly" find the
range by traversing the tree, looking for the proper bucket. In any case I don't think there's a mapping to update within a bucket,
unlike in SAP or MBP. So we should be able to shuffle a bucket without having to update anything. For example there's no mapping
between the Core array and the Sorted array. It's a shame in a way because we'd need one, but it's not there - and in fact I think
we can free the Core array once Sorted is created, we don't need it at all.
If we don't want to re-sort the full bucket we can just mark it as dirty and ignore the sort-based early exits in the queries. Then we
can incrementally resort it over N frames or something.
This only works if the updated object remains in the same bucket though. If it moves to another bucket it becomes tempting to just remove
the object and re-insert it.
-
Now for adding an object, we can first have a "free pruner" and do the 16 next entries brute-force. Rebuilding every 16 objects might
give a good speedup already. Otherwise we need to do something more complicated.
*/
PX_ASSERT(mOwnMemory);
// Invariant: a valid structure may have pending free objects, a dirty one never does
PX_ASSERT(!mDirty || !mNbFree);
if(!mDirty)
{
#ifdef FREE_PRUNER_SIZE
// In this path the structure is marked as valid. We do not want to invalidate it for each new object...
if(mNbFree<FREE_PRUNER_SIZE)
{
// ...so as long as there is space in the "free array", we store the newly added object there and
// return immediately. Subsequent queries will parse the free array as if it was a free pruner.
const PxU32 index = mNbFree++;
mFreeObjects[index] = object;
mFreeBounds[index] = worldAABB;
mFreeTransforms[index] = transform;
mFreeStamps[index] = timeStamp;
return true;
}
// If we reach this place, the free array is full. We must transfer the objects from the free array to
// the main (core) arrays, mark the structure as invalid, and still deal with the incoming object.
// First we transfer free objects, reset the number of free objects, and mark the structure as
// invalid/dirty (the core arrays will need rebuilding).
for(PxU32 i=0;i<mNbFree;i++)
addObjectInternal(mFreeObjects[i], mFreeBounds[i], mFreeTransforms[i], mFreeStamps[i]);
mNbFree = 0;
#endif
mDirty = true;
// mSortedNb = 0; // PT: TODO: investigate if this should be done here
// After that we still need to deal with the new incoming object (so far we only
// transferred the already existing objects from the full free array). This will
// happen automatically by letting the code continue to the regular codepath below.
}
// If we reach this place, the structure must be invalid and the incoming object
// must be added to the main arrays.
PX_ASSERT(mDirty);
addObjectInternal(object, worldAABB, transform, timeStamp);
return true;
}
// Removes an object from the pruner. Returns false when the object is unknown
// (e.g. a double remove). On success, 'timeStamp' receives the timestamp the
// object was added with. Core-array objects are removed from the core arrays
// immediately; their sorted-array slot is only invalidated (box emptied) when
// the structure is valid, to avoid a full rebuild per removal.
bool BucketPrunerCore::removeObject(const PrunerPayload& object, PxU32& timeStamp)
{
// Even if the structure is already marked as dirty, we still need to update the
// core arrays and the map.
// The map only contains core objects, so we can use it to determine if the object
// exists in the core arrays or in the free array.
#ifdef USE_REGULAR_HASH_MAP
/* BucketPrunerPair entry;
if(mMap.findAndErase(object, entry))
{
PxU32 coreIndex = entry.mCoreIndex;
timeStamp = entry.mTimeStamp;*/
const BucketPrunerMap::Entry* removedEntry = mMap.find(object);
if(removedEntry)
{
PxU32 coreIndex = removedEntry->second.mCoreIndex;
timeStamp = removedEntry->second.mTimeStamp;
#else
PxU32 coreIndex; // This is the object's index in the core arrays.
if(mMap.removePair(object, coreIndex, timeStamp))
{
#endif
// In this codepath, the object we want to remove exists in the core arrays.
// We will need to remove it from both the core arrays & the sorted arrays.
const PxU32 sortedIndex = mCoreRemap[coreIndex]; // This is the object's index in the sorted arrays.
#ifdef USE_REGULAR_HASH_MAP
bool status = mMap.erase(object);
PX_ASSERT(status);
PX_UNUSED(status);
#endif
// First let's deal with the core arrays
mCoreNbObjects--;
if(coreIndex!=mCoreNbObjects)
{
// If it wasn't the last object in the array, close the gaps as usual
const PrunerPayload& movedObject = mCoreObjects[mCoreNbObjects];
mCoreBoxes[coreIndex] = mCoreBoxes[mCoreNbObjects];
mCoreTransforms[coreIndex] = mCoreTransforms[mCoreNbObjects];
mCoreObjects[coreIndex] = movedObject;
mCoreRemap[coreIndex] = mCoreRemap[mCoreNbObjects];
// Since we just moved the last object, its index in the core arrays has changed.
// We must reflect this change in the map.
#ifdef USE_REGULAR_HASH_MAP
BucketPrunerMap::Entry* movedEntry = const_cast<BucketPrunerMap::Entry*>(mMap.find(movedObject));
PX_ASSERT(movedEntry->second.mCoreIndex==mCoreNbObjects);
movedEntry->second.mCoreIndex = coreIndex;
#else
BucketPrunerPair* movedEntry = const_cast<BucketPrunerPair*>(mMap.findPair(movedObject));
PX_ASSERT(movedEntry->mCoreIndex==mCoreNbObjects);
movedEntry->mCoreIndex = coreIndex;
#endif
}
// Now, let's deal with the sorted arrays.
// If the structure is dirty, the sorted arrays will be rebuilt from scratch so there's no need to
// update them right now.
if(!mDirty)
{
// If the structure is valid, we want to keep it this way to avoid rebuilding sorted arrays after
// each removal. We can't "close the gaps" easily here because order of objects in the arrays matters.
// Instead we just invalidate the object by setting its bounding box as empty.
// Queries against empty boxes will never return a hit, so this effectively "removes" the object
// from any subsequent query results. Sorted arrays now contain a "disabled" object, until next build.
// Invalidating the box does not invalidate the sorting, since it's now captured in mData0/mData1.
// That is, mData0/mData1 keep their previous integer-encoded values, as if the box/object was still here.
mSortedWorldBoxes[sortedIndex].mCenter = PxVec3(0.0f);
mSortedWorldBoxes[sortedIndex].mExtents = PxVec3(-GU_EMPTY_BOUNDS_EXTENTS);
// Note that we don't touch mSortedObjects here. We could, but this is not necessary.
}
return true;
}
#ifdef FREE_PRUNER_SIZE
// Here, the object we want to remove exists in the free array. So we just parse it.
for(PxU32 i=0;i<mNbFree;i++)
{
if(mFreeObjects[i]==object)
{
// We found the object we want to remove. Close the gap as usual.
timeStamp = mFreeStamps[i];
mNbFree--;
mFreeBounds[i] = mFreeBounds[mNbFree];
mFreeTransforms[i] = mFreeTransforms[mNbFree];
mFreeObjects[i] = mFreeObjects[mNbFree];
mFreeStamps[i] = mFreeStamps[mNbFree];
return true;
}
}
#endif
// We didn't find the object. Can happen with a double remove. PX_ASSERT might be an option here.
return false;
}
// Updates an object's bounds/transform, implemented as remove + re-add so that
// the object keeps its original timestamp. Returns false if the object is unknown.
bool BucketPrunerCore::updateObject(const PxBounds3& worldAABB, const PrunerPayload& object, const PxTransform& transform)
{
	PxU32 timeStamp;
	const bool found = removeObject(object, timeStamp);
	if(!found)
		return false;
	return addObject(object, worldAABB, transform, timeStamp);
}
// Removes all objects whose timestamp equals 'timeStamp', both from the core
// arrays (via the hash-map) and from the 'free' array. Returns the number of
// removed objects. Mirrors the per-object logic of removeObject().
PxU32 BucketPrunerCore::removeMarkedObjects(PxU32 timeStamp)
{
PxU32 nbRemoved=0;
// PT: objects can be either in the hash-map, or in the 'free' array. First we look in the hash-map...
#ifdef USE_REGULAR_HASH_MAP
if(mMap.size())
#else
if(mMap.mNbActivePairs)
#endif
{
PxBounds3 empty;
empty.setEmpty();
const PxVec3 emptyCenter = empty.getCenter();
const PxVec3 emptyExtents = empty.getExtents();
// PT: hash-map is coalesced so we just parse it in linear order, no holes
PxU32 i=0;
#ifdef USE_REGULAR_HASH_MAP
PxU32 nbActivePairs = mMap.size();
const BucketPrunerMap::Entry* entries = mMap.mBase.getEntries();
#else
PxU32 nbActivePairs = mMap.mNbActivePairs;
#endif
PxU32 coreNbObjects = mCoreNbObjects; // PT: to avoid LHS
while(i<nbActivePairs)
{
#ifdef USE_REGULAR_HASH_MAP
const BucketPrunerMap::Entry& p = entries[i];
if(p.second.mTimeStamp==timeStamp)
#else
const BucketPrunerPair& p = mMap.mActivePairs[i];
if(p.mTimeStamp==timeStamp)
#endif
{
// PT: timestamps match. We must remove this object.
// PT: we replicate here what we do in BucketPrunerCore::removeObject(). See that function for details.
#ifdef USE_REGULAR_HASH_MAP
const PxU32 coreIndex = p.second.mCoreIndex;
#else
const PxU32 coreIndex = p.mCoreIndex;
#endif
if(!mDirty)
{
// PT: invalidating the box does not invalidate the sorting, since it's now captured in mData0/mData1
const PxU32 sortedIndex = mCoreRemap[coreIndex];
mSortedWorldBoxes[sortedIndex].mCenter = emptyCenter;
mSortedWorldBoxes[sortedIndex].mExtents = emptyExtents;
}
// Close the gap in the core arrays, moving the last object into the freed slot
coreNbObjects--;
if(coreIndex!=coreNbObjects)
{
const PrunerPayload& movedObject = mCoreObjects[coreNbObjects];
mCoreBoxes[coreIndex] = mCoreBoxes[coreNbObjects];
mCoreTransforms[coreIndex] = mCoreTransforms[coreNbObjects];
mCoreObjects[coreIndex] = movedObject;
mCoreRemap[coreIndex] = mCoreRemap[coreNbObjects];
// The moved object's core index changed: update its map entry
#ifdef USE_REGULAR_HASH_MAP
BucketPrunerMap::Entry* movedEntry = const_cast<BucketPrunerMap::Entry*>(mMap.find(movedObject));
PX_ASSERT(movedEntry->second.mCoreIndex==coreNbObjects);
movedEntry->second.mCoreIndex = coreIndex;
#else
BucketPrunerPair* movedEntry = const_cast<BucketPrunerPair*>(mMap.findPair(movedObject));
PX_ASSERT(movedEntry->mCoreIndex==coreNbObjects);
movedEntry->mCoreIndex = coreIndex;
#endif
}
nbRemoved++;
// Remove the map entry; note that 'i' is not incremented in this branch,
// since removal compacts the map and a new entry now occupies slot i
#ifdef USE_REGULAR_HASH_MAP
bool status = mMap.erase(p.first);
PX_ASSERT(status);
PX_UNUSED(status);
#else
const PxU32 hashValue = PxComputeHash(p.mData) & mMap.mMask;
mMap.removePairInternal(p.mData, hashValue, i);
#endif
nbActivePairs--;
}
else i++;
}
mCoreNbObjects = coreNbObjects;
#ifdef USE_REGULAR_HASH_MAP
#else
mMap.shrinkMemory();
#endif
}
#ifdef FREE_PRUNER_SIZE
// PT: ...then we look in the 'free' array
PxU32 i=0;
while(i<mNbFree)
{
if(mFreeStamps[i]==timeStamp)
{
nbRemoved++;
mNbFree--;
mFreeBounds[i] = mFreeBounds[mNbFree];
mFreeTransforms[i] = mFreeTransforms[mNbFree];
mFreeObjects[i] = mFreeObjects[mNbFree];
mFreeStamps[i] = mFreeStamps[mNbFree];
}
else i++;
}
#endif
return nbRemoved;
}
///////////////////////////////////////////////////////////////////////////////
// Computes the global bounds of the input boxes, selects the sort axis
// (Y when the global Y extent is smaller than the Z extent, else Z — see the
// comparison below), radix-sorts the boxes by their minimum value along that
// axis, and writes the reordered data out in center/extents form. Returns the
// chosen sort axis (1 or 2).
static PxU32 sortBoxes( PxU32 nb, const PxBounds3* PX_RESTRICT boxes, const PrunerPayload* PX_RESTRICT objects,
const PxTransform* PX_RESTRICT transforms,
BucketBox& _globalBox, BucketBox* PX_RESTRICT sortedBoxes, PrunerPayload* PX_RESTRICT sortedObjects
, PxTransform* PX_RESTRICT sortedTransforms)
{
// Compute global box & sort axis
PxU32 sortAxis;
{
PX_ASSERT(nb>0);
Vec4V mergedMinV = V4LoadU(&boxes[nb-1].minimum.x);
Vec4V mergedMaxV = Vec4V_From_Vec3V(V3LoadU(&boxes[nb-1].maximum.x));
for(PxU32 i=0;i<nb-1;i++)
{
mergedMinV = V4Min(mergedMinV, V4LoadU(&boxes[i].minimum.x));
mergedMaxV = V4Max(mergedMaxV, V4LoadU(&boxes[i].maximum.x));
}
/* PX_ALIGN(16, PxVec4) mergedMin;
PX_ALIGN(16, PxVec4) mergedMax;
V4StoreA(mergedMinV, &mergedMin.x);
V4StoreA(mergedMaxV, &mergedMax.x);
_globalBox.mCenter.x = (mergedMax.x + mergedMin.x)*0.5f;
_globalBox.mCenter.y = (mergedMax.y + mergedMin.y)*0.5f;
_globalBox.mCenter.z = (mergedMax.z + mergedMin.z)*0.5f;
_globalBox.mExtents.x = (mergedMax.x - mergedMin.x)*0.5f;
_globalBox.mExtents.y = (mergedMax.y - mergedMin.y)*0.5f;
_globalBox.mExtents.z = (mergedMax.z - mergedMin.z)*0.5f;*/
const float Half = 0.5f;
const FloatV HalfV = FLoad(Half);
PX_ALIGN(16, PxVec4) mergedCenter;
PX_ALIGN(16, PxVec4) mergedExtents;
const Vec4V mergedCenterV = V4Scale(V4Add(mergedMaxV, mergedMinV), HalfV);
const Vec4V mergedExtentsV = V4Scale(V4Sub(mergedMaxV, mergedMinV), HalfV);
V4StoreA(mergedCenterV, &mergedCenter.x);
V4StoreA(mergedExtentsV, &mergedExtents.x);
_globalBox.mCenter = PxVec3(mergedCenter.x, mergedCenter.y, mergedCenter.z);
_globalBox.mExtents = PxVec3(mergedExtents.x, mergedExtents.y, mergedExtents.z);
const PxF32 absY = PxAbs(_globalBox.mExtents.y);
const PxF32 absZ = PxAbs(_globalBox.mExtents.z);
sortAxis = PxU32(absY < absZ ? 1 : 2);
// printf("Sort axis: %d\n", sortAxis);
}
// The sortedObjects buffer is temporarily reused to hold the float sort keys;
// this is safe since the keys are only read by the radix sort below, before
// sortedObjects is written in the output loop.
float* keys = reinterpret_cast<float*>(sortedObjects);
for(PxU32 i=0;i<nb;i++)
keys[i] = boxes[i].minimum[sortAxis];
Cm::RadixSortBuffered rs; // ###TODO: some allocs here, remove
const PxU32* ranks = rs.Sort(keys, nb).GetRanks();
const float Half = 0.5f;
const FloatV HalfV = FLoad(Half);
for(PxU32 i=0;i<nb;i++)
{
const PxU32 index = *ranks++;
//const PxU32 index = local[i].index;
// sortedBoxes[i].mCenter = boxes[index].getCenter();
// sortedBoxes[i].mExtents = boxes[index].getExtents();
const Vec4V bucketBoxMinV = V4LoadU(&boxes[index].minimum.x);
const Vec4V bucketBoxMaxV = Vec4V_From_Vec3V(V3LoadU(&boxes[index].maximum.x));
const Vec4V bucketBoxCenterV = V4Scale(V4Add(bucketBoxMaxV, bucketBoxMinV), HalfV);
const Vec4V bucketBoxExtentsV = V4Scale(V4Sub(bucketBoxMaxV, bucketBoxMinV), HalfV);
// We don't need to preserve data0/data1 here
AlignedStore(bucketBoxCenterV, &sortedBoxes[i].mCenter.x);
AlignedStore(bucketBoxExtentsV, &sortedBoxes[i].mExtents.x);
#if PX_DEBUG
sortedBoxes[i].mDebugMin = boxes[index].minimum[sortAxis];
#endif
sortedObjects[i] = objects[index];
sortedTransforms[i] = transforms[index];
}
return sortAxis;
}
#ifdef NODE_SORT
// Generic swap helper, usable from both host and CUDA code paths.
template<class T>
PX_CUDA_CALLABLE PX_FORCE_INLINE void tswap(T& x, T& y)
{
	const T saved = y;
	y = x;
	x = saved;
}
/* PX_FORCE_INLINE __m128 DotV(const __m128 a, const __m128 b)
{
const __m128 dot1 = _mm_mul_ps(a, b);
const __m128 shuf1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(dot1), _MM_SHUFFLE(0,0,0,0)));
const __m128 shuf2 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(dot1), _MM_SHUFFLE(1,1,1,1)));
const __m128 shuf3 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(dot1), _MM_SHUFFLE(2,2,2,2)));
return _mm_add_ps(_mm_add_ps(shuf1, shuf2), shuf3);
}*/
// PT: hmmm, by construction, isn't the order always the same for all bucket pruners?
// => maybe not because the bucket boxes are still around the merged aabbs, not around the bucket
// Still we could do something here
// Computes a traversal order for the 5 child buckets of 'parent' along ray
// direction 'rayDir'. Returns 5 packed 3-bit child indices, least significant
// triplet first, ordered by increasing projection of each bucket center onto
// the ray direction (so nearer buckets are visited first).
// Nodes holding fewer than NODE_SORT_MIN_COUNT objects just get the identity
// order 0,1,2,3,4.
static /*PX_FORCE_INLINE*/ PxU32 sort(const BucketPrunerNode& parent, const PxVec3& rayDir)
{
	const PxU32 totalCount = parent.mCounters[0]+parent.mCounters[1]+parent.mCounters[2]+parent.mCounters[3]+parent.mCounters[4];
	if(totalCount<NODE_SORT_MIN_COUNT)
		return 0|(1<<3)|(2<<6)|(3<<9)|(4<<12);	// identity order

	float dp[5];
/*	const __m128 rayDirV = _mm_loadu_ps(&rayDir.x);
	__m128 dp0V = DotV(rayDirV, _mm_loadu_ps(&parent.mBucketBox[0].mCenter.x));	_mm_store_ss(&dp[0], dp0V);
	__m128 dp1V = DotV(rayDirV, _mm_loadu_ps(&parent.mBucketBox[1].mCenter.x));	_mm_store_ss(&dp[1], dp1V);
	__m128 dp2V = DotV(rayDirV, _mm_loadu_ps(&parent.mBucketBox[2].mCenter.x));	_mm_store_ss(&dp[2], dp2V);
	__m128 dp3V = DotV(rayDirV, _mm_loadu_ps(&parent.mBucketBox[3].mCenter.x));	_mm_store_ss(&dp[3], dp3V);
	__m128 dp4V = DotV(rayDirV, _mm_loadu_ps(&parent.mBucketBox[4].mCenter.x));	_mm_store_ss(&dp[4], dp4V);
*/
#ifdef VERIFY_SORT
	// Reference implementation: a plain 5-element bubble-sort on the absolute
	// projections, used to validate the bit-packed version below.
	PxU32 code;
	{
		// Empty buckets get PX_MAX_F32 so they sort last.
		dp[0] = parent.mCounters[0] ? PxAbs(parent.mBucketBox[0].mCenter.dot(rayDir)) : PX_MAX_F32;
		dp[1] = parent.mCounters[1] ? PxAbs(parent.mBucketBox[1].mCenter.dot(rayDir)) : PX_MAX_F32;
		dp[2] = parent.mCounters[2] ? PxAbs(parent.mBucketBox[2].mCenter.dot(rayDir)) : PX_MAX_F32;
		dp[3] = parent.mCounters[3] ? PxAbs(parent.mBucketBox[3].mCenter.dot(rayDir)) : PX_MAX_F32;
		dp[4] = parent.mCounters[4] ? PxAbs(parent.mBucketBox[4].mCenter.dot(rayDir)) : PX_MAX_F32;

		PxU32 ii0 = 0;
		PxU32 ii1 = 1;
		PxU32 ii2 = 2;
		PxU32 ii3 = 3;
		PxU32 ii4 = 4;

		// PT: using integer cmps since we used fabsf above
//		const PxU32* values = reinterpret_cast<const PxU32*>(dp);
		const PxU32* values = PxUnionCast<PxU32*, PxF32*>(dp);
		PxU32 value0 = values[0];
		PxU32 value1 = values[1];
		PxU32 value2 = values[2];
		PxU32 value3 = values[3];
		PxU32 value4 = values[4];

		for(PxU32 j=0;j<5-1;j++)
		{
			if(value1<value0)
			{
				tswap(value0, value1);
				tswap(ii0, ii1);
			}
			if(value2<value1)
			{
				tswap(value1, value2);
				tswap(ii1, ii2);
			}
			if(value3<value2)
			{
				tswap(value2, value3);
				tswap(ii2, ii3);
			}
			if(value4<value3)
			{
				tswap(value3, value4);
				tswap(ii3, ii4);
			}
		}
		//return ii0|(ii1<<3)|(ii2<<6)|(ii3<<9)|(ii4<<12);
		code = ii0|(ii1<<3)|(ii2<<6)|(ii3<<9)|(ii4<<12);
	}
#endif
	// Fast path: store each bucket's index in the low 3 bits of its key so the
	// sorting network only has to swap values, not separate index registers.
	// NOTE(review): unlike the VERIFY_SORT path this does not take PxAbs of the
	// dot product, so a negative projection compares (as unsigned int, sign bit
	// set) larger than any positive one - presumably acceptable for a traversal
	// heuristic; confirm against the VERIFY_SORT assert below.
	dp[0] = parent.mCounters[0] ? parent.mBucketBox[0].mCenter.dot(rayDir) : PX_MAX_F32;
	dp[1] = parent.mCounters[1] ? parent.mBucketBox[1].mCenter.dot(rayDir) : PX_MAX_F32;
	dp[2] = parent.mCounters[2] ? parent.mBucketBox[2].mCenter.dot(rayDir) : PX_MAX_F32;
	dp[3] = parent.mCounters[3] ? parent.mBucketBox[3].mCenter.dot(rayDir) : PX_MAX_F32;
	dp[4] = parent.mCounters[4] ? parent.mBucketBox[4].mCenter.dot(rayDir) : PX_MAX_F32;

	// Mask clears the sign bit and the low 3 bits, making room for the index.
	const PxU32* values = PxUnionCast<PxU32*, PxF32*>(dp);
//	const PxU32 mask = ~7U;
	const PxU32 mask = 0x7ffffff8;
	PxU32 value0 = (values[0]&mask);
	PxU32 value1 = (values[1]&mask)|1;
	PxU32 value2 = (values[2]&mask)|2;
	PxU32 value3 = (values[3]&mask)|3;
	PxU32 value4 = (values[4]&mask)|4;

	// 4 bubble passes fully sort 5 elements.
#define SORT_BLOCK									\
	if(value1<value0)	tswap(value0, value1);		\
	if(value2<value1)	tswap(value1, value2);		\
	if(value3<value2)	tswap(value2, value3);		\
	if(value4<value3)	tswap(value3, value4);
	SORT_BLOCK
	SORT_BLOCK
	SORT_BLOCK
	SORT_BLOCK

	// Recover the indices from the low 3 bits and pack the order.
	const PxU32 ii0 = value0&7;
	const PxU32 ii1 = value1&7;
	const PxU32 ii2 = value2&7;
	const PxU32 ii3 = value3&7;
	const PxU32 ii4 = value4&7;
	const PxU32 code2 = ii0|(ii1<<3)|(ii2<<6)|(ii3<<9)|(ii4<<12);
#ifdef VERIFY_SORT
	PX_ASSERT(code2==code);
#endif
	return code2;
}
// Caches the child-traversal order of 'node' for each of the 8 octant
// directions, so queries can look it up instead of sorting per ray.
static void gPrecomputeSort(BucketPrunerNode& node, const PxVec3* PX_RESTRICT dirs)
{
	for(PxU32 dirIndex=0; dirIndex<8; dirIndex++)
		node.mOrder[dirIndex] = PxTo16(sort(node, dirs[dirIndex]));
}
#endif
// Rebuilds the bucket acceleration structure from the "core" (unsorted) arrays
// mCoreBoxes/mCoreObjects/mCoreTransforms. Lazily evaluated: does nothing
// unless mDirty is set. On exit the sorted arrays hold all objects grouped by
// the 3-level (5x5x5) bucket hierarchy, with each sorted box's min/max encoded
// along the sort axis (mData0/mData1) for fast culling during queries.
void BucketPrunerCore::classifyBoxes()
{
	if(!mDirty)
		return;
	mDirty = false;

	const PxU32 nb = mCoreNbObjects;
	if(!nb)
	{
		mSortedNb=0;
		return;
	}

	// Free objects must have been flushed into the core arrays before a rebuild.
	PX_ASSERT(!mNbFree);

#ifdef BRUTE_FORCE_LIMIT
	// Small-count path: skip the hierarchy, just convert boxes to
	// center/extents form. (Compiled out unless BRUTE_FORCE_LIMIT is defined.)
	if(nb<=BRUTE_FORCE_LIMIT)
	{
		allocateSortedMemory(nb);
		BucketBox* sortedBoxes = mSortedWorldBoxes;
		PrunerPayload* sortedObjects = mSortedObjects;

		const float Half = 0.5f;
		const __m128 HalfV = _mm_load1_ps(&Half);
		PX_ALIGN(16, PxVec4) bucketCenter;
		PX_ALIGN(16, PxVec4) bucketExtents;
		for(PxU32 i=0;i<nb;i++)
		{
			const __m128 bucketBoxMinV = _mm_loadu_ps(&mCoreBoxes[i].minimum.x);
			const __m128 bucketBoxMaxV = _mm_loadu_ps(&mCoreBoxes[i].maximum.x);
			const __m128 bucketBoxCenterV = _mm_mul_ps(_mm_add_ps(bucketBoxMaxV, bucketBoxMinV), HalfV);
			const __m128 bucketBoxExtentsV = _mm_mul_ps(_mm_sub_ps(bucketBoxMaxV, bucketBoxMinV), HalfV);
			_mm_store_ps(&bucketCenter.x, bucketBoxCenterV);
			_mm_store_ps(&bucketExtents.x, bucketBoxExtentsV);
			sortedBoxes[i].mCenter = PxVec3(bucketCenter.x, bucketCenter.y, bucketCenter.z);
			sortedBoxes[i].mExtents = PxVec3(bucketExtents.x, bucketExtents.y, bucketExtents.z);
			sortedObjects[i] = mCoreObjects[i];
		}
		return;
	}
#endif

	// Save each payload's data[0] and temporarily replace it with the object's
	// core index: after sorting, data[0] tells us where each sorted object came
	// from, letting us rebuild mCoreRemap and restore the original payloads.
	size_t* remap = reinterpret_cast<size_t*>(PX_ALLOC(nb*sizeof(size_t), ""));
	for(PxU32 i=0;i<nb;i++)
	{
		remap[i] = mCoreObjects[i].data[0];
		mCoreObjects[i].data[0] = i;
	}

//	printf("Nb objects: %d\n", nb);

	// Scratch buffers for the sorted-but-not-yet-bucketed data; stack storage
	// for small counts, heap otherwise.
	PrunerPayload localTempObjects[LOCAL_SIZE];
	BucketBox localTempBoxes[LOCAL_SIZE];
	PxTransform localTempTransforms[LOCAL_SIZE];
	PrunerPayload* tempObjects;
	PxTransform* tempTransforms;
	BucketBox* tempBoxes;
	if(nb>LOCAL_SIZE)
	{
		tempObjects = PX_ALLOCATE(PrunerPayload, nb, "BucketPruner");
		tempBoxes = PX_ALLOCATE(BucketBox, nb, "BucketPruner");
		tempTransforms = PX_ALLOCATE(PxTransform, nb, "BucketPruner");
	}
	else
	{
		tempObjects = localTempObjects;
		tempBoxes = localTempBoxes;
		tempTransforms = localTempTransforms;
	}

	// Sort everything along the chosen axis and compute the global bounds.
	mSortAxis = sortBoxes(nb, mCoreBoxes, mCoreObjects, mCoreTransforms, mGlobalBox, tempBoxes, tempObjects, tempTransforms);
	PX_ASSERT(mSortAxis);	// sortBoxes only returns 1 (Y) or 2 (Z)

	allocateSortedMemory(nb);
	BucketBox* sortedBoxes = mSortedWorldBoxes;
	PrunerPayload* sortedObjects = mSortedObjects;
	PxTransform* sortedTransforms = mSortedTransforms;

	// Split planes for the level-1 classification: X center, plus the center of
	// whichever of Y/Z is NOT the sort axis.
	const PxU32 yz = PxU32(mSortAxis == 1 ? 2 : 1);
	const float limitX = mGlobalBox.mCenter.x;
	const float limitYZ = mGlobalBox.mCenter[yz];
	// Level 1: classify all objects into 5 buckets; then recursively split each
	// level-1 and level-2 bucket into its 5 children.
	mLevel1.classifyBoxes(limitX, limitYZ, nb, tempBoxes, tempObjects, tempTransforms, sortedBoxes, sortedObjects, sortedTransforms, false, mSortAxis);

	processChildBuckets(nb, tempBoxes, tempObjects, tempTransforms, mLevel1, mLevel2, mSortedWorldBoxes, mSortedObjects, mSortedTransforms, mSortAxis);

	for(PxU32 j=0;j<5;j++)
		processChildBuckets(nb, tempBoxes, tempObjects, tempTransforms, mLevel2[j], mLevel3[j], mSortedWorldBoxes + mLevel1.mOffsets[j], mSortedObjects + mLevel1.mOffsets[j], mSortedTransforms + mLevel1.mOffsets[j], mSortAxis);

	// Encode each sorted box's min/max along the sort axis into mData0/mData1
	// for the integer-compare culling used by the queries.
	{
		for(PxU32 i=0;i<nb;i++)
		{
			encodeBoxMinMax(mSortedWorldBoxes[i], mSortAxis);
		}
	}

	if(nb>LOCAL_SIZE)
	{
		PX_FREE(tempTransforms);
		PX_FREE(tempBoxes);
		PX_FREE(tempObjects);
	}

	// Restore the saved data[0] values (in both sorted and core payloads) and
	// record the core->sorted index mapping.
	for(PxU32 i=0;i<nb;i++)
	{
		const PxU32 coreIndex = PxU32(mSortedObjects[i].data[0]);
		const size_t saved = remap[coreIndex];
		mSortedObjects[i].data[0] = saved;

		mCoreObjects[coreIndex].data[0] = saved;

		if(mCoreRemap)
			mCoreRemap[coreIndex] = i;
//		remap[i] = mCoreObjects[i].data[0];
//		mCoreObjects[i].data[0] = i;
	}
	PX_FREE(remap);

/*	if(mOwnMemory)
	{
		PX_FREE(mCoreBoxes);
		PX_FREE(mCoreObjects);
	}*/

#ifdef NODE_SORT
	// Precompute, for every node, the front-to-back child order for the 8
	// normalized octant directions.
	{
		PxVec3 dirs[8];
		dirs[0] = PxVec3(1.0f, 1.0f, 1.0f);
		dirs[1] = PxVec3(1.0f, 1.0f, -1.0f);
		dirs[2] = PxVec3(1.0f, -1.0f, 1.0f);
		dirs[3] = PxVec3(1.0f, -1.0f, -1.0f);
		dirs[4] = PxVec3(-1.0f, 1.0f, 1.0f);
		dirs[5] = PxVec3(-1.0f, 1.0f, -1.0f);
		dirs[6] = PxVec3(-1.0f, -1.0f, 1.0f);
		dirs[7] = PxVec3(-1.0f, -1.0f, -1.0f);

		for(int i=0;i<8;i++)
			dirs[i].normalize();

		gPrecomputeSort(mLevel1, dirs);

		for(PxU32 i=0;i<5;i++)
			gPrecomputeSort(mLevel2[i], dirs);

		for(PxU32 j=0;j<5;j++)
		{
			for(PxU32 i=0;i<5;i++)
				gPrecomputeSort(mLevel3[j][i], dirs);
		}
	}
#endif
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef CAN_USE_MOVEMASK
namespace
{
	// Precomputed segment data for the movemask-based segment-vs-AABB test:
	// mData  = half-extent vector of the segment (0.5 * dir * maxDist)
	// mData2 = segment midpoint (origin + mData)
	// mFDir  = per-component absolute value of mData
	// mInflate = per-axis AABB inflation, only set for sweeps (inflateT=1).
	// The padding floats keep each PxVec3 16-byte addressable for aligned
	// SIMD loads/stores.
	struct RayParams
	{
		PX_ALIGN(16, PxVec3	mData2);	float	padding0;
		PX_ALIGN(16, PxVec3	mFDir);		float	padding1;
		PX_ALIGN(16, PxVec3	mData);		float	padding2;
		PX_ALIGN(16, PxVec3	mInflate);	float	padding3;
	};
}
// Converts a (origin, direction, maxDist) ray into the segment-centric form
// used by segmentAABB(): half-extent vector mData, midpoint mData2, and
// absolute half-extents mFDir. Called again whenever a hit shrinks maxDist.
static PX_FORCE_INLINE void precomputeRayData(RayParams* PX_RESTRICT rayParams, const PxVec3& rayOrig, const PxVec3& rayDir, float maxDist)
{
#ifdef USE_SIMD
	const float Half = 0.5f * maxDist;
	const __m128 HalfV = _mm_load1_ps(&Half);
	const __m128 DataV = _mm_mul_ps(_mm_loadu_ps(&rayDir.x), HalfV);
	const __m128 Data2V = _mm_add_ps(_mm_loadu_ps(&rayOrig.x), DataV);
	// Clearing the sign bit (mask 0x7fffffff) computes the per-lane fabsf.
	const PxU32 MaskI = 0x7fffffff;
	const __m128 FDirV = _mm_and_ps(_mm_load1_ps(reinterpret_cast<const float*>(&MaskI)), DataV);
	_mm_store_ps(&rayParams->mData.x, DataV);
	_mm_store_ps(&rayParams->mData2.x, Data2V);
	_mm_store_ps(&rayParams->mFDir.x, FDirV);
#else
	// Scalar fallback, same math as above.
	const PxVec3 data = 0.5f * rayDir * maxDist;
	rayParams->mData = data;
	rayParams->mData2 = rayOrig + data;
	rayParams->mFDir.x = PxAbs(data.x);
	rayParams->mFDir.y = PxAbs(data.y);
	rayParams->mFDir.z = PxAbs(data.z);
#endif
}
// Segment-vs-AABB overlap test (separating axis: the 3 box face axes, then
// the 3 segment-direction x box-axis cross products - see the scalar version
// in the #else branch for the reference math). With inflateT!=0 the box is
// inflated by params->mInflate (used for sweeps). Returns 1 on overlap.
template <int inflateT>
static PX_FORCE_INLINE PxIntBool segmentAABB(const BucketBox& box, const RayParams* PX_RESTRICT params)
{
#ifdef USE_SIMD
	const PxU32 maskI = 0x7fffffff;	// sign-bit mask for per-lane fabsf
	const __m128 fdirV = _mm_load_ps(&params->mFDir.x);

//	#ifdef _DEBUG
	const __m128 extentsV = inflateT ? _mm_add_ps(_mm_loadu_ps(&box.mExtents.x), _mm_load_ps(&params->mInflate.x)) : _mm_loadu_ps(&box.mExtents.x);
	const __m128 DV = _mm_sub_ps(_mm_load_ps(&params->mData2.x), _mm_loadu_ps(&box.mCenter.x));
/*	#else
	const __m128 extentsV = inflateT ? _mm_add_ps(_mm_load_ps(&box.mExtents.x), _mm_load_ps(&params->mInflate.x)) : _mm_load_ps(&box.mExtents.x);
	const __m128 DV = _mm_sub_ps(_mm_load_ps(&params->mData2.x), _mm_load_ps(&box.mCenter.x));
	#endif*/

	// Face-axis tests: separated if |segCenter - boxCenter| > extents + |segHalf|
	// on any of the three axes (only lanes 0..2 of the movemask are checked).
	__m128 absDV = _mm_and_ps(DV, _mm_load1_ps(reinterpret_cast<const float*>(&maskI)));
	absDV = _mm_cmpgt_ps(absDV, _mm_add_ps(extentsV, fdirV));
	const PxU32 test = PxU32(_mm_movemask_ps(absDV));
	if(test&7)
		return 0;

	// Cross-axis tests: compare |cross(segHalf, D)| against the projected box
	// extents, computed for all three axes at once via YZX/XZY shuffles.
	const __m128 dataZYX_V = _mm_load_ps(&params->mData.x);
	const __m128 dataXZY_V = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(dataZYX_V), _MM_SHUFFLE(3,0,2,1)));
	const __m128 DXZY_V = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(DV), _MM_SHUFFLE(3,0,2,1)));
	const __m128 fV = _mm_sub_ps(_mm_mul_ps(dataZYX_V, DXZY_V), _mm_mul_ps(dataXZY_V, DV));

	const __m128 fdirZYX_V = _mm_load_ps(&params->mFDir.x);
	const __m128 fdirXZY_V = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(fdirZYX_V), _MM_SHUFFLE(3,0,2,1)));
	const __m128 extentsXZY_V = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(extentsV), _MM_SHUFFLE(3,0,2,1)));
	const __m128 fg = _mm_add_ps(_mm_mul_ps(extentsV, fdirXZY_V), _mm_mul_ps(extentsXZY_V, fdirZYX_V));

	__m128 absfV = _mm_and_ps(fV, _mm_load1_ps(reinterpret_cast<const float*>(&maskI)));
	absfV = _mm_cmpgt_ps(absfV, fg);
	const PxU32 test2 = PxU32(_mm_movemask_ps(absfV));
	if(test2&7)
		return 0;
	return 1;
#else
	// Scalar reference: 3 slab tests + 3 cross-product axis tests.
	const float boxExtentsx = inflateT ? box.mExtents.x + params->mInflate.x : box.mExtents.x;
	const float Dx = params->mData2.x - box.mCenter.x;	if(fabsf(Dx) > boxExtentsx + params->mFDir.x)	return PxIntFalse;

	const float boxExtentsz = inflateT ? box.mExtents.z + params->mInflate.z : box.mExtents.z;
	const float Dz = params->mData2.z - box.mCenter.z;	if(fabsf(Dz) > boxExtentsz + params->mFDir.z)	return PxIntFalse;

	const float boxExtentsy = inflateT ? box.mExtents.y + params->mInflate.y : box.mExtents.y;
	const float Dy = params->mData2.y - box.mCenter.y;	if(fabsf(Dy) > boxExtentsy + params->mFDir.y)	return PxIntFalse;

	float f;
	f = params->mData.y * Dz - params->mData.z * Dy;	if(fabsf(f) > boxExtentsy*params->mFDir.z + boxExtentsz*params->mFDir.y)	return PxIntFalse;
	f = params->mData.z * Dx - params->mData.x * Dz;	if(fabsf(f) > boxExtentsx*params->mFDir.z + boxExtentsz*params->mFDir.x)	return PxIntFalse;
	f = params->mData.x * Dy - params->mData.y * Dx;	if(fabsf(f) > boxExtentsx*params->mFDir.y + boxExtentsy*params->mFDir.x)	return PxIntFalse;
	return PxIntTrue;
#endif
}
#else
#include "GuBVHTestsSIMD.h"
typedef RayAABBTest BPRayAABBTest;
// Thin wrapper over the shared RayAABBTest: segment-vs-AABB overlap, with the
// box inflated by the test's extents when inflateT!=0 (sweeps).
template <int inflateT>
static PX_FORCE_INLINE PxIntBool segmentAABB(const BucketBox& box, const BPRayAABBTest& test)
{
	return static_cast<PxIntBool>(test.check<inflateT>(V3LoadU(box.mCenter), V3LoadU(box.mExtents)));
}
/*static PX_FORCE_INLINE IntBool segmentAABB(const BucketBox& box, const BPRayAABBTest& test, PxU32 rayMinLimitX, PxU32 rayMaxLimitX)
{
if(rayMinLimitX>box.mData1)
return 0;
if(rayMaxLimitX<box.mData0)
return 0;
return test(Vec3V_From_PxVec3(box.mCenter), Vec3V_From_PxVec3(box.mExtents));
}*/
#endif
namespace
{
	// Adapts a PrunerRaycastCallback (which wants the payload/transform arrays
	// passed through) to a simple (distance, primIndex) call against a fixed
	// base of the sorted arrays. primIndex is relative to the base passed at
	// construction time.
	struct BucketPrunerRaycastAdapter
	{
		PX_FORCE_INLINE	BucketPrunerRaycastAdapter(PrunerRaycastCallback& pcb, const PrunerPayload* payloads, const PxTransform* transforms) :
			mCallback(pcb), mPayloads(payloads), mTransforms(transforms)	{}

		PX_FORCE_INLINE bool	invoke(PxReal& distance, PxU32 primIndex)
		{
			return mCallback.invoke(distance, primIndex, mPayloads, mTransforms);
		}

		PrunerRaycastCallback&	mCallback;
		const PrunerPayload*	mPayloads;
		const PxTransform*		mTransforms;

		PX_NOCOPY(BucketPrunerRaycastAdapter)
	};

	// Same idea for overlap queries: forwards primIndex plus the captured
	// payload/transform base arrays to the user callback.
	struct BucketPrunerOverlapAdapter
	{
		PX_FORCE_INLINE	BucketPrunerOverlapAdapter(PrunerOverlapCallback& pcb, const PrunerPayload* payloads, const PxTransform* transforms) :
			mCallback(pcb), mPayloads(payloads), mTransforms(transforms)	{}

		PX_FORCE_INLINE bool	invoke(PxU32 primIndex)
		{
			return mCallback.invoke(primIndex, mPayloads, mTransforms);
		}

		PrunerOverlapCallback&	mCallback;
		const PrunerPayload*	mPayloads;
		const PxTransform*		mTransforms;

		PX_NOCOPY(BucketPrunerOverlapAdapter)
	};
}
// Raycasts/sweeps against the leaf boxes of one bucket.
// The bucket's boxes are sorted along sortAxis and carry encoded min/max
// values in mData0/mData1, so boxes ending before the ray's entry point are
// skipped and iteration stops at the first box starting past its exit point.
// Whenever the callback shrinks maxDist, the encoded ray limits (and the
// precomputed ray/segment data) are recomputed to tighten further culling;
// the updated limits are written back through _rayMinLimitInt/_rayMaxLimitInt
// so sibling buckets benefit too.
// Returns false if the callback aborted the query, true otherwise.
template <int inflateT>
static bool processBucket(
	PxU32 nb, const BucketBox* PX_RESTRICT baseBoxes, const PrunerPayload* PX_RESTRICT baseObjects,
	const PxTransform* PX_RESTRICT baseTransforms, PxU32 offset, PxU32 totalAllocated,
	const PxVec3& rayOrig, const PxVec3& rayDir, float& maxDist,
#ifdef CAN_USE_MOVEMASK
	RayParams* PX_RESTRICT rayParams,
#else
	BPRayAABBTest& test, const PxVec3& inflate,
#endif
	PrunerRaycastCallback& pcbArgName, PxU32& _rayMinLimitInt, PxU32& _rayMaxLimitInt, PxU32 sortAxis)
{
	PX_UNUSED(totalAllocated);

	const BucketBox* PX_RESTRICT _boxes = baseBoxes + offset;
	BucketPrunerRaycastAdapter pcb(pcbArgName, baseObjects + offset, baseTransforms + offset);

	PxU32 rayMinLimitInt = _rayMinLimitInt;
	PxU32 rayMaxLimitInt = _rayMaxLimitInt;

	const BucketBox* last = _boxes + nb;
	PxU32 objectID = 0;
	while(_boxes!=last)
	{
		const BucketBox& currentBox = *_boxes++;
		const PxU32 currentID = objectID++;

		// Box ends before the ray enters: skip it.
		if(currentBox.mData1<rayMinLimitInt)
			continue;

		// Box starts after the ray exits: all following boxes do too (sorted).
		if(currentBox.mData0>rayMaxLimitInt)
			goto Exit;

#ifdef CAN_USE_MOVEMASK
		if(!segmentAABB<inflateT>(currentBox, rayParams))
			continue;
#else
		if(!segmentAABB<inflateT>(currentBox, test))
			continue;
#endif
		const float MaxDist = maxDist;
		const bool again = pcb.invoke(maxDist, currentID);
		if(!again)
			return false;

		// Hit shortened the ray: recompute culling limits and segment data.
		if(maxDist < MaxDist)
		{
			float rayMinLimit, rayMaxLimit;
#ifdef CAN_USE_MOVEMASK
			if(inflateT)
				computeRayLimits(rayMinLimit, rayMaxLimit, rayOrig, rayDir, maxDist, rayParams->mInflate, sortAxis);
			else
				computeRayLimits(rayMinLimit, rayMaxLimit, rayOrig, rayDir, maxDist, sortAxis);

			precomputeRayData(rayParams, rayOrig, rayDir, maxDist);
#else
			if(inflateT)
				computeRayLimits(rayMinLimit, rayMaxLimit, rayOrig, rayDir, maxDist, inflate, sortAxis);
			else
				computeRayLimits(rayMinLimit, rayMaxLimit, rayOrig, rayDir, maxDist, sortAxis);

			test.setDistance(maxDist);
#endif
			// Re-encode the float limits for the integer compares above.
			const PxU32* binaryMinLimit = reinterpret_cast<const PxU32*>(&rayMinLimit);
			const PxU32* binaryMaxLimit = reinterpret_cast<const PxU32*>(&rayMaxLimit);
			rayMinLimitInt = encodeFloat(binaryMinLimit[0]);
			rayMaxLimitInt = encodeFloat(binaryMaxLimit[0]);
		}
	}
Exit:
	_rayMinLimitInt = rayMinLimitInt;
	_rayMaxLimitInt = rayMaxLimitInt;
	return true;
}
#ifdef NODE_SORT
// Packs the sign bits of a direction vector into a 3-bit octant index
// (bit 2 = X negative, bit 1 = Y negative, bit 0 = Z negative), used to look
// up the precomputed per-node traversal order.
static PxU32 computeDirMask(const PxVec3& dir)
{
	const PxU32* raw = reinterpret_cast<const PxU32*>(&dir.x);
	PxU32 mask = 0;
	if(raw[0] & 0x80000000)
		mask |= 4;	// X sign
	if(raw[1] & 0x80000000)
		mask |= 2;	// Y sign
	if(raw[2] & 0x80000000)
		mask |= 1;	// Z sign
	return mask;
}
#endif
// Shared raycast/sweep traversal over the whole bucket pruner.
// inflateT=0: plain raycast; inflateT=1: sweep, with the query AABB's extents
// passed as per-axis 'inflate'. Visits the "free" (not yet classified) objects
// first, then walks the 3-level 5x5x5 bucket hierarchy, visiting children
// front-to-back when NODE_SORT is enabled. Returns false if the callback
// aborted the query; otherwise true, with maxDist updated to the closest hit.
template <int inflateT>
static bool stab(const BucketPrunerCore& core, PrunerRaycastCallback& pcbArgName, const PxVec3& rayOrig, const PxVec3& rayDir, float& maxDist, const PxVec3 inflate)
{
	const PxU32 nb = core.mSortedNb;
	if(!nb
#ifdef FREE_PRUNER_SIZE
		&& !core.mNbFree
#endif
		)
		return true;

	// Infinite rays: clip the distance to the structure's (inflated) bounds so
	// the segment-based tests below have a finite extent to work with.
	if(maxDist==PX_MAX_F32)
	{
		/*const*/ PxVec3 boxMin = core.mGlobalBox.getMin() - inflate;
		/*const*/ PxVec3 boxMax = core.mGlobalBox.getMax() + inflate;
#ifdef FREE_PRUNER_SIZE
		if(core.mNbFree)
		{
			// TODO: optimize this
			// Free objects live outside mGlobalBox; merge their bounds in.
			PxBounds3 freeGlobalBounds;
			freeGlobalBounds.setEmpty();
			for(PxU32 i=0;i<core.mNbFree;i++)
				freeGlobalBounds.include(core.mFreeBounds[i]);
			freeGlobalBounds.minimum -= inflate;
			freeGlobalBounds.maximum += inflate;
			boxMin = boxMin.minimum(freeGlobalBounds.minimum);
			boxMax = boxMax.maximum(freeGlobalBounds.maximum);
		}
#endif
		clipRay(rayOrig, rayDir, maxDist, boxMin, boxMax);
	}

#ifdef CAN_USE_MOVEMASK
	RayParams rayParams;
	#ifdef USE_SIMD
	rayParams.padding0 = rayParams.padding1 = rayParams.padding2 = rayParams.padding3 = 0.0f;
	#endif
	if(inflateT)
		rayParams.mInflate = inflate;

	precomputeRayData(&rayParams, rayOrig, rayDir, maxDist);
#else
	BPRayAABBTest test(rayOrig, rayDir, maxDist, inflateT ? inflate : PxVec3(0.0f));
#endif

#ifdef FREE_PRUNER_SIZE
	// Free objects: brute-force test each one before the hierarchy.
	BucketPrunerRaycastAdapter pcb(pcbArgName, core.mFreeObjects, core.mFreeTransforms);

	for(PxU32 i=0;i<core.mNbFree;i++)
	{
		BucketBox tmp;
		tmp.mCenter = core.mFreeBounds[i].getCenter();
		tmp.mExtents = core.mFreeBounds[i].getExtents();

#ifdef CAN_USE_MOVEMASK
		if(segmentAABB<inflateT>(tmp, &rayParams))
#else
		if(segmentAABB<inflateT>(tmp, test))
#endif
		{
			if(!pcb.invoke(maxDist, i))
				return false;
		}
	}
#endif
	if(!nb)
		return true;

	// Early-out if the segment misses the whole structure.
#ifdef CAN_USE_MOVEMASK
	if(!segmentAABB<inflateT>(core.mGlobalBox, &rayParams))
		return true;
#else
	if(!segmentAABB<inflateT>(core.mGlobalBox, test))
		return true;
#endif

	// Encoded ray entry/exit limits along the sort axis, for the integer
	// min/max culling inside processBucket().
	const PxU32 sortAxis = core.mSortAxis;
	float rayMinLimit, rayMaxLimit;
	if(inflateT)
		computeRayLimits(rayMinLimit, rayMaxLimit, rayOrig, rayDir, maxDist, inflate, sortAxis);
	else
		computeRayLimits(rayMinLimit, rayMaxLimit, rayOrig, rayDir, maxDist, sortAxis);

	const PxU32* binaryMinLimit = reinterpret_cast<const PxU32*>(&rayMinLimit);
	const PxU32* binaryMaxLimit = reinterpret_cast<const PxU32*>(&rayMaxLimit);
	PxU32 rayMinLimitInt = encodeFloat(binaryMinLimit[0]);
	PxU32 rayMaxLimitInt = encodeFloat(binaryMaxLimit[0]);

/*
	float rayMinLimitX, rayMaxLimitX;
	if(inflateT)
		computeRayLimits(rayMinLimitX, rayMaxLimitX, rayOrig, rayDir, maxDist, inflate, 0);
	else
		computeRayLimits(rayMinLimitX, rayMaxLimitX, rayOrig, rayDir, maxDist, 0);

	PxU32 rayMinLimitIntX = encodeFloat(PX_IR(rayMinLimitX));
	PxU32 rayMaxLimitIntX = encodeFloat(PX_IR(rayMaxLimitX));
*/
	float currentDist = maxDist;

	// Walk the 3 bucket levels. With NODE_SORT the child order comes from the
	// per-node table indexed by the ray's octant; otherwise plain 0..4 order.
#ifdef NODE_SORT
	const PxU32 dirIndex = computeDirMask(rayDir);
	PxU32 orderi = core.mLevel1.mOrder[dirIndex];
//	PxU32 orderi = sort(core.mLevel1, rayDir);

	for(PxU32 i_=0;i_<5;i_++)
	{
		const PxU32 i = orderi&7;	orderi>>=3;
#else
	for(PxU32 i=0;i<5;i++)
	{
#endif
#ifdef CAN_USE_MOVEMASK
		if(core.mLevel1.mCounters[i] && segmentAABB<inflateT>(core.mLevel1.mBucketBox[i], &rayParams))
#else
		if(core.mLevel1.mCounters[i] && segmentAABB<inflateT>(core.mLevel1.mBucketBox[i], test))
//		if(core.mLevel1.mCounters[i] && segmentAABB<inflateT>(core.mLevel1.mBucketBox[i], test, rayMinLimitIntX, rayMaxLimitIntX))
#endif
		{
#ifdef NODE_SORT
			PxU32 orderj = core.mLevel2[i].mOrder[dirIndex];
//			PxU32 orderj = sort(core.mLevel2[i], rayDir);

			for(PxU32 j_=0;j_<5;j_++)
			{
				const PxU32 j = orderj&7;	orderj>>=3;
#else
			for(PxU32 j=0;j<5;j++)
			{
#endif
#ifdef CAN_USE_MOVEMASK
				if(core.mLevel2[i].mCounters[j] && segmentAABB<inflateT>(core.mLevel2[i].mBucketBox[j], &rayParams))
#else
				if(core.mLevel2[i].mCounters[j] && segmentAABB<inflateT>(core.mLevel2[i].mBucketBox[j], test))
//				if(core.mLevel2[i].mCounters[j] && segmentAABB<inflateT>(core.mLevel2[i].mBucketBox[j], test, rayMinLimitIntX, rayMaxLimitIntX))
#endif
				{
					const BucketPrunerNode& parent = core.mLevel3[i][j];
					// Offsets accumulate: level-3 leaf ranges are relative to
					// their level-1 + level-2 parents.
					const PxU32 parentOffset = core.mLevel1.mOffsets[i] + core.mLevel2[i].mOffsets[j];
#ifdef NODE_SORT
					PxU32 orderk = parent.mOrder[dirIndex];
//					PxU32 orderk = sort(parent, rayDir);

					for(PxU32 k_=0;k_<5;k_++)
					{
						const PxU32 k = orderk&7;	orderk>>=3;
#else
					for(PxU32 k=0;k<5;k++)
					{
#endif
						const PxU32 nbInBucket = parent.mCounters[k];
#ifdef CAN_USE_MOVEMASK
						if(nbInBucket && segmentAABB<inflateT>(parent.mBucketBox[k], &rayParams))
#else
						if(nbInBucket && segmentAABB<inflateT>(parent.mBucketBox[k], test))
//						if(nbInBucket && segmentAABB<inflateT>(parent.mBucketBox[k], test, rayMinLimitIntX, rayMaxLimitIntX))
#endif
						{
							const PxU32 offset = parentOffset + parent.mOffsets[k];
							const bool again = processBucket<inflateT>(	nbInBucket, core.mSortedWorldBoxes, core.mSortedObjects,
																		core.mSortedTransforms,
																		offset, core.mSortedNb,
																		rayOrig, rayDir, currentDist,
#ifdef CAN_USE_MOVEMASK
																		&rayParams,
#else
																		test, inflate,
#endif
																		pcbArgName,
																		rayMinLimitInt, rayMaxLimitInt,
																		sortAxis);
							if(!again)
								return false;
						}
					}
				}
			}
		}
	}
	maxDist = currentDist;
	return true;
}
bool BucketPrunerCore::raycast(const PxVec3& origin, const PxVec3& unitDir, PxReal& inOutDistance, PrunerRaycastCallback& pcb) const
{
return ::stab<0>(*this, pcb, origin, unitDir, inOutDistance, PxVec3(0.0f));
}
bool BucketPrunerCore::sweep(const ShapeData& queryVolume, const PxVec3& unitDir, PxReal& inOutDistance, PrunerRaycastCallback& pcb) const
{
const PxVec3 extents = queryVolume.getPrunerInflatedWorldAABB().getExtents();
return ::stab<1>(*this, pcb, queryVolume.getPrunerInflatedWorldAABB().getCenter(), unitDir, inOutDistance, extents);
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// PT: TODO: decoupling the pruner callback revealed quite a bug here: we call this processBucket function with an inflateT param,
// which is re-interpreted as "doAssert" for overlaps! What happened here?
// Overlap-tests all leaf boxes of one bucket against 'test' (a shape-vs-AABB
// functor). Boxes carry encoded min/max values along the sort axis in
// mData0/mData1, culled against the query's encoded [minLimitInt, maxLimitInt]
// range: boxes ending before the range are skipped, and the first box starting
// past it terminates the loop (boxes are sorted). With doAssert set (precise
// tests only) the culling decisions are cross-checked against the full test.
// Returns false if the callback aborted the query, true otherwise.
template<bool doAssert, typename Test>
static PX_FORCE_INLINE bool processBucket(	PxU32 nb, const BucketBox* PX_RESTRICT baseBoxes, const PrunerPayload* PX_RESTRICT baseObjects,
											const PxTransform* PX_RESTRICT baseTransforms,
											PxU32 offset, PxU32 totalAllocated,
											const Test& test, PrunerOverlapCallback& pcbArgName,
											PxU32 minLimitInt, PxU32 maxLimitInt)
{
	PX_UNUSED(totalAllocated);

	const BucketBox* PX_RESTRICT boxes = baseBoxes + offset;
	BucketPrunerOverlapAdapter pcb(pcbArgName, baseObjects + offset, baseTransforms + offset);

	for(PxU32 i=0;i<nb;i++)
	{
		const BucketBox& currentBox = *boxes++;

		// Box ends before the query range: cull.
		if(currentBox.mData1<minLimitInt)
		{
			if(doAssert)
				PX_ASSERT(!test(currentBox));
			continue;
		}

		// Box starts past the query range: all following boxes do too.
		if(currentBox.mData0>maxLimitInt)
		{
			if(doAssert)
				PX_ASSERT(!test(currentBox));
			return true;
		}

		if(test(currentBox))
		{
			// 'i' is the bucket-local index; the adapter holds the base offset.
			if(!pcb.invoke(i))
				return false;
		}
	}
	return true;
}
// Generic overlap traversal: walks free objects, then the 5x5x5 bucket
// hierarchy, applying 'test' (shape-vs-AABB functor) at every level.
// isPrecise enables the doAssert cross-checks inside processBucket (only valid
// for tests that exactly match the encoded-limit culling). cullBox is the
// query's AABB, used to derive the encoded limits along the sort axis.
template<typename Test, bool isPrecise>
class BucketPrunerOverlapTraversal
{
public:
	PX_FORCE_INLINE	BucketPrunerOverlapTraversal()	{}

	/*PX_FORCE_INLINE*/ bool operator()(const BucketPrunerCore& core, const Test& test, PrunerOverlapCallback& pcbArgName, const PxBounds3& cullBox) const
	{
#ifdef FREE_PRUNER_SIZE
		// Free (not yet classified) objects: brute-force test each one.
		BucketPrunerOverlapAdapter pcb(pcbArgName, core.mFreeObjects, core.mFreeTransforms);

		for(PxU32 i=0;i<core.mNbFree;i++)
		{
			if(test(core.mFreeBounds[i]))
			{
				if(!pcb.invoke(i))
					return false;
			}
		}
#endif
		const PxU32 nb = core.mSortedNb;
		if(!nb)
			return true;

#ifdef BRUTE_FORCE_LIMIT
		// NOTE(review): this disabled branch looks stale - it calls
		// pcb.invoke(dist, payload), which does not match
		// BucketPrunerOverlapAdapter::invoke(PxU32). Confirm BRUTE_FORCE_LIMIT
		// is never defined before re-enabling.
		if(nb<=BRUTE_FORCE_LIMIT)
		{
			for(PxU32 i=0;i<nb;i++)
			{
				if(test(core.mSortedWorldBoxes[i]))
				{
					PxReal dist = -1.0f;	// no distance for overlaps
					if(!pcb.invoke(dist, core.mSortedObjects[i]))
						return false;
				}
			}
			return true;
		}
#endif
		// Early-out if the query misses the whole structure.
		if(!test(core.mGlobalBox))
			return true;

		// Encode the query's extent along the sort axis for integer culling.
		const PxU32 sortAxis = core.mSortAxis;
		const float boxMinLimit = cullBox.minimum[sortAxis];
		const float boxMaxLimit = cullBox.maximum[sortAxis];

		const PxU32* binaryMinLimit = reinterpret_cast<const PxU32*>(&boxMinLimit);
		const PxU32* binaryMaxLimit = reinterpret_cast<const PxU32*>(&boxMaxLimit);
		const PxU32 rayMinLimitInt = encodeFloat(binaryMinLimit[0]);
		const PxU32 rayMaxLimitInt = encodeFloat(binaryMaxLimit[0]);

		// Walk the 3 bucket levels, descending only into non-empty buckets
		// whose bounds pass the test.
		for(PxU32 i=0;i<5;i++)
		{
			if(core.mLevel1.mCounters[i] && test(core.mLevel1.mBucketBox[i]))
			{
				for(PxU32 j=0;j<5;j++)
				{
					if(core.mLevel2[i].mCounters[j] && test(core.mLevel2[i].mBucketBox[j]))
					{
						for(PxU32 k=0;k<5;k++)
						{
							const PxU32 nbInBucket = core.mLevel3[i][j].mCounters[k];
							if(nbInBucket && test(core.mLevel3[i][j].mBucketBox[k]))
							{
								// Leaf range = accumulated offsets of all 3 levels.
								const PxU32 offset = core.mLevel1.mOffsets[i] + core.mLevel2[i].mOffsets[j] + core.mLevel3[i][j].mOffsets[k];
								if(!processBucket<isPrecise>(nbInBucket, core.mSortedWorldBoxes, core.mSortedObjects,
									core.mSortedTransforms,
									offset, core.mSortedNb, test, pcbArgName, rayMinLimitInt, rayMaxLimitInt))
									return false;
							}
						}
					}
				}
			}
		}
		return true;
	}
};
///////////////////////////////////////////////////////////////////////////////
#ifdef CAN_USE_MOVEMASK
// Returns 1 when the lowest three lanes of the boolean vector are all true,
// 0 otherwise (lane 3 is ignored).
PX_FORCE_INLINE PxU32 BAllTrue3_R(const BoolV a)
{
	const int lanes = _mm_movemask_ps(a) & 0x7;
	return lanes == 0x7 ? PxU32(1) : PxU32(0);
}
#endif
#ifdef USE_SIMD
// SIMD sphere-vs-AABB overlap test functor: overlaps iff the squared distance
// from the sphere center to its closest point on the box is <= radius^2.
struct SphereAABBTest_SIMD
{
	PX_FORCE_INLINE SphereAABBTest_SIMD(const Sphere& sphere) :
#ifdef CAN_USE_MOVEMASK
		mCenter	(V4LoadU(&sphere.center.x)),
#else
		mCenter	(V3LoadU(sphere.center)),
#endif
		mRadius2(FLoad(sphere.radius * sphere.radius))
	{}

	PX_FORCE_INLINE PxIntBool operator()(const BucketBox& box) const
	{
#ifdef CAN_USE_MOVEMASK
		const Vec4V boxCenter = AlignedLoad(&box.mCenter.x);
		const Vec4V boxExtents = AlignedLoad(&box.mExtents.x);
		//
		// Closest point on the box = center offset clamped to +/- extents;
		// compare its squared distance to the squared radius (lanes 0..2 only).
		const Vec4V offset = V4Sub(mCenter, boxCenter);
		const Vec4V closest = V4Clamp(offset, V4Neg(boxExtents), boxExtents);
		const Vec4V d = V4Sub(offset, closest);
		const FloatV dot = V4Dot3(d,d);
		return PxIntBool(BAllTrue3_R(FIsGrtrOrEq(mRadius2, dot)));
#else
		const Vec3V boxCenter = V3LoadU(box.mCenter);
		const Vec3V boxExtents = V3LoadU(box.mExtents);
		//
		// Same math in 3-lane form.
		const Vec3V offset = V3Sub(mCenter, boxCenter);
		const Vec3V closest = V3Clamp(offset, V3Neg(boxExtents), boxExtents);
		const Vec3V d = V3Sub(offset, closest);
		return PxIntBool(BAllEqTTTT(FIsGrtrOrEq(mRadius2, V3Dot(d, d))));
#endif
	}

	// PxBounds3 overload: convert to center/extents form and reuse the above.
	PX_FORCE_INLINE PxIntBool operator()(const PxBounds3& bounds) const
	{
		BucketBox tmp;
		tmp.mCenter = bounds.getCenter();
		tmp.mExtents = bounds.getExtents();
		return (*this)(tmp);
	}

	private:
	SphereAABBTest_SIMD& operator=(const SphereAABBTest_SIMD&);
#ifdef CAN_USE_MOVEMASK
	const Vec4V		mCenter;
#else
	const Vec3V		mCenter;
#endif
	const FloatV	mRadius2;
};
#else
// Scalar sphere-vs-AABB overlap test functor: overlaps iff the squared
// distance from the sphere center to the box is <= radius^2.
struct SphereAABBTest_Scalar
{
	PX_FORCE_INLINE SphereAABBTest_Scalar(const Sphere& sphere) :
		mCenter	(sphere.center),
		mRadius2(sphere.radius * sphere.radius)
	{}

	PX_FORCE_INLINE PxIntBool operator()(const BucketBox& box) const
	{
		const PxVec3 lo = box.getMin();
		const PxVec3 hi = box.getMax();

		// Accumulate the squared center-to-box distance axis by axis; an axis
		// contributes nothing when the center lies inside its slab.
		float dist2 = 0.0f;
		for(PxU32 axis=0; axis<3; axis++)
		{
			const float c = mCenter[axis];
			if(c < lo[axis])
			{
				const float delta = c - lo[axis];
				dist2 += delta*delta;
			}
			else if(c > hi[axis])
			{
				const float delta = c - hi[axis];
				dist2 += delta*delta;
			}
		}
		return dist2 <= mRadius2;
	}

	private:
	SphereAABBTest_Scalar& operator=(const SphereAABBTest_Scalar&);
	const PxVec3	mCenter;
	float			mRadius2;
};
#endif
#ifdef USE_SIMD
typedef SphereAABBTest_SIMD BucketPrunerSphereAABBTest;
#else
typedef SphereAABBTest_Scalar BucketPrunerSphereAABBTest;
#endif
///////////////////////////////////////////////////////////////////////////////
// AABB-vs-AABB overlap test functor.
// PT: we don't use PxBounds3::intersects() because isValid() asserts on our
// empty boxes!
struct BucketPrunerAABBAABBTest
{
	PX_FORCE_INLINE BucketPrunerAABBAABBTest(const PxBounds3& queryBox) : mBox(queryBox)	{}

	PX_FORCE_INLINE PxIntBool operator()(const BucketBox& box) const
	{
		// Overlap iff no axis separates the two boxes.
		const PxVec3 lo = box.getMin();
		const PxVec3 hi = box.getMax();
		const bool separatedX = mBox.minimum.x > hi.x || lo.x > mBox.maximum.x;
		const bool separatedY = mBox.minimum.y > hi.y || lo.y > mBox.maximum.y;
		const bool separatedZ = mBox.minimum.z > hi.z || lo.z > mBox.maximum.z;
		return !(separatedX || separatedY || separatedZ);
	}

	PX_FORCE_INLINE PxIntBool operator()(const PxBounds3& bounds) const
	{
		// Same separation test directly on min/max bounds.
		const PxVec3& lo = bounds.minimum;
		const PxVec3& hi = bounds.maximum;
		const bool separatedX = mBox.minimum.x > hi.x || lo.x > mBox.maximum.x;
		const bool separatedY = mBox.minimum.y > hi.y || lo.y > mBox.maximum.y;
		const bool separatedZ = mBox.minimum.z > hi.z || lo.z > mBox.maximum.z;
		return !(separatedX || separatedY || separatedZ);
	}

	private:
	BucketPrunerAABBAABBTest& operator=(const BucketPrunerAABBAABBTest&);
	const PxBounds3	mBox;
};
/*struct BucketPrunerAABBAABBTest_SIMD
{
PX_FORCE_INLINE BucketPrunerAABBAABBTest_SIMD(const PxBounds3& b)
: mCenter(V3LoadU(b.getCenter()))
, mExtents(V3LoadU(b.getExtents()))
{}
PX_FORCE_INLINE PxIntBool operator()(const BucketBox& box) const
{
return V3AllGrtrOrEq(V3Add(mExtents, AlignedLoad(&box.mExtents.x)), V3Abs(V3Sub(AlignedLoad(&box.mCenter.x), mCenter)));
}
private:
BucketPrunerAABBAABBTest_SIMD& operator=(const BucketPrunerAABBAABBTest_SIMD&);
const Vec3V mCenter, mExtents;
};*/
///////////////////////////////////////////////////////////////////////////////
#ifdef USE_SIMD
// SIMD OBB-vs-AABB overlap test functor using the separating-axis theorem,
// restricted to the 6 face axes (class I: AABB axes, class II: OBB axes).
// The 9 edge-cross axes are compiled out (see the trailing comment block), so
// the test is conservative: it can report overlap for some separated pairs.
struct OBBAABBTest_SIMD
{
	OBBAABBTest_SIMD(const PxMat33& rotation, const PxVec3& translation, const PxVec3& extents)
	{
		const Vec3V eps = V3Load(1e-6f);

		mT = V3LoadU(translation);
		mExtents = V3LoadU(extents);

		// storing the transpose matrices yields a simpler SIMD test
		mRT = Mat33V_From_PxMat33(rotation.getTranspose());
		// Absolute rotation padded by epsilon - prevents false negatives from
		// floating-point inaccuracies when edges are near-parallel.
		mART = Mat33V(V3Add(V3Abs(mRT.col0), eps), V3Add(V3Abs(mRT.col1), eps), V3Add(V3Abs(mRT.col2), eps));
		// Projection of the OBB extents onto the coordinate axes.
		mBB_xyz = M33TrnspsMulV3(mART, mExtents);

/*		if(fullTest)
		{
			const Vec3V eYZX = V3PermYZX(mExtents), eZXY = V3PermZXY(mExtents);
			mBB_123 = V3MulAdd(eYZX, V3PermZXY(mART.col0), V3Mul(eZXY, V3PermYZX(mART.col0)));
			mBB_456 = V3MulAdd(eYZX, V3PermZXY(mART.col1), V3Mul(eZXY, V3PermYZX(mART.col1)));
			mBB_789 = V3MulAdd(eYZX, V3PermZXY(mART.col2), V3Mul(eZXY, V3PermYZX(mART.col2)));
		}*/
	}

	PX_FORCE_INLINE PxIntBool operator()(const BucketBox& box) const	
	{
		const Vec3V extentsV = V3LoadU(box.mExtents);

		const Vec3V t = V3Sub(mT, V3LoadU(box.mCenter));

		// class I - axes of AABB
		if(V3OutOfBounds(t, V3Add(extentsV, mBB_xyz)))
			return PxIntFalse;

		const Vec3V rX = mRT.col0, rY = mRT.col1, rZ = mRT.col2;
		const Vec3V arX = mART.col0, arY = mART.col1, arZ = mART.col2;

		const FloatV eX = V3GetX(extentsV), eY = V3GetY(extentsV), eZ = V3GetZ(extentsV);
		const FloatV tX = V3GetX(t), tY = V3GetY(t), tZ = V3GetZ(t);

		// class II - axes of OBB
		{
			// v = t expressed in OBB space, v2 = OBB extents + AABB extents
			// projected onto the OBB axes.
			const Vec3V v = V3ScaleAdd(rZ, tZ, V3ScaleAdd(rY, tY, V3Scale(rX, tX)));
			const Vec3V v2 = V3ScaleAdd(arZ, eZ, V3ScaleAdd(arY, eY, V3ScaleAdd(arX, eX, mExtents)));
			if(V3OutOfBounds(v, v2))
				return PxIntFalse;
		}

//		if(!fullTest)
			return PxIntTrue;

/*		// class III - edge cross products. Almost all OBB tests early-out with type I or type II,
		// so early-outs here probably aren't useful (TODO: profile)
		const Vec3V va = V3NegScaleSub(rZ, tY, V3Scale(rY, tZ));
		const Vec3V va2 = V3ScaleAdd(arY, eZ, V3ScaleAdd(arZ, eY, mBB_123));
		const BoolV ba = BOr(V3IsGrtr(va, va2), V3IsGrtr(V3Neg(va2), va));
	
		const Vec3V vb = V3NegScaleSub(rX, tZ, V3Scale(rZ, tX));
		const Vec3V vb2 = V3ScaleAdd(arX, eZ, V3ScaleAdd(arZ, eX, mBB_456));
		const BoolV bb = BOr(V3IsGrtr(vb, vb2), V3IsGrtr(V3Neg(vb2), vb));
	
		const Vec3V vc = V3NegScaleSub(rY, tX, V3Scale(rX, tY));
		const Vec3V vc2 = V3ScaleAdd(arX, eY, V3ScaleAdd(arY, eX, mBB_789));
		const BoolV bc = BOr(V3IsGrtr(vc, vc2), V3IsGrtr(V3Neg(vc2), vc));

		return BAllEq(BOr(ba, BOr(bb,bc)), BFFFF());*/
	}

	// PxBounds3 overload: convert to center/extents form and reuse the above.
	PX_FORCE_INLINE PxIntBool operator()(const PxBounds3& bounds) const	
	{
		BucketBox tmp;
		tmp.mCenter = bounds.getCenter();
		tmp.mExtents = bounds.getExtents();
		return (*this)(tmp);
	}
		
	Vec3V		mExtents;	// extents of OBB
	Vec3V		mT;			// translation of OBB
	Mat33V		mRT;		// transpose of rotation matrix of OBB
	Mat33V		mART;		// transpose of mRT, padded by epsilon
	Vec3V		mBB_xyz;	// extents of OBB along coordinate axes

/*	Vec3V		mBB_123;	// projections of extents onto edge-cross axes
	Vec3V		mBB_456;
	Vec3V		mBB_789;*/
};
#else
// Scalar fallback of the OBB-vs-AABB SAT test (same contract as the SIMD
// version): only the class I (AABB axes) and class II (OBB axes) separating
// axes are tested, so the result is conservative - possible false positives,
// no false negatives.
struct OBBAABBTest_Scalar
{
// rotation/translation place the OBB in world space; extents are half-sizes.
OBBAABBTest_Scalar(const PxMat33& rotation, const PxVec3& translation, const PxVec3& extents)
{
mR = rotation;
mT = translation;
mExtents = extents;
const PxVec3 eps(1e-6f);
mAR = PxMat33(mR[0].abs() + eps, mR[1].abs() + eps, mR[2].abs() + eps); // Epsilon prevents floating-point inaccuracies (strategy borrowed from RAPID)
mBB_xyz = mAR.transform(mExtents); // Precompute box-box data - Courtesy of Erwin de Vries
/* PxReal ex = mExtents.x, ey = mExtents.y, ez = mExtents.z;
mBB_1 = ey*mAR[2].x + ez*mAR[1].x; mBB_2 = ez*mAR[0].x + ex*mAR[2].x; mBB_3 = ex*mAR[1].x + ey*mAR[0].x;
mBB_4 = ey*mAR[2].y + ez*mAR[1].y; mBB_5 = ez*mAR[0].y + ex*mAR[2].y; mBB_6 = ex*mAR[1].y + ey*mAR[0].y;
mBB_7 = ey*mAR[2].z + ez*mAR[1].z; mBB_8 = ez*mAR[0].z + ex*mAR[2].z; mBB_9 = ex*mAR[1].z + ey*mAR[0].z;*/
}
// Returns PxIntTrue when 'box' potentially overlaps the OBB (conservative).
PX_FORCE_INLINE PxIntBool operator()(const BucketBox& box) const
{
const PxVec3& c = box.mCenter;
const PxVec3& e = box.mExtents;
// OBB center expressed relative to the AABB center
const PxVec3 T = mT - c;
// Class I : A's basis vectors
if(PxAbs(T.x) > e.x + mBB_xyz.x) return PxIntFalse;
if(PxAbs(T.y) > e.y + mBB_xyz.y) return PxIntFalse;
if(PxAbs(T.z) > e.z + mBB_xyz.z) return PxIntFalse;
// Class II : B's basis vectors
if(PxAbs(T.dot(mR[0])) > e.dot(mAR[0]) + mExtents.x) return PxIntFalse;
if(PxAbs(T.dot(mR[1])) > e.dot(mAR[1]) + mExtents.y) return PxIntFalse;
if(PxAbs(T.dot(mR[2])) > e.dot(mAR[2]) + mExtents.z) return PxIntFalse;
// Class III : 9 cross products
// NOTE(review): dead code - 'if(0)' disables the edge-cross axes, and the
// mBB_1..mBB_9 members it reads are never initialized (their setup is
// commented out in the constructor). Both must be restored together if the
// full test is ever re-enabled.
if(0)
{
if(PxAbs(T.z*mR[0].y - T.y*mR[0].z) > e.y*mAR[0].z + e.z*mAR[0].y + mBB_1) return PxIntFalse; // L = A0 x B0
if(PxAbs(T.z*mR[1].y - T.y*mR[1].z) > e.y*mAR[1].z + e.z*mAR[1].y + mBB_2) return PxIntFalse; // L = A0 x B1
if(PxAbs(T.z*mR[2].y - T.y*mR[2].z) > e.y*mAR[2].z + e.z*mAR[2].y + mBB_3) return PxIntFalse; // L = A0 x B2
if(PxAbs(T.x*mR[0].z - T.z*mR[0].x) > e.x*mAR[0].z + e.z*mAR[0].x + mBB_4) return PxIntFalse; // L = A1 x B0
if(PxAbs(T.x*mR[1].z - T.z*mR[1].x) > e.x*mAR[1].z + e.z*mAR[1].x + mBB_5) return PxIntFalse; // L = A1 x B1
if(PxAbs(T.x*mR[2].z - T.z*mR[2].x) > e.x*mAR[2].z + e.z*mAR[2].x + mBB_6) return PxIntFalse; // L = A1 x B2
if(PxAbs(T.y*mR[0].x - T.x*mR[0].y) > e.x*mAR[0].y + e.y*mAR[0].x + mBB_7) return PxIntFalse; // L = A2 x B0
if(PxAbs(T.y*mR[1].x - T.x*mR[1].y) > e.x*mAR[1].y + e.y*mAR[1].x + mBB_8) return PxIntFalse; // L = A2 x B1
if(PxAbs(T.y*mR[2].x - T.x*mR[2].y) > e.x*mAR[2].y + e.y*mAR[2].x + mBB_9) return PxIntFalse; // L = A2 x B2
}
return PxIntTrue;
}
private:
PxMat33 mR; // rotation matrix
PxMat33 mAR; // absolute rotation matrix
PxVec3 mT; // translation from obb space to model space
PxVec3 mExtents;
PxVec3 mBB_xyz;
float mBB_1, mBB_2, mBB_3;
float mBB_4, mBB_5, mBB_6;
float mBB_7, mBB_8, mBB_9;
};
#endif
// Select the OBB-vs-AABB overlap test implementation matching the build
// configuration; both variants share the same interface and conservative
// (class I + class II only) semantics.
#ifdef USE_SIMD
typedef OBBAABBTest_SIMD BucketPrunerOBBAABBTest;
#else
typedef OBBAABBTest_Scalar BucketPrunerOBBAABBTest;
#endif
///////////////////////////////////////////////////////////////////////////////
bool BucketPrunerCore::overlap(const ShapeData& queryVolume, PrunerOverlapCallback& pcb) const
{
PX_ASSERT(!mDirty);
bool again = true;
const PxBounds3& cullBox = queryVolume.getPrunerInflatedWorldAABB();
switch(queryVolume.getType())
{
case PxGeometryType::eBOX:
{
if(queryVolume.isOBB())
{
const BucketPrunerOverlapTraversal<BucketPrunerOBBAABBTest, false> overlap;
again = overlap(*this,
BucketPrunerOBBAABBTest(
queryVolume.getPrunerWorldRot33(), queryVolume.getPrunerWorldPos(),
queryVolume.getPrunerBoxGeomExtentsInflated()),
pcb, cullBox);
}
else
{
const BucketPrunerOverlapTraversal<BucketPrunerAABBAABBTest, true> overlap;
again = overlap(*this, BucketPrunerAABBAABBTest(cullBox), pcb, cullBox);
}
}
break;
case PxGeometryType::eCAPSULE:
{
const BucketPrunerOverlapTraversal<BucketPrunerOBBAABBTest, false> overlap;
again = overlap(*this,
BucketPrunerOBBAABBTest(
queryVolume.getPrunerWorldRot33(), queryVolume.getPrunerWorldPos(),
queryVolume.getPrunerBoxGeomExtentsInflated()),
pcb, cullBox);
}
break;
case PxGeometryType::eSPHERE:
{
const Sphere& sphere = queryVolume.getGuSphere();
const PxVec3 sphereExtents(sphere.radius);
const BucketPrunerOverlapTraversal<BucketPrunerSphereAABBTest, true> overlap;
again = overlap(*this, BucketPrunerSphereAABBTest(sphere), pcb, cullBox);
}
break;
case PxGeometryType::eCONVEXMESH:
{
const BucketPrunerOverlapTraversal<BucketPrunerOBBAABBTest, false> overlap;
again = overlap(*this,
BucketPrunerOBBAABBTest(
queryVolume.getPrunerWorldRot33(), queryVolume.getPrunerWorldPos(),
queryVolume.getPrunerBoxGeomExtentsInflated()),
pcb, cullBox);
}
break;
default:
PX_ALWAYS_ASSERT_MESSAGE("unsupported overlap query volume geometry type");
}
return again;
}
///////////////////////////////////////////////////////////////////////////////
// Computes the world-space AABB enclosing everything in the pruner: the global
// box around the bucketed objects plus, when FREE_PRUNER_SIZE is defined, the
// bounds of the small "free" array of objects not yet merged into the buckets.
void BucketPrunerCore::getGlobalBounds(PxBounds3& bounds) const
{
// PT: TODO: refactor with similar code above in the file
// Convert the center/extents representation to min/max form with SIMD.
// NOTE(review): V4LoadU reads 4 floats starting at a PxVec3 member - this
// assumes the surrounding BucketBox/PxBounds3 layout makes the extra read
// safe; confirm the struct layout before changing these types.
const Vec4V centerV = V4LoadU(&mGlobalBox.mCenter.x);
const Vec4V extentsV = V4LoadU(&mGlobalBox.mExtents.x);
Vec4V minV = V4Sub(centerV, extentsV);
Vec4V maxV = V4Add(centerV, extentsV);
#ifdef FREE_PRUNER_SIZE
PxU32 nbFree = mNbFree;
if(nbFree)
{
// Grow the box to include every free (unbucketed) object's bounds.
const PxBounds3* freeBounds = mFreeBounds;
while(nbFree--)
{
minV = V4Min(minV, V4LoadU(&freeBounds->minimum.x));
maxV = V4Max(maxV, V4LoadU(&freeBounds->maximum.x));
freeBounds++;
}
}
#endif
StoreBounds(bounds, minV, maxV);
}
///////////////////////////////////////////////////////////////////////////////
// Translates every cached position in the structure by -shift (scene origin
// shift). All duplicated data - free-array bounds/transforms, core boxes,
// sorted boxes, and the 5/5x5/5x5x5 bucket-box hierarchy - must move
// consistently, and sort-axis encodings must be recomputed after the move.
void BucketPrunerCore::shiftOrigin(const PxVec3& shift)
{
#ifdef FREE_PRUNER_SIZE
// Objects still sitting in the small "free" array.
for(PxU32 i=0;i<mNbFree;i++)
{
mFreeBounds[i].minimum -= shift;
mFreeBounds[i].maximum -= shift;
mFreeTransforms[i].p -= shift;
}
#endif
const PxU32 nb = mCoreNbObjects;
//if (nb)
{
mGlobalBox.mCenter -= shift;
#if PX_DEBUG
mGlobalBox.mDebugMin -= shift[mSortAxis];
#endif
// Re-encode the global box's sort-axis min/max after moving it.
encodeBoxMinMax(mGlobalBox, mSortAxis);
for(PxU32 i=0; i<nb; i++)
{
mCoreBoxes[i].minimum -= shift;
mCoreBoxes[i].maximum -= shift;
mCoreTransforms[i].p -= shift;
}
for(PxU32 i=0; i<mSortedNb; i++)
{
mSortedWorldBoxes[i].mCenter -= shift;
#if PX_DEBUG
mSortedWorldBoxes[i].mDebugMin -= shift[mSortAxis];
#endif
// Sorted boxes carry encoded min/max as well - refresh them too.
encodeBoxMinMax(mSortedWorldBoxes[i], mSortAxis);
mSortedTransforms[i].p -= shift;
}
// Bucket-box centers at all three hierarchy levels (extents are unaffected
// by a pure translation).
for(PxU32 i=0; i < 5; i++)
mLevel1.mBucketBox[i].mCenter -= shift;
for(PxU32 i=0; i < 5; i++)
for(PxU32 j=0; j < 5; j++)
mLevel2[i].mBucketBox[j].mCenter -= shift;
for(PxU32 i=0; i < 5; i++)
for(PxU32 j=0; j < 5; j++)
for(PxU32 k=0; k < 5; k++)
mLevel3[i][j].mBucketBox[k].mCenter -= shift;
}
}
///////////////////////////////////////////////////////////////////////////////
// Debug-render helper: draws a bucket box as a wireframe AABB.
static void visualize(PxRenderOutput& out, const BucketBox& bounds)
{
	const PxBounds3 aabb(bounds.getMin(), bounds.getMax());
	Cm::renderOutputDebugBox(out, aabb);
}
// Debug visualization of the bucket hierarchy: draws the global box, then the
// box of every non-empty bucket at each of the three levels, all in the given
// color and in world space (identity transform).
void BucketPrunerCore::visualize(PxRenderOutput& out, PxU32 color) const
{
	out << PxTransform(PxIdentity);
	out << color;

	::visualize(out, mGlobalBox);

	// Walk the 5 / 5x5 / 5x5x5 hierarchy, skipping empty buckets at each level.
	for(PxU32 a=0; a<5; a++)
	{
		if(mLevel1.mCounters[a])
		{
			::visualize(out, mLevel1.mBucketBox[a]);
			for(PxU32 b=0; b<5; b++)
			{
				if(mLevel2[a].mCounters[b])
				{
					::visualize(out, mLevel2[a].mBucketBox[b]);
					for(PxU32 c=0; c<5; c++)
					{
						if(mLevel3[a][b].mCounters[c])
							::visualize(out, mLevel3[a][b].mBucketBox[c]);
					}
				}
			}
		}
	}
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// The pruning pool is created with TRANSFORM_CACHE_GLOBAL, i.e. cached
// transforms are kept in world space.
BucketPruner::BucketPruner(PxU64 contextID) : mPool(contextID, TRANSFORM_CACHE_GLOBAL)
{
}
// Nothing to release explicitly: mPool and mCore are destroyed automatically.
BucketPruner::~BucketPruner()
{
}
// Hands the pool's current arrays (bounds, payloads, transforms) to the core
// and marks it dirty so a later build()/commit() re-processes them. Must be
// called after every operation that changes the pool's contents.
static PX_FORCE_INLINE void setExternalMemory(BucketPrunerCore& core, PruningPool& pool)
{
core.mDirty = true;
core.setExternalMemory(pool.getNbActiveObjects(), pool.getCurrentWorldBoxes(), pool.getObjects(), pool.getTransforms());
}
// Adds a batch of objects to the pool and refreshes the core's view of the
// pool memory (marking it dirty). Returns true when all objects were added.
bool BucketPruner::addObjects(PrunerHandle* results, const PxBounds3* bounds, const PrunerPayload* data, const PxTransform* transforms, PxU32 count, bool)
{
	if(count==0)
		return true;

	const PxU32 nbAdded = mPool.addObjects(results, bounds, data, transforms, count);

	::setExternalMemory(mCore, mPool);

	return nbAdded == count;
}
void BucketPruner::removeObjects(const PrunerHandle* handles, PxU32 count, PrunerPayloadRemovalCallback* removalCallback)
{
if(!count)
return;
for(PxU32 i=0;i<count;i++)
mPool.removeObject(handles[i], removalCallback);
::setExternalMemory(mCore, mPool);
}
void BucketPruner::updateObjects(const PrunerHandle* handles, PxU32 count, float inflation, const PxU32* boundsIndices, const PxBounds3* newBounds, const PxTransform32* newTransforms)
{
if(!count)
return;
if(handles && boundsIndices && newBounds)
mPool.updateAndInflateBounds(handles, boundsIndices, newBounds, newTransforms, count, inflation);
::setExternalMemory(mCore, mPool);
}
// Intentionally a no-op for the bucket pruner: object storage is owned by
// mPool and the core structure is refreshed from it via setExternalMemory().
void BucketPruner::purge()
{
}
// Rebuilds the core acceleration structure. Must be called after
// add/remove/update operations and before issuing queries - the query entry
// points below assert on (and early-out for) a dirty core.
void BucketPruner::commit()
{
mCore.build();
}
// Merging another pruning structure is intentionally unsupported here.
void BucketPruner::merge(const void*)
{
// merge not implemented for bucket pruner
}
// Forwards the scene-origin shift to the core (bounds, transforms, buckets).
void BucketPruner::shiftOrigin(const PxVec3& shift)
{
mCore.shiftOrigin(shift);
}
// Sweep query entry point. Asserts (debug) and refuses (release) to traverse
// a stale structure: commit() must have been called first.
bool BucketPruner::sweep(const ShapeData& queryVolume, const PxVec3& unitDir, PxReal& inOutDistance, PrunerRaycastCallback& pcb) const
{
	PX_ASSERT(!mCore.mDirty);
	// Traversing a dirty core may crash, so bail out conservatively.
	return mCore.mDirty ? true : mCore.sweep(queryVolume, unitDir, inOutDistance, pcb);
}
// Overlap query entry point. Asserts (debug) and refuses (release) to traverse
// a stale structure: commit() must have been called first.
bool BucketPruner::overlap(const ShapeData& queryVolume, PrunerOverlapCallback& pcb) const
{
	PX_ASSERT(!mCore.mDirty);
	// Traversing a dirty core may crash, so bail out conservatively.
	return mCore.mDirty ? true : mCore.overlap(queryVolume, pcb);
}
// Raycast query entry point. Asserts (debug) and refuses (release) to traverse
// a stale structure: commit() must have been called first.
bool BucketPruner::raycast(const PxVec3& origin, const PxVec3& unitDir, PxReal& inOutDistance, PrunerRaycastCallback& pcb) const
{
	PX_ASSERT(!mCore.mDirty);
	// Traversing a dirty core may crash, so bail out conservatively.
	return mCore.mDirty ? true : mCore.raycast(origin, unitDir, inOutDistance, pcb);
}
// Debug visualization: the bucket pruner only uses the primary color.
void BucketPruner::visualize(PxRenderOutput& out, PxU32 primaryColor, PxU32 /*secondaryColor*/) const
{
mCore.visualize(out, primaryColor);
}
// Returns the AABB enclosing all objects currently held by the pruner.
void BucketPruner::getGlobalBounds(PxBounds3& bounds) const
{
mCore.getGlobalBounds(bounds);
}
#define MBP_ALLOC(x) PX_ALLOC(x, "BucketPruner")
#define MBP_ALLOC_TMP(x) PX_ALLOC(x, "BucketPruner")
#define MBP_FREE(x) PX_FREE(x)
#define INVALID_ID 0xffffffff
#ifndef USE_REGULAR_HASH_MAP
// Returns true when the stored pair does NOT reference the given payload.
static PX_FORCE_INLINE bool differentPair(const BucketPrunerPair& p, const PrunerPayload& data)
{
	return !(p.mData == data);
}
///////////////////////////////////////////////////////////////////////////////
// Starts with no storage: all arrays are NULL and sizes zero. Memory is
// allocated lazily by the first addPair() (or explicitly by reserveMemory()).
BucketPrunerMap::BucketPrunerMap() :
mHashSize (0),
mMask (0),
mNbActivePairs (0),
mHashTable (NULL),
mNext (NULL),
mActivePairs (NULL),
mReservedMemory (0)
{
}
///////////////////////////////////////////////////////////////////////////////
// Releases all hash-map storage.
BucketPrunerMap::~BucketPrunerMap()
{
purge();
}
///////////////////////////////////////////////////////////////////////////////
// Frees all hash-map storage and resets the map to its freshly-constructed
// empty state (MBP_FREE nulls the pointers).
void BucketPrunerMap::purge()
{
	MBP_FREE(mNext);
	MBP_FREE(mActivePairs);
	MBP_FREE(mHashTable);

	mNbActivePairs	= 0;
	mMask			= 0;
	mHashSize		= 0;
}
///////////////////////////////////////////////////////////////////////////////
// Looks up the pair associated with 'payload'. Returns NULL when the payload
// is not stored (or nothing has been allocated yet).
const BucketPrunerPair* BucketPrunerMap::findPair(const PrunerPayload& payload) const
{
	if(!mHashTable)
		return NULL;	// no storage allocated yet => map is empty

	const BucketPrunerPair* PX_RESTRICT pairs = mActivePairs;
	const PxU32* PX_RESTRICT links = mNext;

	// Walk the collision chain of the payload's hash bucket.
	for(PxU32 index = mHashTable[PxComputeHash(payload) & mMask]; index!=INVALID_ID; index = links[index])
	{
		if(!differentPair(pairs[index], payload))
		{
			PX_ASSERT(index<mNbActivePairs);
			// Match => the pair is persistent.
			return &pairs[index];
		}
	}
	return NULL;
}
// Internal version saving hash computation
// Internal lookup that reuses an already-computed (and masked) hash value,
// saving the hash computation on hot paths.
PX_FORCE_INLINE BucketPrunerPair* BucketPrunerMap::findPair(const PrunerPayload& payload, PxU32 hashValue) const
{
	if(!mHashTable)
		return NULL;	// no storage allocated yet => map is empty

	BucketPrunerPair* PX_RESTRICT pairs = mActivePairs;
	const PxU32* PX_RESTRICT links = mNext;

	// Walk the collision chain of the given hash bucket.
	for(PxU32 index = mHashTable[hashValue]; index!=INVALID_ID; index = links[index])
	{
		if(!differentPair(pairs[index], payload))
		{
			PX_ASSERT(index<mNbActivePairs);
			// Match => the pair is persistent.
			return &pairs[index];
		}
	}
	return NULL;
}
///////////////////////////////////////////////////////////////////////////////
// Inserts (payload -> coreIndex, timeStamp). If the payload is already stored,
// the existing entry is returned unchanged. The table grows (power-of-two
// sizes) when the number of pairs reaches the table size, which forces a
// rehash via reallocPairs().
BucketPrunerPair* BucketPrunerMap::addPair(const PrunerPayload& payload, PxU32 coreIndex, PxU32 timeStamp)
{
PxU32 hashValue = PxComputeHash(payload) & mMask;
{
BucketPrunerPair* PX_RESTRICT p = findPair(payload, hashValue);
if(p)
{
PX_ASSERT(p->mCoreIndex==coreIndex);
PX_ASSERT(p->mTimeStamp==timeStamp);
return p; // Persistent pair
}
}
// This is a new pair
if(mNbActivePairs >= mHashSize)
{
// Get more entries
mHashSize = PxNextPowerOfTwo(mNbActivePairs+1);
mMask = mHashSize-1;
reallocPairs();
// Recompute hash value with new hash size
hashValue = PxComputeHash(payload) & mMask; // ### redundant hash computation here?
}
// Append the new pair to the dense array and link it at the head of its
// bucket's collision chain.
BucketPrunerPair* PX_RESTRICT p = &mActivePairs[mNbActivePairs];
p->mData = payload;
p->mCoreIndex = coreIndex;
p->mTimeStamp = timeStamp;
mNext[mNbActivePairs] = mHashTable[hashValue];
mHashTable[hashValue] = mNbActivePairs++;
return p;
}
///////////////////////////////////////////////////////////////////////////////
// Removes the pair at 'pairIndex': first unlinks it from its hash chain, then
// keeps the pair array dense by moving the last pair into the freed slot and
// relinking that moved pair in its own chain. 'hashValue' is the (masked)
// bucket of the pair being removed; the caller resolved it already.
void BucketPrunerMap::removePairInternal(const PrunerPayload& /*payload*/, PxU32 hashValue, PxU32 pairIndex)
{
// Walk the hash table to fix mNext
{
PxU32 offset = mHashTable[hashValue];
PX_ASSERT(offset!=INVALID_ID);
PxU32 previous=INVALID_ID;
while(offset!=pairIndex)
{
previous = offset;
offset = mNext[offset];
}
// Let us go/jump us
if(previous!=INVALID_ID)
{
PX_ASSERT(mNext[previous]==pairIndex);
mNext[previous] = mNext[pairIndex];
}
// else we were the first
else mHashTable[hashValue] = mNext[pairIndex];
// we're now free to reuse mNext[pairIndex] without breaking the list
}
#if PX_DEBUG
mNext[pairIndex]=INVALID_ID;
#endif
// Invalidate entry
// Fill holes
if(1)
{
// 1) Remove last pair
const PxU32 lastPairIndex = mNbActivePairs-1;
if(lastPairIndex==pairIndex)
{
// Removing the last pair: just shrink the count, no move needed.
mNbActivePairs--;
}
else
{
// Unlink the last pair from its own chain before moving it.
const BucketPrunerPair* last = &mActivePairs[lastPairIndex];
const PxU32 lastHashValue = PxComputeHash(last->mData) & mMask;
// Walk the hash table to fix mNext
PxU32 offset = mHashTable[lastHashValue];
PX_ASSERT(offset!=INVALID_ID);
PxU32 previous=INVALID_ID;
while(offset!=lastPairIndex)
{
previous = offset;
offset = mNext[offset];
}
// Let us go/jump us
if(previous!=INVALID_ID)
{
PX_ASSERT(mNext[previous]==lastPairIndex);
mNext[previous] = mNext[lastPairIndex];
}
// else we were the first
else mHashTable[lastHashValue] = mNext[lastPairIndex];
// we're now free to reuse mNext[lastPairIndex] without breaking the list
#if PX_DEBUG
mNext[lastPairIndex]=INVALID_ID;
#endif
// Don't invalidate entry since we're going to shrink the array
// 2) Re-insert in free slot
// Move the last pair into the freed slot and link it at the head of its chain.
mActivePairs[pairIndex] = mActivePairs[lastPairIndex];
#if PX_DEBUG
PX_ASSERT(mNext[pairIndex]==INVALID_ID);
#endif
mNext[pairIndex] = mHashTable[lastHashValue];
mHashTable[lastHashValue] = pairIndex;
mNbActivePairs--;
}
}
}
///////////////////////////////////////////////////////////////////////////////
bool BucketPrunerMap::removePair(const PrunerPayload& payload, PxU32& coreIndex, PxU32& timeStamp)
{
const PxU32 hashValue = PxComputeHash(payload) & mMask;
const BucketPrunerPair* p = findPair(payload, hashValue);
if(!p)
return false;
PX_ASSERT(p->mData==payload);
coreIndex = p->mCoreIndex;
timeStamp = p->mTimeStamp;
removePairInternal(payload, hashValue, getPairIndex(p));
shrinkMemory();
return true;
}
///////////////////////////////////////////////////////////////////////////////
void BucketPrunerMap::shrinkMemory()
{
// Check correct memory against actually used memory
const PxU32 correctHashSize = PxNextPowerOfTwo(mNbActivePairs);
if(mHashSize==correctHashSize)
return;
if(mReservedMemory && correctHashSize < mReservedMemory)
return;
// Reduce memory used
mHashSize = correctHashSize;
mMask = mHashSize-1;
reallocPairs();
}
///////////////////////////////////////////////////////////////////////////////
// Fills 'nb' consecutive dwords starting at 'dest' with 'value'.
static PX_FORCE_INLINE void storeDwords(PxU32* dest, PxU32 nb, PxU32 value)
{
	for(PxU32 i=0; i<nb; i++)
		dest[i] = value;
}
// Re-allocates the hash table, pair array and chain array to the current
// mHashSize, then re-inserts every active pair: the mask changed, so each
// pair's bucket may differ from before.
// NOTE(review): allocations are only checked with asserts; a failed MBP_ALLOC
// is not handled - consistent with the rest of this file's allocation style.
void BucketPrunerMap::reallocPairs()
{
MBP_FREE(mHashTable);
mHashTable = reinterpret_cast<PxU32*>(MBP_ALLOC(mHashSize*sizeof(PxU32)));
// All buckets start empty.
storeDwords(mHashTable, mHashSize, INVALID_ID);
// Get some bytes for new entries
BucketPrunerPair* newPairs = reinterpret_cast<BucketPrunerPair*>(MBP_ALLOC(mHashSize * sizeof(BucketPrunerPair)));
PX_ASSERT(newPairs);
PxU32* newNext = reinterpret_cast<PxU32*>(MBP_ALLOC(mHashSize * sizeof(PxU32)));
PX_ASSERT(newNext);
// Copy old data if needed
if(mNbActivePairs)
PxMemCopy(newPairs, mActivePairs, mNbActivePairs*sizeof(BucketPrunerPair));
// ### check it's actually needed... probably only for pairs whose hash value was cut by the and
// yeah, since hash(id0, id1) is a constant
// However it might not be needed to recompute them => only less efficient but still ok
// Rebuild every collision chain under the new mask.
for(PxU32 i=0;i<mNbActivePairs;i++)
{
const PxU32 hashValue = PxComputeHash(mActivePairs[i].mData) & mMask; // New hash value with new mask
newNext[i] = mHashTable[hashValue];
mHashTable[hashValue] = i;
}
// Delete old data
MBP_FREE(mNext);
MBP_FREE(mActivePairs);
// Assign new pointer
mActivePairs = newPairs;
mNext = newNext;
}
///////////////////////////////////////////////////////////////////////////////
// Pre-allocates the hash map for 'memSize' entries (rounded up to a power of
// two) and records that size as a floor below which shrinkMemory() won't go.
void BucketPrunerMap::reserveMemory(PxU32 memSize)
{
	if(memSize==0)
		return;

	// Table sizes must be powers of two so (hash & mMask) is a valid modulo.
	const PxU32 size = PxIsPowerOfTwo(memSize) ? memSize : PxNextPowerOfTwo(memSize);

	mHashSize = size;
	mMask = size-1;
	mReservedMemory = size;
	reallocPairs();
}
///////////////////////////////////////////////////////////////////////////////
#endif