1508 lines
46 KiB
C++
1508 lines
46 KiB
C++
// Redistribution and use in source and binary forms, with or without
|
|
// modification, are permitted provided that the following conditions
|
|
// are met:
|
|
// * Redistributions of source code must retain the above copyright
|
|
// notice, this list of conditions and the following disclaimer.
|
|
// * Redistributions in binary form must reproduce the above copyright
|
|
// notice, this list of conditions and the following disclaimer in the
|
|
// documentation and/or other materials provided with the distribution.
|
|
// * Neither the name of NVIDIA CORPORATION nor the names of its
|
|
// contributors may be used to endorse or promote products derived
|
|
// from this software without specific prior written permission.
|
|
//
|
|
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ''AS IS'' AND ANY
|
|
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
|
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
|
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
//
|
|
// Copyright (c) 2008-2025 NVIDIA Corporation. All rights reserved.
|
|
|
|
#include "foundation/PxAssert.h"
|
|
#include "foundation/PxAtomic.h"
|
|
#include "foundation/PxErrorCallback.h"
|
|
#include "foundation/PxMath.h"
|
|
#include "foundation/PxPreprocessor.h"
|
|
#include "foundation/PxMutex.h"
|
|
#include "foundation/PxThread.h"
|
|
#include "foundation/PxUserAllocated.h"
|
|
#include "foundation/PxString.h"
|
|
#include "foundation/PxAlloca.h"
|
|
#include "foundation/PxArray.h"
|
|
|
|
#include "PhysXDeviceSettings.h"
|
|
|
|
// from the point of view of this source file the GPU library is linked statically
|
|
#ifndef PX_PHYSX_GPU_STATIC
|
|
#define PX_PHYSX_GPU_STATIC
|
|
#endif
|
|
#include "PxPhysXGpu.h"
|
|
|
|
#if PX_LINUX && PX_CLANG
|
|
#pragma clang diagnostic push
|
|
#pragma clang diagnostic ignored "-Wdocumentation"
|
|
#pragma clang diagnostic ignored "-Wdisabled-macro-expansion"
|
|
#endif
|
|
#include <cuda.h>
|
|
#if PX_LINUX && PX_CLANG
|
|
#pragma clang diagnostic pop
|
|
#endif
|
|
|
|
#include "cudamanager/PxCudaContextManager.h"
|
|
#include "cudamanager/PxCudaContext.h"
|
|
|
|
#if PX_WIN32 || PX_WIN64
|
|
|
|
// Enable/disable NVIDIA secure load library code
|
|
#define SECURE_LOAD_LIBRARY !PX_PUBLIC_RELEASE
|
|
|
|
#include "foundation/windows/PxWindowsInclude.h"
|
|
|
|
|
|
class IDirect3DDevice9;
|
|
class IDirect3DResource9;
|
|
class IDirect3DVertexBuffer9;
|
|
#include <cudad3d9.h>
|
|
|
|
class IDXGIAdapter;
|
|
class ID3D10Device;
|
|
class ID3D10Resource;
|
|
#include <cudad3d10.h>
|
|
|
|
struct ID3D11Device;
|
|
struct ID3D11Resource;
|
|
#include <cudad3d11.h>
|
|
|
|
#endif // PX_WIN32 || PX_WIN64
|
|
|
|
#if PX_LINUX
|
|
#include <dlfcn.h>
|
|
static void* GetProcAddress(void* handle, const char* name) { return dlsym(handle, name); }
|
|
#endif
|
|
|
|
// Defining these instead of including gl.h eliminates a dependency
|
|
typedef unsigned int GLenum;
|
|
typedef unsigned int GLuint;
|
|
|
|
//#include <GL/gl.h>
|
|
#include <cudaGL.h>
|
|
#include <assert.h>
|
|
|
|
#include "foundation/PxErrors.h"
|
|
#include "foundation/PxErrorCallback.h"
|
|
#include "common/PxPhysXCommonConfig.h"
|
|
|
|
namespace physx
|
|
{
|
|
|
|
#if PX_VC
|
|
#pragma warning(disable: 4191) //'operator/operation' : unsafe conversion from 'type of expression' to 'type required'
|
|
#endif
|
|
|
|
// CUDA toolkit definitions
|
|
// Update the definitions when the Cuda toolkit changes
|
|
// Refer to https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html
|
|
#define MIN_CUDA_VERSION 12000 // Use Cuda toolkit 12.0 and above
|
|
#define NV_DRIVER_MAJOR_VERSION 527
|
|
#define NV_DRIVER_MINOR_VERSION 41
|
|
#define MIN_SM_MAJOR_VERSION 7
|
|
#define MIN_SM_MINOR_VERSION 0
|
|
|
|
#define USE_DEFAULT_CUDA_STREAM 0
|
|
#define FORCE_LAUNCH_SYNCHRONOUS 0
|
|
//PX_STOMP_ALLOCATED_MEMORY is defined in common/PxPhysXCommonConfig.h
|
|
|
|
|
|
#if PX_DEBUG
|
|
#include "PxgMemoryTracker.h"
|
|
static MemTracker mMemTracker;
|
|
#endif
|
|
|
|
PxCudaContext* createCudaContext(CUdevice device, PxDeviceAllocatorCallback* callback, bool launchSynchronous);
|
|
|
|
// Concrete PxCudaContextManager: owns (or wraps) a CUDA driver context, loads
// the PhysX GPU kernel modules, and caches device attributes at construction
// time so they can be queried later without acquiring the context.
class CudaCtxMgr : public PxCudaContextManager, public PxUserAllocated
{
public:
	// desc.ctx == NULL means the manager creates (and owns) its own context
	// on the requested/suggested device ordinal.
	CudaCtxMgr(const PxCudaContextManagerDesc& desc, PxErrorCallback& errorCallback, bool launchSynchronous);
	virtual ~CudaCtxMgr();

	// Manually loads the CUDA driver library and verifies its version before
	// delay-import binding; returns false if the driver is missing or too old.
	bool safeDelayImport(PxErrorCallback& errorCallback);

	// Per-thread, refcounted push/pop of the CUDA context (see the TLS
	// counter mContextRefCountTls below).
	virtual void acquireContext() PX_OVERRIDE;
	virtual void releaseContext() PX_OVERRIDE;
	virtual bool tryAcquireContext() PX_OVERRIDE;

	/* All these methods can be called without acquiring the context */
	virtual bool contextIsValid() const PX_OVERRIDE;
	virtual bool supportsArchSM10() const PX_OVERRIDE; // G80
	virtual bool supportsArchSM11() const PX_OVERRIDE; // G92
	virtual bool supportsArchSM12() const PX_OVERRIDE;
	virtual bool supportsArchSM13() const PX_OVERRIDE; // GT200
	virtual bool supportsArchSM20() const PX_OVERRIDE; // GF100
	virtual bool supportsArchSM30() const PX_OVERRIDE; // GK100
	virtual bool supportsArchSM35() const PX_OVERRIDE; // GK110
	virtual bool supportsArchSM50() const PX_OVERRIDE; // GM100
	virtual bool supportsArchSM52() const PX_OVERRIDE; // GM200
	virtual bool supportsArchSM60() const PX_OVERRIDE; // GP100
	virtual bool isIntegrated() const PX_OVERRIDE; // true if GPU is integrated (MCP) part
	virtual bool canMapHostMemory() const PX_OVERRIDE; // true if GPU map host memory to GPU
	virtual int getDriverVersion() const PX_OVERRIDE;
	virtual size_t getDeviceTotalMemBytes() const PX_OVERRIDE;
	virtual int getMultiprocessorCount() const PX_OVERRIDE;
	virtual int getSharedMemPerBlock() const PX_OVERRIDE;
	virtual int getSharedMemPerMultiprocessor() const PX_OVERRIDE;
	virtual unsigned int getMaxThreadsPerBlock() const PX_OVERRIDE;
	virtual unsigned int getClockRate() const PX_OVERRIDE;

	virtual const char* getDeviceName() const PX_OVERRIDE;
	virtual CUdevice getDevice() const PX_OVERRIDE;

	virtual void setUsingConcurrentStreams(bool) PX_OVERRIDE;
	virtual bool getUsingConcurrentStreams() const PX_OVERRIDE;

	virtual void getDeviceMemoryInfo(size_t& free, size_t& total) const PX_OVERRIDE;

	virtual void release() PX_OVERRIDE;

	virtual CUcontext getContext() PX_OVERRIDE { return mCtx; }

	virtual PxCudaContext* getCudaContext() PX_OVERRIDE { return mCudaCtx; }

	// Kernel modules loaded in the constructor; entries may be NULL when a
	// module failed to load for this device.
	CUmodule* getCuModules() PX_OVERRIDE { return mCuModules.begin(); }

	virtual CUdeviceptr getMappedDevicePtr(void* pinnedHostBuffer) PX_OVERRIDE;

protected:
	virtual void* allocDeviceBufferInternal(PxU64 numBytes, const char* filename, PxI32 line) PX_OVERRIDE;
	virtual void* allocPinnedHostBufferInternal(PxU64 numBytes, const char* filename, PxI32 line) PX_OVERRIDE;

	virtual void freeDeviceBufferInternal(void* deviceBuffer) PX_OVERRIDE;
	virtual void freePinnedHostBufferInternal(void* pinnedHostBuffer) PX_OVERRIDE;

	virtual void clearDeviceBufferAsyncInternal(void* deviceBuffer, PxU32 numBytes, CUstream stream, PxI32 value) PX_OVERRIDE;

	virtual void copyDToHAsyncInternal(void* hostBuffer, const void* deviceBuffer, PxU32 numBytes, CUstream stream) PX_OVERRIDE;
	virtual void copyHToDAsyncInternal(void* deviceBuffer, const void* hostBuffer, PxU32 numBytes, CUstream stream) PX_OVERRIDE;
	virtual void copyDToDAsyncInternal(void* dstDeviceBuffer, const void* srcDeviceBuffer, PxU32 numBytes, CUstream stream) PX_OVERRIDE;

	virtual void copyDToHInternal(void* hostBuffer, const void* deviceBuffer, PxU32 numBytes) PX_OVERRIDE;
	virtual void copyHToDInternal(void* deviceBuffer, const void* hostBuffer, PxU32 numBytes) PX_OVERRIDE;

	virtual void memsetD8AsyncInternal(void* dstDeviceBuffer, const PxU8& value, PxU32 numBytes, CUstream stream) PX_OVERRIDE;
	virtual void memsetD32AsyncInternal(void* dstDeviceBuffer, const PxU32& value, PxU32 numIntegers, CUstream stream) PX_OVERRIDE;

private:

	PxArray<CUmodule> mCuModules;

	bool mIsValid;     // set to true only at the very end of a successful construction
	bool mOwnContext;  // true when mCtx was created here and must be destroyed here
	CUdevice mDevHandle;
	CUcontext mCtx;
	PxCudaContext* mCudaCtx;

	/* Cached device attributes, so threads can query w/o context */
	int mComputeCapMajor;
	int mComputeCapMinor;
	int mIsIntegrated;
	int mCanMapHost;
	int mDriverVersion;
	size_t mTotalMemBytes;
	int mMultiprocessorCount;
	int mMaxThreadsPerBlock;
	char mDeviceName[128];
	int mSharedMemPerBlock;
	int mSharedMemPerMultiprocessor;
	int mClockRate;
	bool mUsingConcurrentStreams;
	uint32_t mContextRefCountTls; // TLS slot holding the per-thread acquire refcount
#if PX_DEBUG
	volatile PxI32 mPushPopCount; // debug-only push/pop balance, asserted in the destructor
#endif
};
|
|
|
|
// Returns the device-side address of a pinned host allocation (0 on failure).
CUdeviceptr CudaCtxMgr::getMappedDevicePtr(void* pinnedHostBuffer)
{
	CUdeviceptr devicePtr = 0;

	const PxCUresult err = getCudaContext()->memHostGetDevicePointer(&devicePtr, pinnedHostBuffer, 0);
	if (err != CUDA_SUCCESS)
		PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "Getting mapped device pointer failed with error code %i!\n", PxI32(err));

	return devicePtr;
}
|
|
|
|
|
|
// Allocates device memory (at least 1 byte) under the context lock.
// Returns NULL on failure.
// In PX_DEBUG builds the allocation is recorded in the memory tracker with
// the caller's file/line.
void* CudaCtxMgr::allocDeviceBufferInternal(PxU64 numBytes, const char* filename, PxI32 lineNumber)
{
	// Zero-byte requests are rounded up so callers always get a real pointer.
	numBytes = PxMax(PxU64(1u), numBytes);
	PxScopedCudaLock lock(*this);
	// Initialize: the driver is not guaranteed to write the output pointer on
	// failure, and the original code returned the indeterminate value.
	CUdeviceptr ptr = 0;
	PxCUresult result = getCudaContext()->memAlloc(&ptr, numBytes);
	if (result != CUDA_SUCCESS)
	{
		PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "Mem allocation failed with error code %i!\n", PxI32(result));
		return NULL; // never hand back an uninitialized/garbage pointer
	}
	void* deviceBuffer = reinterpret_cast<void*>(ptr);
#if PX_DEBUG
	if (deviceBuffer)
		mMemTracker.registerMemory(deviceBuffer, true, numBytes, filename, lineNumber);
#else
	PX_UNUSED(filename);
	PX_UNUSED(lineNumber);
#endif
	return deviceBuffer;
}
|
|
// Allocates page-locked (pinned) host memory that is device-mapped and
// portable across contexts. Returns NULL on failure.
// In PX_DEBUG builds the allocation is recorded in the memory tracker.
void* CudaCtxMgr::allocPinnedHostBufferInternal(PxU64 numBytes, const char* filename, PxI32 lineNumber)
{
	// Zero-byte requests are rounded up so callers always get a real pointer.
	numBytes = PxMax(PxU64(1u), numBytes);
	PxScopedCudaLock lock(*this);
	// Initialize: memHostAlloc is not guaranteed to write the output on
	// failure; the original code could return (and register) a garbage pointer.
	void* pinnedHostBuffer = NULL;
	// Local mirrors of CU_MEMHOSTALLOC_DEVICEMAP / CU_MEMHOSTALLOC_PORTABLE.
	const unsigned int cuMemhostallocDevicemap = 0x02;
	const unsigned int cuMemhostallocPortable = 0x01;
	PxCUresult result = getCudaContext()->memHostAlloc(&pinnedHostBuffer, numBytes, cuMemhostallocDevicemap | cuMemhostallocPortable);
	if (result != CUDA_SUCCESS)
	{
		PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "Mem allocation failed with error code %i!\n", PxI32(result));
		return NULL; // do not register or return an indeterminate pointer
	}

#if PX_DEBUG
	mMemTracker.registerMemory(pinnedHostBuffer, false, numBytes, filename, lineNumber);
#else
	PX_UNUSED(filename);
	PX_UNUSED(lineNumber);
#endif
	return pinnedHostBuffer;
}
|
|
|
|
void CudaCtxMgr::freeDeviceBufferInternal(void* deviceBuffer)
|
|
{
|
|
if (!deviceBuffer)
|
|
return;
|
|
PxScopedCudaLock lock(*this);
|
|
PxCUresult result = getCudaContext()->memFree(CUdeviceptr(deviceBuffer));
|
|
if (result != CUDA_SUCCESS)
|
|
PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "Mem free failed with error code %i!\n", PxI32(result));
|
|
#if PX_DEBUG
|
|
mMemTracker.unregisterMemory(deviceBuffer, true);
|
|
#endif
|
|
}
|
|
void CudaCtxMgr::freePinnedHostBufferInternal(void* pinnedHostBuffer)
|
|
{
|
|
if (!pinnedHostBuffer)
|
|
return;
|
|
PxScopedCudaLock lock(*this);
|
|
PxCUresult result = getCudaContext()->memFreeHost(pinnedHostBuffer);
|
|
if (result != CUDA_SUCCESS)
|
|
PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "Mem free failed with error code %i!\n", PxI32(result));
|
|
#if PX_DEBUG
|
|
mMemTracker.unregisterMemory(pinnedHostBuffer, false);
|
|
#endif
|
|
}
|
|
|
|
void CudaCtxMgr::clearDeviceBufferAsyncInternal(void* deviceBuffer, PxU32 numBytes, CUstream stream, PxI32 value)
|
|
{
|
|
if (!deviceBuffer)
|
|
return;
|
|
PxScopedCudaLock lock(*this);
|
|
PX_ASSERT(numBytes % 4 == 0);
|
|
PxCUresult result = getCudaContext()->memsetD32Async(CUdeviceptr(deviceBuffer), value, numBytes >> 2, stream);
|
|
if (result != CUDA_SUCCESS)
|
|
PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "Mem set failed with error code %i!\n", PxI32(result));
|
|
}
|
|
|
|
void CudaCtxMgr::copyDToHAsyncInternal(void* hostBuffer, const void* deviceBuffer, PxU32 numBytes, CUstream stream)
|
|
{
|
|
if (!deviceBuffer || !hostBuffer)
|
|
return;
|
|
PxScopedCudaLock lock(*this);
|
|
PxCUresult result = getCudaContext()->memcpyDtoHAsync(hostBuffer, CUdeviceptr(deviceBuffer), numBytes, stream);
|
|
if (result != CUDA_SUCCESS)
|
|
PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "copyDtoHAsync set failed with error code %i!\n", PxI32(result));
|
|
}
|
|
void CudaCtxMgr::copyHToDAsyncInternal(void* deviceBuffer, const void* hostBuffer, PxU32 numBytes, CUstream stream)
|
|
{
|
|
if (!deviceBuffer || !hostBuffer)
|
|
return;
|
|
PxScopedCudaLock lock(*this);
|
|
PxCUresult result = getCudaContext()->memcpyHtoDAsync(CUdeviceptr(deviceBuffer), hostBuffer, numBytes, stream);
|
|
if (result != CUDA_SUCCESS)
|
|
PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "copyHtoDAsync set failed with error code %i!\n", PxI32(result));
|
|
}
|
|
void CudaCtxMgr::copyDToDAsyncInternal(void* dstDeviceBuffer, const void* srcDeviceBuffer, PxU32 numBytes, CUstream stream)
|
|
{
|
|
if (!srcDeviceBuffer || !dstDeviceBuffer)
|
|
return;
|
|
PxScopedCudaLock lock(*this);
|
|
PxCUresult result = getCudaContext()->memcpyDtoDAsync(CUdeviceptr(dstDeviceBuffer), CUdeviceptr(srcDeviceBuffer), numBytes, stream);
|
|
if (result != CUDA_SUCCESS)
|
|
PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "copyDtoDAsync set failed with error code %i!\n", PxI32(result));
|
|
}
|
|
|
|
void CudaCtxMgr::copyDToHInternal(void* hostBuffer, const void* deviceBuffer, PxU32 numBytes)
|
|
{
|
|
if (!deviceBuffer || !hostBuffer)
|
|
return;
|
|
PxScopedCudaLock lock(*this);
|
|
PxCUresult result = getCudaContext()->memcpyDtoH(hostBuffer, CUdeviceptr(deviceBuffer), numBytes);
|
|
if (result != CUDA_SUCCESS)
|
|
PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "copyDtoH set failed with error code %i!\n", PxI32(result));
|
|
}
|
|
void CudaCtxMgr::copyHToDInternal(void* deviceBuffer, const void* hostBuffer, PxU32 numBytes)
|
|
{
|
|
if (!deviceBuffer || !hostBuffer)
|
|
return;
|
|
PxScopedCudaLock lock(*this);
|
|
PxCUresult result = getCudaContext()->memcpyHtoD(CUdeviceptr(deviceBuffer), hostBuffer, numBytes);
|
|
if (result != CUDA_SUCCESS)
|
|
PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "copyHtoD set failed with error code %i!\n", PxI32(result));
|
|
}
|
|
|
|
void CudaCtxMgr::memsetD8AsyncInternal(void* dstDeviceBuffer, const PxU8& value, PxU32 numBytes, CUstream stream)
|
|
{
|
|
if (!dstDeviceBuffer)
|
|
return;
|
|
PxScopedCudaLock lock(*this);
|
|
PxCUresult result = getCudaContext()->memsetD8Async(CUdeviceptr(dstDeviceBuffer), value, numBytes, stream);
|
|
if (result != CUDA_SUCCESS)
|
|
PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "Memset failed with error code %i!\n", PxI32(result));
|
|
|
|
}
|
|
|
|
void CudaCtxMgr::memsetD32AsyncInternal(void* dstDeviceBuffer, const PxU32& value, PxU32 numIntegers, CUstream stream)
|
|
{
|
|
if (!dstDeviceBuffer)
|
|
return;
|
|
PxScopedCudaLock lock(*this);
|
|
PxCUresult result = getCudaContext()->memsetD32Async(CUdeviceptr(dstDeviceBuffer), value, numIntegers, stream);
|
|
if (result != CUDA_SUCCESS)
|
|
PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "Memset failed with error code %i!\n", PxI32(result));
|
|
}
|
|
|
|
|
|
bool CudaCtxMgr::contextIsValid() const
|
|
{
|
|
return mIsValid;
|
|
}
|
|
bool CudaCtxMgr::supportsArchSM10() const
|
|
{
|
|
return mIsValid;
|
|
}
|
|
bool CudaCtxMgr::supportsArchSM11() const
|
|
{
|
|
return mIsValid && (mComputeCapMinor >= 1 || mComputeCapMajor > 1);
|
|
}
|
|
bool CudaCtxMgr::supportsArchSM12() const
|
|
{
|
|
return mIsValid && (mComputeCapMinor >= 2 || mComputeCapMajor > 1);
|
|
}
|
|
bool CudaCtxMgr::supportsArchSM13() const
|
|
{
|
|
return mIsValid && (mComputeCapMinor >= 3 || mComputeCapMajor > 1);
|
|
}
|
|
bool CudaCtxMgr::supportsArchSM20() const
|
|
{
|
|
return mIsValid && mComputeCapMajor >= 2;
|
|
}
|
|
bool CudaCtxMgr::supportsArchSM30() const
|
|
{
|
|
return mIsValid && mComputeCapMajor >= 3;
|
|
}
|
|
bool CudaCtxMgr::supportsArchSM35() const
|
|
{
|
|
return mIsValid && ((mComputeCapMajor > 3) || (mComputeCapMajor == 3 && mComputeCapMinor >= 5));
|
|
}
|
|
bool CudaCtxMgr::supportsArchSM50() const
|
|
{
|
|
return mIsValid && mComputeCapMajor >= 5;
|
|
}
|
|
bool CudaCtxMgr::supportsArchSM52() const
|
|
{
|
|
return mIsValid && ((mComputeCapMajor > 5) || (mComputeCapMajor == 5 && mComputeCapMinor >= 2));
|
|
}
|
|
bool CudaCtxMgr::supportsArchSM60() const
|
|
{
|
|
return mIsValid && mComputeCapMajor >= 6;
|
|
}
|
|
|
|
bool CudaCtxMgr::isIntegrated() const
|
|
{
|
|
return mIsValid && mIsIntegrated;
|
|
}
|
|
bool CudaCtxMgr::canMapHostMemory() const
|
|
{
|
|
return mIsValid && mCanMapHost;
|
|
}
|
|
int CudaCtxMgr::getDriverVersion() const
|
|
{
|
|
return mDriverVersion;
|
|
}
|
|
// Total device memory cached at construction (cuDeviceTotalMem).
size_t CudaCtxMgr::getDeviceTotalMemBytes() const
{
	return mTotalMemBytes;
}
|
|
int CudaCtxMgr::getMultiprocessorCount() const
|
|
{
|
|
return mMultiprocessorCount;
|
|
}
|
|
int CudaCtxMgr::getSharedMemPerBlock() const
|
|
{
|
|
return mSharedMemPerBlock;
|
|
}
|
|
int CudaCtxMgr::getSharedMemPerMultiprocessor() const
|
|
{
|
|
return mSharedMemPerMultiprocessor;
|
|
}
|
|
unsigned int CudaCtxMgr::getMaxThreadsPerBlock() const
|
|
{
|
|
return (unsigned int)mMaxThreadsPerBlock;
|
|
}
|
|
unsigned int CudaCtxMgr::getClockRate() const
|
|
{
|
|
return (unsigned int)mClockRate;
|
|
}
|
|
|
|
const char* CudaCtxMgr::getDeviceName() const
|
|
{
|
|
if (mIsValid)
|
|
{
|
|
return mDeviceName;
|
|
}
|
|
else
|
|
{
|
|
return "Invalid";
|
|
}
|
|
}
|
|
|
|
// Device handle this manager was created on, or -1 when construction failed.
CUdevice CudaCtxMgr::getDevice() const
{
	return mIsValid ? mDevHandle : -1;
}
|
|
|
|
void CudaCtxMgr::setUsingConcurrentStreams(bool value)
|
|
{
|
|
mUsingConcurrentStreams = value;
|
|
}
|
|
|
|
bool CudaCtxMgr::getUsingConcurrentStreams() const
|
|
{
|
|
return mUsingConcurrentStreams;
|
|
}
|
|
|
|
// Queries free/total device memory via the CUDA driver.
// NOTE(review): cuMemGetInfo operates on the current context; this const
// method cannot take the scoped lock itself, so callers presumably hold the
// context — confirm call sites.
void CudaCtxMgr::getDeviceMemoryInfo(size_t& free, size_t& total) const
{
	const CUresult status = cuMemGetInfo(&free, &total);
	if (status != CUDA_SUCCESS)
	{
		// The original ignored the result, leaving the outputs unwritten on
		// failure; report deterministic zeros instead.
		free = 0;
		total = 0;
	}
}
|
|
|
|
#define CUT_SAFE_CALL(call) { CUresult ret = call; \
|
|
if( CUDA_SUCCESS != ret ) { PX_ASSERT(0); } }
|
|
|
|
/* If a context is not provided, an ordinal must be given */
// Construction sequence: verify driver via safeDelayImport, create or adopt a
// CUDA context, wrap it in a PxCudaContext, cache device attributes, then load
// the GPU kernel modules. Any failure reports an error and returns early,
// leaving mIsValid == false (callers must check contextIsValid()).
CudaCtxMgr::CudaCtxMgr(const PxCudaContextManagerDesc& desc, PxErrorCallback& errorCallback, bool launchSynchronous)
	: mOwnContext(false)
	, mCudaCtx(NULL)
	, mUsingConcurrentStreams(true)
#if PX_DEBUG
	, mPushPopCount(0)
#endif
{
	CUresult status;
	mIsValid = false;
	mDeviceName[0] = 0;

	// Verify the installed driver before any delay-imported CUDA call.
	if (safeDelayImport(errorCallback) == false)
	{
		char buffer[256];
		physx::Pxsnprintf(buffer, 256, "NVIDIA Release %u.%u graphics driver and above is required for GPU acceleration.", NV_DRIVER_MAJOR_VERSION, NV_DRIVER_MINOR_VERSION);
		errorCallback.reportError(PxErrorCode::eDEBUG_WARNING, buffer, PX_FL);
		return;
	}

	if (desc.ctx == 0)
	{
		// No context supplied: create our own on the requested (or suggested)
		// device ordinal.
		int flags = CU_CTX_LMEM_RESIZE_TO_MAX | CU_CTX_SCHED_BLOCKING_SYNC | CU_CTX_MAP_HOST;

		// Adapter forwarding device-selection errors to the caller's callback.
		class FoundationErrorReporter : public PxErrorCallback
		{
		public:
			FoundationErrorReporter(PxErrorCallback& ec)
				: errorCallback(&ec)
			{
			}

			virtual void reportError(PxErrorCode::Enum code, const char* message, const char* file, int line) PX_OVERRIDE
			{
				errorCallback->reportError(code, message, file, line);
			}

		private:
			PxErrorCallback* errorCallback;
		} foundationErrorReporter(errorCallback);

		// Negative ordinal means "pick a suitable device for me".
		int devOrdinal = desc.deviceOrdinal;
		if (desc.deviceOrdinal < 0)
		{
			devOrdinal = PhysXDeviceSettings::getSuggestedCudaDeviceOrdinal(foundationErrorReporter);
		}

		if (devOrdinal < 0)
		{
			errorCallback.reportError(PxErrorCode::eDEBUG_INFO, "No PhysX capable GPU suggested.", PX_FL);
			return;
		}

		status = cuInit(0);
		if (CUDA_SUCCESS != status)
		{
			char buffer[128];
			physx::Pxsnprintf(buffer, 128, "cuInit failed with error code %i", status);
			errorCallback.reportError(PxErrorCode::eDEBUG_WARNING, buffer, PX_FL);
			return;
		}

		{
			status = cuDeviceGet(&mDevHandle, devOrdinal);
			if (CUDA_SUCCESS != status)
			{
				errorCallback.reportError(PxErrorCode::eDEBUG_WARNING, "cuDeviceGet failed",__FILE__,__LINE__);
				return;
			}

			status = cuCtxCreate(&mCtx, (unsigned int)flags, mDevHandle);
			if (CUDA_SUCCESS != status)
			{
				const size_t bufferSize = 128;
				char errorMsg[bufferSize];
				physx::Pxsnprintf(errorMsg, bufferSize, "cuCtxCreate failed with error %i.", status);
				errorCallback.reportError(PxErrorCode::eDEBUG_WARNING, errorMsg, PX_FL);
				return;
			}
			// Only now do we own a context the destructor must destroy.
			mOwnContext = true;
		}
	}
	else
	{
		// Adopt the caller-provided context; it must be current so the device
		// handle can be queried from it.
		mCtx = *desc.ctx;
		status = cuCtxGetDevice(&mDevHandle);
		if (CUDA_SUCCESS != status)
		{
			errorCallback.reportError(PxErrorCode::eDEBUG_WARNING, "cuCtxGetDevice failed",__FILE__,__LINE__);
			return;
		}
	}

	// create cuda context wrapper
	mCudaCtx = createCudaContext(mDevHandle, desc.deviceAllocator, launchSynchronous);

	// Verify we can at least allocate a CUDA event from this context
	CUevent testEvent;
	if (CUDA_SUCCESS == mCudaCtx->eventCreate(&testEvent, 0))
	{
		mCudaCtx->eventDestroy(testEvent);
	}
	else
	{
		errorCallback.reportError(PxErrorCode::eDEBUG_WARNING, "CUDA context validation failed",__FILE__,__LINE__);
		return;
	}

	status = cuDeviceGetName(mDeviceName, sizeof(mDeviceName), mDevHandle);
	if (CUDA_SUCCESS != status)
	{
		errorCallback.reportError(PxErrorCode::eDEBUG_WARNING, "cuDeviceGetName failed",__FILE__,__LINE__);
		return;
	}

	// Cache device attributes so later queries need no context acquisition.
	cuDeviceGetAttribute(&mSharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, mDevHandle);
	cuDeviceGetAttribute(&mSharedMemPerMultiprocessor, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR, mDevHandle);
	cuDeviceGetAttribute(&mClockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, mDevHandle);
	cuDeviceGetAttribute(&mComputeCapMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, mDevHandle);
	cuDeviceGetAttribute(&mComputeCapMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, mDevHandle);
	cuDeviceGetAttribute(&mIsIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, mDevHandle);
	cuDeviceGetAttribute(&mCanMapHost, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, mDevHandle);
	cuDeviceGetAttribute(&mMultiprocessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, mDevHandle);
	cuDeviceGetAttribute(&mMaxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, mDevHandle);

	status = cuDeviceTotalMem((size_t*)&mTotalMemBytes, mDevHandle);
	if (CUDA_SUCCESS != status)
	{
		errorCallback.reportError(PxErrorCode::eDEBUG_WARNING, "cuDeviceTotalMem failed",__FILE__,__LINE__);
		return;
	}

	// minimum compute capability is MIN_SM_MAJOR_VERSION.MIN_SM_MINOR_VERSION
	if ((mComputeCapMajor < MIN_SM_MAJOR_VERSION) ||
		(mComputeCapMajor == MIN_SM_MAJOR_VERSION && mComputeCapMinor < MIN_SM_MINOR_VERSION))
	{
		char buffer[256];
		physx::Pxsnprintf(buffer, 256, "Minimum GPU compute capability %d.%d is required", MIN_SM_MAJOR_VERSION, MIN_SM_MINOR_VERSION);
		errorCallback.reportError(PxErrorCode::eDEBUG_WARNING,buffer,__FILE__,__LINE__);
		return;
	}

	// TLS slot for the per-thread acquire refcount; allocated only on the
	// success path, immediately before the manager is flagged valid.
	mContextRefCountTls = PxTlsAlloc();
	mIsValid = true;

	// Formally load the CUDA modules, get CUmodule handles
	{
		PxScopedCudaLock lock(*this);
		const PxU32 moduleTableSize = PxGpuGetCudaModuleTableSize();
		void** moduleTable = PxGpuGetCudaModuleTable();
		mCuModules.resize(moduleTableSize, NULL);
		for (PxU32 i = 0 ; i < moduleTableSize ; ++i)
		{
			CUresult ret = CUDA_ERROR_UNKNOWN;

			// Make sure that moduleTable[i] is not null
			if (moduleTable[i])
			{
				ret = mCudaCtx->moduleLoadDataEx(&mCuModules[i], moduleTable[i], 0, NULL, NULL);
			}

			// CUDA_ERROR_NO_BINARY_FOR_GPU is tolerated (module simply not
			// available for this architecture); anything else is reported and
			// the slot is left NULL.
			if (ret != CUDA_SUCCESS && ret != CUDA_ERROR_NO_BINARY_FOR_GPU)
			{
				const size_t bufferSize = 256;
				char errorMsg[bufferSize];
				physx::Pxsnprintf(errorMsg, bufferSize, "Failed to load CUDA module data. Cuda error code %i.\n", ret);

				PxGetErrorCallback()->reportError(PxErrorCode::eINTERNAL_ERROR, errorMsg, PX_FL);
				mCuModules[i] = NULL;
			}
		}
	}
}
|
|
|
|
/* Some driver version mismatches can cause delay import crashes. Load NVCUDA.dll
|
|
* manually, verify its version number, then allow delay importing to bind all the
|
|
* APIs.
|
|
*/
|
|
bool CudaCtxMgr::safeDelayImport(PxErrorCallback& errorCallback)
|
|
{
|
|
#if PX_WIN32 || PX_WIN64
|
|
HMODULE hCudaDriver = LoadLibrary("nvcuda.dll");
|
|
#elif PX_LINUX
|
|
void* hCudaDriver = dlopen("libcuda.so.1", RTLD_NOW);
|
|
#endif
|
|
if (!hCudaDriver)
|
|
{
|
|
errorCallback.reportError(PxErrorCode::eINTERNAL_ERROR, "nvcuda.dll not found or could not be loaded.", PX_FL);
|
|
return false;
|
|
}
|
|
|
|
typedef CUresult(CUDAAPI * pfnCuDriverGetVersion_t)(int*);
|
|
pfnCuDriverGetVersion_t pfnCuDriverGetVersion = (pfnCuDriverGetVersion_t) GetProcAddress(hCudaDriver, "cuDriverGetVersion");
|
|
if (!pfnCuDriverGetVersion)
|
|
{
|
|
errorCallback.reportError(PxErrorCode::eINTERNAL_ERROR, "cuDriverGetVersion missing in nvcuda.dll.", PX_FL);
|
|
return false;
|
|
}
|
|
|
|
#if PX_A64
|
|
CUresult status = cuDriverGetVersion(&mDriverVersion);
|
|
#else
|
|
CUresult status = pfnCuDriverGetVersion(&mDriverVersion);
|
|
#endif
|
|
|
|
if (status != CUDA_SUCCESS)
|
|
{
|
|
errorCallback.reportError(PxErrorCode::eINTERNAL_ERROR, "Retrieving CUDA driver version failed.", PX_FL);
|
|
return false;
|
|
}
|
|
|
|
// Check that the Cuda toolkit used meets the minimum version
|
|
// If the Cuda toolkit has changed, refer to https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html
|
|
// Make the necessary changes to Cuda toolkit definitions
|
|
PX_COMPILE_TIME_ASSERT(CUDA_VERSION >= MIN_CUDA_VERSION);
|
|
|
|
// Check whether Cuda driver version meets the min requirement
|
|
if (mDriverVersion < MIN_CUDA_VERSION)
|
|
{
|
|
char buffer[256];
|
|
physx::Pxsnprintf(buffer, 256, "CUDA driver version is %u, expected driver version is at least %u.", mDriverVersion, MIN_CUDA_VERSION);
|
|
errorCallback.reportError(PxErrorCode::eINTERNAL_ERROR, buffer, __FILE__,__LINE__);
|
|
return false;
|
|
}
|
|
|
|
/* Now trigger delay import and API binding */
|
|
status = cuDriverGetVersion(&mDriverVersion);
|
|
if (status != CUDA_SUCCESS)
|
|
{
|
|
errorCallback.reportError(PxErrorCode::eINTERNAL_ERROR, "Failed to bind CUDA API.", PX_FL);
|
|
return false;
|
|
}
|
|
|
|
/* Not strictly necessary, but good practice */
|
|
#if PX_WIN32 | PX_WIN64
|
|
FreeLibrary(hCudaDriver);
|
|
#elif PX_LINUX
|
|
dlclose(hCudaDriver);
|
|
#endif
|
|
|
|
return true;
|
|
}
|
|
|
|
// Self-destruction entry point for the reference-style release() API; the
// manager owns itself once created.
void CudaCtxMgr::release()
{
	PX_DELETE_THIS;
}
|
|
|
|
// Tears down in reverse construction order: unload kernel modules, release
// the context wrapper, destroy the context if we created it, and free the
// TLS refcount slot.
CudaCtxMgr::~CudaCtxMgr()
{
	if (mCudaCtx)
	{
		// unload CUDA modules
		{
			PxScopedCudaLock lock(*this);
			for(PxU32 i = 0; i < mCuModules.size(); i++)
			{
				CUresult ret = mCudaCtx->moduleUnload(mCuModules[i]);
				if(ret != CUDA_SUCCESS)
				{
					char msg[128];
					physx::Pxsnprintf(msg, 128, "Failed to unload CUDA module data, returned %i.", ret);
					PxGetErrorCallback()->reportError(PxErrorCode::eINTERNAL_ERROR, msg, PX_FL);
				}
			}
		}

		mCudaCtx->release();
		mCudaCtx = NULL;
	}

	if (mOwnContext)
	{
		CUT_SAFE_CALL(cuCtxDestroy(mCtx));
	}

	// The TLS slot is only allocated at the very end of a successful
	// construction (just before mIsValid is set), so guard the free: the
	// original unconditionally freed an uninitialized handle whenever
	// construction had failed early.
	if (mIsValid)
		PxTlsFree(mContextRefCountTls);

#if PX_DEBUG
	// Every acquire must have been matched by a release.
	PX_ASSERT(mPushPopCount == 0);
#endif
}
|
|
|
|
void CudaCtxMgr::acquireContext()
|
|
{
|
|
bool result = tryAcquireContext();
|
|
PX_ASSERT(result);
|
|
PX_UNUSED(result);
|
|
}
|
|
|
|
// Refcounted, per-thread context acquisition. Returns true when the context
// is (already or newly) current for this thread.
bool CudaCtxMgr::tryAcquireContext()
{
	// AD: we directly store the counter in the per-thread value (instead of using a pointer-to-value.)
	// Using size_t because we have a pointer's width to play with, so the type will potentially depend on the platform.
	// All the values are initialized to NULL at PxTlsAlloc() and for any newly created thread it will be NULL as well.
	// So even if a thread hits this code for the first time, we know it's zero, and then we start by placing the correct refcount
	// below in the set call.
	size_t refCount = PxTlsGetValue(mContextRefCountTls);

	CUresult result = CUDA_SUCCESS;

#if PX_DEBUG
	// Debug builds push on every acquire so the push/pop balance can be
	// asserted against mPushPopCount in the destructor.
	result = cuCtxPushCurrent(mCtx);
	PxAtomicIncrement(&mPushPopCount);
#else
	// Release builds only push on this thread's outermost acquire.
	if (refCount == 0)
	{
		result = cuCtxPushCurrent(mCtx);
	}
#endif
	// Store the incremented refcount back into the TLS slot.
	PxTlsSetValue(mContextRefCountTls, ++refCount);

	return result == CUDA_SUCCESS;
}
|
|
|
|
// Counterpart of tryAcquireContext(): decrements this thread's refcount and
// pops the context when it reaches zero (every time in PX_DEBUG builds).
void CudaCtxMgr::releaseContext()
{
	size_t refCount = PxTlsGetValue(mContextRefCountTls);

#if PX_DEBUG
	// Debug builds pop on every release, mirroring the per-acquire push.
	CUcontext ctx = 0;
	CUT_SAFE_CALL(cuCtxPopCurrent(&ctx));
	PxAtomicDecrement(&mPushPopCount);
#else
	// NOTE(review): assumes releases are strictly paired with acquires; an
	// unmatched release would wrap the size_t counter — confirm call sites.
	if (--refCount == 0)
	{
		CUcontext ctx = 0;
		CUT_SAFE_CALL(cuCtxPopCurrent(&ctx));
	}
#endif
	PxTlsSetValue(mContextRefCountTls, refCount);
}
|
|
|
|
class CudaCtx : public PxCudaContext, public PxUserAllocated
|
|
{
|
|
private:
|
|
CUresult mLastResult;
|
|
bool mLaunchSynchronous;
|
|
bool mIsInAbortMode;
|
|
|
|
public:
|
|
CudaCtx(PxDeviceAllocatorCallback* callback, bool launchSynchronous);
|
|
~CudaCtx();
|
|
|
|
// PxCudaContext
|
|
void release() PX_OVERRIDE PX_FINAL;
|
|
PxCUresult memAlloc(CUdeviceptr* dptr, size_t bytesize) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult memFree(CUdeviceptr dptr) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult memHostAlloc(void** pp, size_t bytesize, unsigned int Flags) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult memFreeHost(void* p) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult memHostGetDevicePointer(CUdeviceptr* pdptr, void* p, unsigned int Flags) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult moduleLoadDataEx(CUmodule* module, const void* image, unsigned int numOptions, PxCUjit_option* options, void** optionValues) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult moduleGetFunction(CUfunction* hfunc, CUmodule hmod, const char* name) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult moduleUnload(CUmodule hmod) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult streamCreate(CUstream* phStream, unsigned int Flags) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult streamCreateWithPriority(CUstream* phStream, unsigned int flags, int priority) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult streamFlush(CUstream hStream) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult streamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult streamWaitEvent(CUstream hStream, CUevent hEvent) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult streamDestroy(CUstream hStream) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult streamSynchronize(CUstream hStream) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult eventCreate(CUevent* phEvent, unsigned int Flags) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult eventRecord(CUevent hEvent, CUstream hStream) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult eventQuery(CUevent hEvent) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult eventSynchronize(CUevent hEvent) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult eventDestroy(CUevent hEvent) PX_OVERRIDE PX_FINAL;
|
|
|
|
PxCUresult launchKernel(
|
|
CUfunction f,
|
|
PxU32 gridDimX, PxU32 gridDimY, PxU32 gridDimZ,
|
|
PxU32 blockDimX, PxU32 blockDimY, PxU32 blockDimZ,
|
|
PxU32 sharedMemBytes,
|
|
CUstream hStream,
|
|
PxCudaKernelParam* kernelParams,
|
|
size_t kernelParamsSizeInBytes,
|
|
void** extra,
|
|
const char* file,
|
|
int line
|
|
) PX_OVERRIDE PX_FINAL;
|
|
|
|
PxCUresult launchKernel(
|
|
CUfunction f,
|
|
PxU32 gridDimX, PxU32 gridDimY, PxU32 gridDimZ,
|
|
PxU32 blockDimX, PxU32 blockDimY, PxU32 blockDimZ,
|
|
PxU32 sharedMemBytes,
|
|
CUstream hStream,
|
|
void** kernelParams,
|
|
void** extra,
|
|
const char* file,
|
|
int line
|
|
) PX_OVERRIDE PX_FINAL;
|
|
|
|
PxCUresult memcpyDtoH(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult memcpyDtoHAsync(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult memcpyHtoD(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult memcpyHtoDAsync(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount, CUstream hStream) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult memcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult memcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult memcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult memsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult memsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult memsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult memsetD16(CUdeviceptr dstDevice, unsigned short uh, size_t N) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult memsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N) PX_OVERRIDE PX_FINAL;
|
|
PxCUresult getLastError() PX_OVERRIDE PX_FINAL { return isInAbortMode() ? CUDA_ERROR_OUT_OF_MEMORY : mLastResult; }
|
|
|
|
void setAbortMode(bool abort) PX_OVERRIDE PX_FINAL;
|
|
bool isInAbortMode() PX_OVERRIDE PX_FINAL { return mIsInAbortMode; }
|
|
|
|
//~PxCudaContext
|
|
};
|
|
|
|
// Construct the CUDA context wrapper around an (already current) driver context.
// 'callback' is the optional device allocator hook stored for later use;
// 'launchSynchronous' turns every kernel launch / async copy into a blocking
// call so failures surface at the call site (debugging aid).
CudaCtx::CudaCtx(PxDeviceAllocatorCallback* callback, bool launchSynchronous)
{
	mLastResult = CUDA_SUCCESS;
	mAllocatorCallback = callback;
	mIsInAbortMode = false;
#if FORCE_LAUNCH_SYNCHRONOUS
	// Compile-time override: force synchronous launches regardless of the flag.
	PX_UNUSED(launchSynchronous);
	mLaunchSynchronous = true;
#else
	mLaunchSynchronous = launchSynchronous;
#endif
}
|
|
|
|
CudaCtx::~CudaCtx()
{
	// Intentionally empty: this wrapper appears to own no CUDA resources itself -
	// streams, events and allocations are released through the explicit
	// destroy/free entry points by their owners.
}
|
|
|
|
void CudaCtx::release()
{
	// Standard PhysX release() idiom: the object deletes itself. Instances must
	// have been created with PX_NEW (see createCudaContext below).
	PX_DELETE_THIS;
}
|
|
|
|
// Allocate 'bytesize' bytes of device memory.
// In abort mode (e.g. after a device OOM) no allocation is attempted: a null
// pointer and the sticky error code are returned instead.
PxCUresult CudaCtx::memAlloc(CUdeviceptr *dptr, size_t bytesize)
{
	if (mIsInAbortMode)
	{
		*dptr = NULL;
		return mLastResult;
	}

	mLastResult = cuMemAlloc(dptr, bytesize);
#if PX_STOMP_ALLOCATED_MEMORY
	// Debug aid: stomp fresh device memory with 0xcd so reads of uninitialized
	// memory are recognizable. Only touch *dptr when the allocation actually
	// succeeded - on failure the driver is not required to write *dptr, so
	// reading it here would be undefined behavior.
	if (mLastResult == CUDA_SUCCESS && *dptr && bytesize > 0)
	{
		cuCtxSynchronize();
		PxCUresult result = memsetD8(*dptr, PxU8(0xcd), bytesize);
		PX_ASSERT(result == CUDA_SUCCESS);
		PX_UNUSED(result);
		cuCtxSynchronize();
	}
#endif
	return mLastResult;
}
|
|
|
|
// Free device memory previously obtained through memAlloc.
// A null pointer is a no-op and simply reports the sticky status; the call is
// not gated on abort mode, so memory can still be released during teardown.
PxCUresult CudaCtx::memFree(CUdeviceptr dptr)
{
	if (dptr)
		return cuMemFree(dptr);

	return mLastResult;
}
|
|
|
|
// Allocate page-locked host memory. Note: unlike memAlloc, this does not update
// mLastResult - callers inspect the returned status directly (original contract).
PxCUresult CudaCtx::memHostAlloc(void** pp, size_t bytesize, unsigned int Flags)
{
	CUresult result = cuMemHostAlloc(pp, bytesize, Flags);
#if PX_STOMP_ALLOCATED_MEMORY
	// Debug aid: fill fresh host memory with 0xcd. Only touch *pp when the
	// allocation succeeded - on failure the driver may leave *pp unwritten.
	// NOTE(review): PxMemSet takes a PxU32 size, so allocations >= 4GB would be
	// only partially stomped on this debug path - confirm whether that matters.
	if (result == CUDA_SUCCESS && *pp != NULL && bytesize > 0)
	{
		PxMemSet(*pp, PxI32(0xcd), PxU32(bytesize));
	}
#endif
	return result;
}
|
|
|
|
PxCUresult CudaCtx::memFreeHost(void* p)
{
	// Thin pass-through to the driver. Like memFree, not gated on abort mode so
	// host allocations can still be returned during teardown.
	return cuMemFreeHost(p);
}
|
|
|
|
// Map a page-locked host pointer to its device-visible address.
// A null host pointer maps to a null device pointer without calling the driver.
PxCUresult CudaCtx::memHostGetDevicePointer(CUdeviceptr* pdptr, void* p, unsigned int Flags)
{
	if (p)
		return cuMemHostGetDevicePointer(pdptr, p, Flags);

	*pdptr = reinterpret_cast<CUdeviceptr>(p);
	return CUDA_SUCCESS;
}
|
|
|
|
// Load a CUDA module from an in-memory image with JIT options.
// PxCUjit_option is treated as layout-compatible with the driver's CUjit_option
// (exactly as the original C-style cast assumed); use the named cast so the
// re-interpretation is explicit and greppable.
PxCUresult CudaCtx::moduleLoadDataEx(CUmodule* module, const void* image, unsigned int numOptions, PxCUjit_option* options, void** optionValues)
{
	return cuModuleLoadDataEx(module, image, numOptions, reinterpret_cast<CUjit_option*>(options), optionValues);
}
|
|
|
|
PxCUresult CudaCtx::moduleGetFunction(CUfunction* hfunc, CUmodule hmod, const char* name)
{
	// Thin pass-through: look up the kernel entry point 'name' in module 'hmod'.
	return cuModuleGetFunction(hfunc, hmod, name);
}
|
|
|
|
PxCUresult CudaCtx::moduleUnload(CUmodule hmod)
{
	// Thin pass-through. Note: not gated on abort mode, unlike the create-style
	// calls above - unloading proceeds regardless of the sticky error state.
	return cuModuleUnload(hmod);
}
|
|
|
|
PxCUresult CudaCtx::streamCreate(CUstream* phStream, unsigned int Flags)
{
	// While aborted no new streams are created; hand back a null handle and the
	// sticky error code.
	if (mIsInAbortMode)
	{
		*phStream = NULL;
		return mLastResult;
	}

#if !USE_DEFAULT_CUDA_STREAM
	mLastResult = cuStreamCreate(phStream, Flags);
#else
	// Compile-time option: route all work through the shared default stream
	// instead of creating dedicated ones.
	PX_UNUSED(Flags);
	*phStream = CUstream(CU_STREAM_DEFAULT);
	mLastResult = CUDA_SUCCESS;
#endif

	return mLastResult;
}
|
|
|
|
PxCUresult CudaCtx::streamCreateWithPriority(CUstream* phStream, unsigned int flags, int priority)
{
	// While aborted no new streams are created; hand back a null handle and the
	// sticky error code.
	if (mIsInAbortMode)
	{
		*phStream = NULL;
		return mLastResult;
	}

#if !USE_DEFAULT_CUDA_STREAM
	mLastResult = cuStreamCreateWithPriority(phStream, flags, priority);
#else
	// Compile-time option: route all work through the shared default stream;
	// flags and priority are irrelevant in that configuration.
	PX_UNUSED(flags);
	PX_UNUSED(priority);
	*phStream = CUstream(CU_STREAM_DEFAULT);
	mLastResult = CUDA_SUCCESS;
#endif

	return mLastResult;
}
|
|
|
|
// Push any buffered commands on hStream to the device.
// AD: don't remember the error, because this can return CUDA_ERROR_NOT_READY
// which is not really an error. We just misuse streamquery to push the buffer anyway.
PxCUresult CudaCtx::streamFlush(CUstream hStream)
{
	return mIsInAbortMode ? mLastResult : cuStreamQuery(hStream);
}
|
|
|
|
// Make hStream wait on hEvent. Skips the driver entirely while aborted;
// otherwise records the driver result as the sticky status.
PxCUresult CudaCtx::streamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags)
{
	if (!mIsInAbortMode)
		mLastResult = cuStreamWaitEvent(hStream, hEvent, Flags);

	return mLastResult;
}
|
|
|
|
PxCUresult CudaCtx::streamWaitEvent(CUstream hStream, CUevent hEvent)
{
	// Convenience overload: forwards to the three-argument form with Flags = 0.
	return streamWaitEvent(hStream, hEvent, 0);
}
|
|
|
|
PxCUresult CudaCtx::streamDestroy(CUstream hStream)
{
	PX_UNUSED(hStream);
#if !USE_DEFAULT_CUDA_STREAM
	// Destroying a null stream is a no-op; report the sticky status instead.
	if (hStream == NULL)
		return mLastResult;
	return cuStreamDestroy(hStream);
#else
	// In this configuration streamCreate handed out the shared default stream,
	// so there is nothing to destroy.
	return CUDA_SUCCESS;
#endif
}
|
|
|
|
// Block until all work queued on hStream has completed.
// No-op while aborted; otherwise the driver result becomes the sticky status.
PxCUresult CudaCtx::streamSynchronize(CUstream hStream)
{
	if (!mIsInAbortMode)
		mLastResult = cuStreamSynchronize(hStream);

	return mLastResult;
}
|
|
|
|
// Create a CUDA event. While aborted no new events are created: a null handle
// and the sticky error code are returned instead.
PxCUresult CudaCtx::eventCreate(CUevent* phEvent, unsigned int Flags)
{
	if (!mIsInAbortMode)
	{
		mLastResult = cuEventCreate(phEvent, Flags);
	}
	else
	{
		*phEvent = NULL;
	}
	return mLastResult;
}
|
|
|
|
// Record hEvent on hStream. No-op while aborted; otherwise the driver result
// becomes the sticky status.
PxCUresult CudaCtx::eventRecord(CUevent hEvent, CUstream hStream)
{
	if (!mIsInAbortMode)
		mLastResult = cuEventRecord(hEvent, hStream);

	return mLastResult;
}
|
|
|
|
// Poll hEvent's completion status. No-op while aborted; otherwise the driver
// result (which may legitimately be CUDA_ERROR_NOT_READY) becomes the sticky status.
PxCUresult CudaCtx::eventQuery(CUevent hEvent)
{
	if (!mIsInAbortMode)
		mLastResult = cuEventQuery(hEvent);

	return mLastResult;
}
|
|
|
|
// Block until hEvent has completed. No-op while aborted; otherwise the driver
// result becomes the sticky status.
PxCUresult CudaCtx::eventSynchronize(CUevent hEvent)
{
	if (!mIsInAbortMode)
		mLastResult = cuEventSynchronize(hEvent);

	return mLastResult;
}
|
|
|
|
// Destroy a CUDA event. A null handle is a no-op reporting the sticky status;
// like the other destroy/free paths this is not gated on abort mode.
PxCUresult CudaCtx::eventDestroy(CUevent hEvent)
{
	return hEvent ? cuEventDestroy(hEvent) : mLastResult;
}
|
|
|
|
// Launch a kernel described by an array of PxCudaKernelParam descriptors.
// 'file'/'line' identify the call site for error reporting in synchronous mode.
PxCUresult CudaCtx::launchKernel(
	CUfunction f,
	PxU32 gridDimX, PxU32 gridDimY, PxU32 gridDimZ,
	PxU32 blockDimX, PxU32 blockDimY, PxU32 blockDimZ,
	PxU32 sharedMemBytes,
	CUstream hStream,
	PxCudaKernelParam* kernelParams,
	size_t kernelParamsSizeInBytes,
	void** extra,
	const char* file,
	int line
)
{
	if (mIsInAbortMode)
		return mLastResult;

	//We allow CUDA_ERROR_INVALID_VALUE to be non-terminal error as this is sometimes hit
	//when we launch an empty block
	if (mLastResult == CUDA_SUCCESS || mLastResult == CUDA_ERROR_INVALID_VALUE)
	{
		// Repack the descriptor array into the plain void*[] layout expected by
		// cuLaunchKernel. Assumes kernelParamsSizeInBytes is a whole multiple of
		// sizeof(PxCudaKernelParam) - any remainder is silently dropped.
		const uint32_t kernelParamCount = (uint32_t)(kernelParamsSizeInBytes / sizeof(PxCudaKernelParam));
		PX_ALLOCA(kernelParamsLocal, void*, kernelParamCount);
		for (unsigned int paramIdx = 0u; paramIdx < kernelParamCount; paramIdx++)
		{
			kernelParamsLocal[paramIdx] = kernelParams[paramIdx].data;
		}
		mLastResult = cuLaunchKernel(
			f,
			gridDimX, gridDimY, gridDimZ,
			blockDimX, blockDimY, blockDimZ,
			sharedMemBytes,
			hStream,
			kernelParamsLocal,
			extra
		);

		if (mLaunchSynchronous)
		{
			// Synchronous launch mode: wait for completion so a failure is
			// attributed to the reporting file/line of this launch.
			mLastResult = cuStreamSynchronize(hStream);
			if (mLastResult != CUDA_SUCCESS)
				PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, file, line, "Launch failed!! Error: %i\n", mLastResult);
		}

		PX_ASSERT(mLastResult == CUDA_SUCCESS || mLastResult == CUDA_ERROR_INVALID_VALUE);
	}

	return mLastResult;
}
|
|
|
|
// Launch a kernel whose parameters are already packed as a void*[] in the
// layout cuLaunchKernel expects. 'file'/'line' identify the call site for
// error reporting in synchronous mode.
PxCUresult CudaCtx::launchKernel(
	CUfunction f,
	PxU32 gridDimX, PxU32 gridDimY, PxU32 gridDimZ,
	PxU32 blockDimX, PxU32 blockDimY, PxU32 blockDimZ,
	PxU32 sharedMemBytes,
	CUstream hStream,
	void** kernelParams,
	void** extra,
	const char* file,
	int line
)
{
	if (mIsInAbortMode)
		return mLastResult;

	// CUDA_ERROR_INVALID_VALUE is tolerated as a non-terminal error because it
	// is sometimes hit when an empty block is launched; any other sticky error
	// suppresses further launches.
	const bool launchable = (mLastResult == CUDA_SUCCESS) || (mLastResult == CUDA_ERROR_INVALID_VALUE);
	if (!launchable)
		return mLastResult;

	mLastResult = cuLaunchKernel(
		f,
		gridDimX, gridDimY, gridDimZ,
		blockDimX, blockDimY, blockDimZ,
		sharedMemBytes,
		hStream,
		kernelParams,
		extra
	);

	if (mLaunchSynchronous)
	{
		// Synchronous launch mode: wait for completion so a failure is
		// attributed to the reporting file/line of this launch.
		mLastResult = cuStreamSynchronize(hStream);
		if (mLastResult != CUDA_SUCCESS)
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, file, line, "Launch failed!! Error: %i\n", mLastResult);
	}

	PX_ASSERT(mLastResult == CUDA_SUCCESS || mLastResult == CUDA_ERROR_INVALID_VALUE);
	return mLastResult;
}
|
|
|
|
// Synchronous device-to-host copy of ByteCount bytes.
// Zero-byte copies are silently skipped; abort mode short-circuits entirely.
PxCUresult CudaCtx::memcpyDtoH(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount)
{
	if (mIsInAbortMode)
		return mLastResult;

	if (ByteCount > 0)
	{
		mLastResult = cuMemcpyDtoH(dstHost, srcDevice, ByteCount);

		// Report the failure only when this copy actually ran. Previously the
		// check sat outside the ByteCount guard and re-logged a stale error
		// from an earlier call on every zero-byte copy (and was inconsistent
		// with memcpyHtoD below).
		if (mLastResult != CUDA_SUCCESS)
		{
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "copyDToH failed with error code %i!\n", PxI32(mLastResult));
		}
	}

	return mLastResult;
}
|
|
|
|
// Asynchronous device-to-host copy on hStream. In synchronous-launch mode the
// call additionally blocks on the stream so errors surface here.
PxCUresult CudaCtx::memcpyDtoHAsync(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream)
{
	if (mIsInAbortMode || ByteCount == 0)
		return mLastResult;

	mLastResult = cuMemcpyDtoHAsync(dstHost, srcDevice, ByteCount, hStream);

	if (mLaunchSynchronous)
	{
		if (mLastResult != CUDA_SUCCESS)
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "memcpyDtoHAsync invalid parameters!! Error: %i\n", mLastResult);

		mLastResult = cuStreamSynchronize(hStream);
		if (mLastResult != CUDA_SUCCESS)
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "memcpyDtoHAsync failed!! Error: %i\n", mLastResult);
	}

	return mLastResult;
}
|
|
|
|
// Synchronous host-to-device copy of ByteCount bytes.
// Zero-byte copies are silently skipped; abort mode short-circuits entirely.
PxCUresult CudaCtx::memcpyHtoD(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount)
{
	if (mIsInAbortMode || ByteCount == 0)
		return mLastResult;

	mLastResult = cuMemcpyHtoD(dstDevice, srcHost, ByteCount);
	if (mLastResult != CUDA_SUCCESS)
		PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "memcpyHtoD invalid parameters!! %i\n", mLastResult);

	return mLastResult;
}
|
|
|
|
// Asynchronous host-to-device copy on hStream. In synchronous-launch mode the
// call additionally blocks on the stream so errors surface here.
PxCUresult CudaCtx::memcpyHtoDAsync(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount, CUstream hStream)
{
	if (mIsInAbortMode || ByteCount == 0)
		return mLastResult;

	mLastResult = cuMemcpyHtoDAsync(dstDevice, srcHost, ByteCount, hStream);

	if (mLaunchSynchronous)
	{
		if (mLastResult != CUDA_SUCCESS)
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "memcpyHtoDAsync invalid parameters!! Error: %i\n", mLastResult);

		mLastResult = cuStreamSynchronize(hStream);
		if (mLastResult != CUDA_SUCCESS)
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "memcpyHtoDAsync failed!! Error: %i\n", mLastResult);
	}

	return mLastResult;
}
|
|
|
|
// Asynchronous device-to-device copy on hStream. In synchronous-launch mode the
// call additionally blocks on the stream so errors surface here.
PxCUresult CudaCtx::memcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream)
{
	if (mIsInAbortMode || ByteCount == 0)
		return mLastResult;

	mLastResult = cuMemcpyDtoDAsync(dstDevice, srcDevice, ByteCount, hStream);

	if (mLaunchSynchronous)
	{
		if (mLastResult != CUDA_SUCCESS)
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "memcpyDtoDAsync invalid parameters!! Error: %i\n", mLastResult);

		mLastResult = cuStreamSynchronize(hStream);
		if (mLastResult != CUDA_SUCCESS)
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "memcpyDtoDAsync failed!! Error: %i\n", mLastResult);
	}

	return mLastResult;
}
|
|
|
|
// Device-to-device copy of ByteCount bytes, followed by a null-stream
// synchronize to avoid race conditions (see link below).
PxCUresult CudaCtx::memcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount)
{
	if (mIsInAbortMode)
		return mLastResult;

	if (ByteCount > 0)
	{
		mLastResult = cuMemcpyDtoD(dstDevice, srcDevice, ByteCount);
		if (mLastResult != CUDA_SUCCESS)
		{
			// Previously this result was immediately overwritten by the
			// synchronize below, silently masking a failed copy.
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "memcpyDtoD invalid parameters!! Error: %i\n", mLastResult);
			return mLastResult;
		}

		// synchronize to avoid race conditions.
		// https://docs.nvidia.com/cuda/cuda-driver-api/api-sync-behavior.html#api-sync-behavior__memcpy
		mLastResult = cuStreamSynchronize(0);
		if (mLastResult != CUDA_SUCCESS)
		{
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "memcpyDtoD invalid parameters!! Error: %i\n", mLastResult);
		}
	}
	return mLastResult;
}
|
|
|
|
// Asynchronous copy between two CUDA contexts. The result is intentionally not
// stored in mLastResult (matching the original behavior); abort mode
// short-circuits with the sticky status.
PxCUresult CudaCtx::memcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream)
{
	return mIsInAbortMode ? mLastResult
	                      : cuMemcpyPeerAsync(dstDevice, dstContext, srcDevice, srcContext, ByteCount, hStream);
}
|
|
|
|
// Asynchronously fill N 32-bit words at dstDevice with 'ui'. In
// synchronous-launch mode the call additionally blocks on the stream so errors
// surface here.
PxCUresult CudaCtx::memsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream)
{
	if (mIsInAbortMode || N == 0)
		return mLastResult;

	mLastResult = cuMemsetD32Async(dstDevice, ui, N, hStream);

	if (mLaunchSynchronous)
	{
		PX_ASSERT(mLastResult == CUDA_SUCCESS);
		if (mLastResult != CUDA_SUCCESS)
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "memsetD32Async invalid parameters!! Error: %i\n", mLastResult);

		mLastResult = cuStreamSynchronize(hStream);
		if (mLastResult != CUDA_SUCCESS)
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "memsetD32Async failed!! Error: %i\n", mLastResult);
	}

	return mLastResult;
}
|
|
|
|
// Asynchronously fill N bytes at dstDevice with 'uc'. In synchronous-launch
// mode the call additionally blocks on the stream so errors surface here.
PxCUresult CudaCtx::memsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream)
{
	if (mIsInAbortMode || N == 0)
		return mLastResult;

	mLastResult = cuMemsetD8Async(dstDevice, uc, N, hStream);

	if (mLaunchSynchronous)
	{
		if (mLastResult != CUDA_SUCCESS)
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "cuMemsetD8Async invalid parameters!! Error: %i\n", mLastResult);

		mLastResult = cuStreamSynchronize(hStream);
		if (mLastResult != CUDA_SUCCESS)
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "cuMemsetD8Async failed!! Error: %i\n", mLastResult);
	}

	return mLastResult;
}
|
|
|
|
// Fill N 32-bit words at dstDevice with 'ui', then synchronize the null stream.
// (Both failure paths log the same message, so the reporting is merged below.)
PxCUresult CudaCtx::memsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N)
{
	if (mIsInAbortMode || N == 0)
		return mLastResult;

	mLastResult = cuMemsetD32(dstDevice, ui, N);
	if (mLastResult == CUDA_SUCCESS)
	{
		// synchronize to avoid race conditions.
		// https://docs.nvidia.com/cuda/cuda-driver-api/api-sync-behavior.html#api-sync-behavior__memset
		mLastResult = cuStreamSynchronize(0);
	}

	if (mLastResult != CUDA_SUCCESS)
		PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "memsetD32 failed!! Error: %i\n", mLastResult);

	return mLastResult;
}
|
|
|
|
// Fill N 16-bit words at dstDevice with 'uh', then synchronize the null stream
// to avoid race conditions.
PxCUresult CudaCtx::memsetD16(CUdeviceptr dstDevice, unsigned short uh, size_t N)
{
	if (mIsInAbortMode)
		return mLastResult;

	if (N > 0)
	{
		// Capture the memset result instead of discarding it: previously a
		// failed cuMemsetD16 was masked by the synchronize result below
		// (inconsistent with memsetD32, which checks it).
		mLastResult = cuMemsetD16(dstDevice, uh, N);
		if (mLastResult != CUDA_SUCCESS)
		{
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "memsetD16 failed!! Error: %i\n", mLastResult);
			return mLastResult;
		}

		// synchronize to avoid race conditions.
		// https://docs.nvidia.com/cuda/cuda-driver-api/api-sync-behavior.html#api-sync-behavior__memset
		mLastResult = cuStreamSynchronize(0);
		if (mLastResult != CUDA_SUCCESS)
		{
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "memsetD16 failed!! Error: %i\n", mLastResult);
		}
	}
	return mLastResult;
}
|
|
|
|
// Fill N bytes at dstDevice with 'uc', then synchronize the null stream to
// avoid race conditions.
PxCUresult CudaCtx::memsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N)
{
	if (mIsInAbortMode)
		return mLastResult;

	if (N > 0)
	{
		// Capture the memset result instead of discarding it: previously a
		// failed cuMemsetD8 was masked by the synchronize result below
		// (inconsistent with memsetD32, which checks it).
		mLastResult = cuMemsetD8(dstDevice, uc, N);
		if (mLastResult != CUDA_SUCCESS)
		{
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "memsetD8 failed!! Error: %i\n", mLastResult);
			return mLastResult;
		}

		// synchronize to avoid race conditions.
		// https://docs.nvidia.com/cuda/cuda-driver-api/api-sync-behavior.html#api-sync-behavior__memset
		mLastResult = cuStreamSynchronize(0);
		if (mLastResult != CUDA_SUCCESS)
		{
			PxGetFoundation().error(PxErrorCode::eINTERNAL_ERROR, PX_FL, "memsetD8 failed!! Error: %i\n", mLastResult);
		}
	}
	return mLastResult;
}
|
|
|
|
void CudaCtx::setAbortMode(bool abort)
|
|
{
|
|
mIsInAbortMode = abort;
|
|
|
|
if ((abort == false) && (mLastResult == CUDA_ERROR_OUT_OF_MEMORY))
|
|
{
|
|
mLastResult = CUDA_SUCCESS;
|
|
}
|
|
}
|
|
|
|
PxCudaContext* createCudaContext(CUdevice device, PxDeviceAllocatorCallback* callback, bool launchSynchronous)
{
	// Factory for the CudaCtx wrapper. The device handle is currently unused -
	// presumably the wrapper relies on the already-current CUDA context (confirm
	// with callers). Destroy via release(), not delete.
	PX_UNUSED(device);
	return PX_NEW(CudaCtx)(callback, launchSynchronous);
}
|
|
|
|
#if PX_SUPPORT_GPU_PHYSX
|
|
|
|
PxCudaContextManager* createCudaContextManager(const PxCudaContextManagerDesc& desc, PxErrorCallback& errorCallback, bool launchSynchronous)
{
	// Factory for the GPU context manager; allocated with PX_NEW, so ownership
	// passes to the caller. 'launchSynchronous' is forwarded to the underlying
	// CUDA context wrapper (see CudaCtx above).
	return PX_NEW(CudaCtxMgr)(desc, errorCallback, launchSynchronous);
}
|
|
|
|
#endif
|
|
|
|
} // end physx namespace
|
|
|
|
|