3 #include <cuda_runtime.h>    12 #define CUDA_SUPPORTED_FUNC __device__ __host__    16 #define CUBOTH __device__ __host__    23     cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte );
    24     if ( cudaSuccess != cuda_status ){
    25         dbgprintf(
"Error: cudaMemGetInfo fails, %s \n", cudaGetErrorString(cuda_status) );
    28     double free_db = (double)free_byte;
    29     double total_db = (double)total_byte;
    30     double used_db = total_db - free_db;
    31     dbgprintf(
"%sused = %2.2f MB, free = %2.2f MB, total = %2.2f MB\n",
    32         info != 
"" ? (info+
": ").c_str() : 
"",
    33         used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
    38     if (err != CUFFT_SUCCESS) {
    40         throw std::runtime_error(
SPrintf(
"CUDA error: CUFFT failed (%d)\n",err));
    46     if (err != cudaSuccess) {
    47         const char* errstr = cudaGetErrorString(err);
    48         throw std::runtime_error(
SPrintf(
"CUDA error: %s\n" ,errstr).c_str());
    54     cudaError_t err = cudaGetLastError();
    55     if (err != cudaSuccess) {
    56         const char* errstr = cudaGetErrorString(err);
    98             if (cudaMalloc(&
data, 
sizeof(T)*s) != cudaSuccess) {
    99                 throw std::bad_alloc(
SPrintf(
"device_vec<%s> init %d elements failed", 
typeid(T).name(), s).c_str());
   110     operator std::vector<T>() 
const {
   111         std::vector<T> dst(
size);
   132     void copyToHost(std::vector<T>& dst ,
bool async, cudaStream_t s=0) {
   133         if (dst.size() != 
size)
   137     void copyToDevice(
const std::vector<T>& src, 
bool async=
false, cudaStream_t s=0) {
   141         if (this->size < size)
   150         std::vector<T> v (
size);
   161 #if 1 //defined(_DEBUG)   166         QueryPerformanceCounter((LARGE_INTEGER*)&time);
   167         QueryPerformanceFrequency((LARGE_INTEGER*)&freq);
   172         QueryPerformanceCounter((LARGE_INTEGER*)&time1);
   173         double dt = (double)(time1-time) / (double)freq;
   174         dbgprintf(
"%s: Time taken: %f ms\n", name, dt*1000);
   184 template<
typename T, 
int flags=0>
   197     template<
typename TOther, 
int f>
   200         for(
int k=0;k<src.
n;k++)
   203     template<
typename TOther, 
int F>
   205         if (src.
n != n) 
init(src.
n);
   206         for(
int k=0;k<src.
n;k++)
   210     template<
typename Iterator>
   212         d=0; 
init(end-first);
   213         for (
int k = 0; first != end; ++first) {
   225     const T* 
begin()
 const { 
return d; }
   226     const T* 
end()
 const { 
return d+n; }
   235         if (cudaMallocHost(&d, 
sizeof(T)*n, flags) != cudaSuccess) {
   236             throw std::bad_alloc(
SPrintf(
"%s init %d elements failed", 
typeid(*this).name(), n).c_str());
   250     cudaDeviceSynchronize();
   251     std::vector<float2> x(src.
size);
   253     dst.resize(src.
size);
   254     for(
int i=0;i<x.size();i++)
   255         dst[i]=std::complex<float>(x[i].x,x[i].y);
   258     cudaDeviceSynchronize();
   262     std::vector<float> dbg_output(src.
size);
 
device_vec(const device_vec< T > &src)
 
void copyToHost(T *dst, bool async, cudaStream_t s=0)
 
void WriteVectorAsCSVRow(const char *file, std::vector< float > d, bool append)
 
pinned_array(const device_vec< T > &src)
 
MeasureTime(const char *name)
 
void DbgOutputVectorToFile(std::string loc, device_vec< float > &src, bool append)
 
void copyToHost(std::vector< T > &dst, bool async, cudaStream_t s=0)
 
device_vec(const std::vector< T > &src)
 
pinned_array(const pinned_array< TOther, f > &src)
 
pinned_array & operator=(const pinned_array< TOther, F > &src)
 
void CheckCUDAError(cufftResult_t err)
 
void copyToDevice(const std::vector< T > &src, bool async=false, cudaStream_t s=0)
 
void outputTotalGPUMemUse(std::string info="")
 
std::vector< T > toVector()
 
pinned_array(Iterator first, Iterator end)
 
void dbgCUDAErrorCheck(cudaError_t e)
 
const T & operator[](int i) const
 
device_vec< T > & operator=(const std::vector< T > &src)
 
void copyToDevice(const T *first, size_t size, bool async=false, cudaStream_t s=0)
 
void dbgprintf(const char *fmt,...)
 
void DbgCopyResult(device_vec< float2 > src, std::vector< std::complex< float > > &dst)
 
device_vec< T > & operator=(const device_vec< T > &src)
 
std::string SPrintf(const char *fmt,...)