3 #include <cuda_runtime.h> 12 #define CUDA_SUPPORTED_FUNC __device__ __host__ 16 #define CUBOTH __device__ __host__ 23 cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte );
24 if ( cudaSuccess != cuda_status ){
25 dbgprintf(
"Error: cudaMemGetInfo fails, %s \n", cudaGetErrorString(cuda_status) );
28 double free_db = (double)free_byte;
29 double total_db = (double)total_byte;
30 double used_db = total_db - free_db;
31 dbgprintf(
"%sused = %2.2f MB, free = %2.2f MB, total = %2.2f MB\n",
32 info !=
"" ? (info+
": ").c_str() :
"",
33 used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
38 if (err != CUFFT_SUCCESS) {
40 throw std::runtime_error(
SPrintf(
"CUDA error: CUFFT failed (%d)\n",err));
46 if (err != cudaSuccess) {
47 const char* errstr = cudaGetErrorString(err);
48 throw std::runtime_error(
SPrintf(
"CUDA error: %s\n" ,errstr).c_str());
54 cudaError_t err = cudaGetLastError();
55 if (err != cudaSuccess) {
56 const char* errstr = cudaGetErrorString(err);
98 if (cudaMalloc(&
data,
sizeof(T)*s) != cudaSuccess) {
99 throw std::bad_alloc(
SPrintf(
"device_vec<%s> init %d elements failed",
typeid(T).name(), s).c_str());
110 operator std::vector<T>()
const {
111 std::vector<T> dst(
size);
132 void copyToHost(std::vector<T>& dst ,
bool async, cudaStream_t s=0) {
133 if (dst.size() !=
size)
137 void copyToDevice(
const std::vector<T>& src,
bool async=
false, cudaStream_t s=0) {
141 if (this->size < size)
150 std::vector<T> v (
size);
161 #if 1 //defined(_DEBUG) 166 QueryPerformanceCounter((LARGE_INTEGER*)&time);
167 QueryPerformanceFrequency((LARGE_INTEGER*)&freq);
172 QueryPerformanceCounter((LARGE_INTEGER*)&time1);
173 double dt = (double)(time1-time) / (double)freq;
174 dbgprintf(
"%s: Time taken: %f ms\n", name, dt*1000);
184 template<
typename T,
int flags=0>
197 template<
typename TOther,
int f>
200 for(
int k=0;k<src.
n;k++)
203 template<
typename TOther,
int F>
205 if (src.
n != n)
init(src.
n);
206 for(
int k=0;k<src.
n;k++)
210 template<
typename Iterator>
212 d=0;
init(end-first);
213 for (
int k = 0; first != end; ++first) {
225 const T*
begin()
const {
return d; }
226 const T*
end()
const {
return d+n; }
235 if (cudaMallocHost(&d,
sizeof(T)*n, flags) != cudaSuccess) {
236 throw std::bad_alloc(
SPrintf(
"%s init %d elements failed",
typeid(*this).name(), n).c_str());
250 cudaDeviceSynchronize();
251 std::vector<float2> x(src.
size);
253 dst.resize(src.
size);
254 for(
int i=0;i<x.size();i++)
255 dst[i]=std::complex<float>(x[i].x,x[i].y);
258 cudaDeviceSynchronize();
262 std::vector<float> dbg_output(src.
size);
device_vec(const device_vec< T > &src)
void copyToHost(T *dst, bool async, cudaStream_t s=0)
void WriteVectorAsCSVRow(const char *file, std::vector< float > d, bool append)
pinned_array(const device_vec< T > &src)
MeasureTime(const char *name)
void DbgOutputVectorToFile(std::string loc, device_vec< float > &src, bool append)
void copyToHost(std::vector< T > &dst, bool async, cudaStream_t s=0)
device_vec(const std::vector< T > &src)
pinned_array(const pinned_array< TOther, f > &src)
pinned_array & operator=(const pinned_array< TOther, F > &src)
void CheckCUDAError(cufftResult_t err)
void copyToDevice(const std::vector< T > &src, bool async=false, cudaStream_t s=0)
void outputTotalGPUMemUse(std::string info="")
std::vector< T > toVector()
pinned_array(Iterator first, Iterator end)
void dbgCUDAErrorCheck(cudaError_t e)
const T & operator[](int i) const
device_vec< T > & operator=(const std::vector< T > &src)
void copyToDevice(const T *first, size_t size, bool async=false, cudaStream_t s=0)
void dbgprintf(const char *fmt,...)
void DbgCopyResult(device_vec< float2 > src, std::vector< std::complex< float > > &dst)
device_vec< T > & operator=(const device_vec< T > &src)
std::string SPrintf(const char *fmt,...)