QTrk
gpu_utils.h
Go to the documentation of this file.
1 #pragma once
2 // Simple device side vector implementation.
3 #include <cuda_runtime.h>
4 #include <vector>
5 #include <cstdarg>
6 #include "cufft.h"
7 
8 #ifdef _DEBUG
9 #define GPU_DEBUG
10 #endif
11 
12 #define CUDA_SUPPORTED_FUNC __device__ __host__
13 #include "LsqQuadraticFit.h"
14 
16 #define CUBOTH __device__ __host__
17 
18 inline void outputTotalGPUMemUse(std::string info = "")
19 {
20  // show total memory usage of GPU
21  size_t free_byte;
22  size_t total_byte;
23  cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte );
24  if ( cudaSuccess != cuda_status ){
25  dbgprintf("Error: cudaMemGetInfo fails, %s \n", cudaGetErrorString(cuda_status) );
26  exit(1);
27  }
28  double free_db = (double)free_byte;
29  double total_db = (double)total_byte;
30  double used_db = total_db - free_db;
31  dbgprintf("%sused = %2.2f MB, free = %2.2f MB, total = %2.2f MB\n",
32  info != "" ? (info+": ").c_str() : "",
33  used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
34 }
35 
36 inline void CheckCUDAError(cufftResult_t err)
37 {
38  if (err != CUFFT_SUCCESS) {
39  outputTotalGPUMemUse("CUFFT Error");
40  throw std::runtime_error(SPrintf("CUDA error: CUFFT failed (%d)\n",err));
41  }
42 }
43 
44 inline void CheckCUDAError(cudaError_t err)
45 {
46  if (err != cudaSuccess) {
47  const char* errstr = cudaGetErrorString(err);
48  throw std::runtime_error(SPrintf("CUDA error: %s\n" ,errstr).c_str());
49  }
50 }
51 
52 inline void CheckCUDAError()
53 {
54  cudaError_t err = cudaGetLastError();
55  if (err != cudaSuccess) {
56  const char* errstr = cudaGetErrorString(err);
57  dbgprintf("CUDA error: %s\n" ,errstr);
58  }
59 }
60 #ifdef _DEBUG
61 inline void dbgCUDAErrorCheck(cudaError_t e) { CheckCUDAError(e); }
62 #else
63 inline void dbgCUDAErrorCheck(cudaError_t e) {}
64 #endif
65 
67 template<typename T>
68 class device_vec {
69 public:
71  data = 0;
72  size = 0;
73  }
74 
75  device_vec(size_t N) {
76  data = 0;
77  size = 0;
78  init(N);
79  }
80  device_vec(const device_vec<T>& src) {
81  data = 0; size = 0;
82  init(src.size);
83  dbgCUDAErrorCheck(cudaMemcpy(data, src.data, sizeof(T)*size, cudaMemcpyDeviceToDevice));
84  }
85  device_vec(const std::vector<T>& src) {
86  data=0; size=0;
87  init(src.size());
88  dbgCUDAErrorCheck(cudaMemcpy(data, &src[0], sizeof(T)*size, cudaMemcpyHostToDevice));
89  }
91  free();
92  }
93  void init(size_t s) {
94  if(size != s) {
95  free();
96  }
97  if (s!=0) {
98  if (cudaMalloc(&data, sizeof(T)*s) != cudaSuccess) {
99  throw std::bad_alloc(SPrintf("device_vec<%s> init %d elements failed", typeid(T).name(), s).c_str());
100  }
101  size = s;
102  }
103  }
104  void free() {
105  if (data) {
106  dbgCUDAErrorCheck(cudaFree(data));
107  data=0;
108  }
109  }
110  operator std::vector<T>() const {
111  std::vector<T> dst(size);
112  dbgCUDAErrorCheck(cudaMemcpy(&dst[0], data, sizeof(T)*size, cudaMemcpyDeviceToHost));
113  return dst;
114  }
115  device_vec<T>& operator=(const std::vector<T>& src) {
116  init(src.size());
117  dbgCUDAErrorCheck(cudaMemcpy(data, &src[0], sizeof(T)*size, cudaMemcpyHostToDevice));
118  return *this;
119  }
121  clear();
122  init(src.size);
123  dbgCUDAErrorCheck(cudaMemcpy(data, src.data, sizeof(T)*size, cudaMemcpyDeviceToDevice));
124  return *this;
125  }
126  void copyToHost(T* dst, bool async, cudaStream_t s=0) {
127  if (async)
128  dbgCUDAErrorCheck(cudaMemcpyAsync(dst, data, sizeof(T) * size, cudaMemcpyDeviceToHost, s));
129  else
130  dbgCUDAErrorCheck(cudaMemcpy(dst, data, sizeof(T) * size, cudaMemcpyDeviceToHost));
131  }
132  void copyToHost(std::vector<T>& dst ,bool async, cudaStream_t s=0) {
133  if (dst.size() != size)
134  dst.resize(size);
135  copyToHost(&dst[0], async, s);
136  }
137  void copyToDevice(const std::vector<T>& src, bool async=false, cudaStream_t s=0) {
138  copyToDevice(&src[0], src.size(), async, s);
139  }
140  void copyToDevice(const T* first, size_t size, bool async=false, cudaStream_t s=0) {
141  if (this->size < size)
142  init(size);
143  if (async)
144  dbgCUDAErrorCheck(cudaMemcpyAsync(data, first, sizeof(T) * size, cudaMemcpyHostToDevice, s));
145  else
146  dbgCUDAErrorCheck(cudaMemcpy(data, first, sizeof(T) * size, cudaMemcpyHostToDevice));
147  }
148  // debugging util. Be sure to synchronize before
149  std::vector<T> toVector() {
150  std::vector<T> v (size);
151  dbgCUDAErrorCheck(cudaMemcpy(&v[0], data, sizeof(T)*size, cudaMemcpyDeviceToHost));
152  return v;
153  }
154  size_t memsize() { return size*sizeof(T); }
155  size_t size;
156  T* data;
157 };
158 
159 
160 
161 #if 1 //defined(_DEBUG)
162 struct MeasureTime {
163  uint64_t freq, time;
164  const char* name;
165  MeasureTime(const char *name) {
166  QueryPerformanceCounter((LARGE_INTEGER*)&time);
167  QueryPerformanceFrequency((LARGE_INTEGER*)&freq);
168  this->name=name;
169  }
171  uint64_t time1;
172  QueryPerformanceCounter((LARGE_INTEGER*)&time1);
173  double dt = (double)(time1-time) / (double)freq;
174  dbgprintf("%s: Time taken: %f ms\n", name, dt*1000);
175  }
176 };
177 #else
178 struct MeasureTime {
179  MeasureTime(const char* name) {}
180 };
181 #endif
182 
183 
184 template<typename T, int flags=0>
186 {
187 public:
189  d=0; n=0;
190  }
192  free();
193  }
194  pinned_array(size_t n) {
195  d=0; init(n);
196  }
197  template<typename TOther, int f>
199  d=0;init(src.n);
200  for(int k=0;k<src.n;k++)
201  d[k]=src[k];
202  }
203  template<typename TOther, int F>
205  if (src.n != n) init(src.n);
206  for(int k=0;k<src.n;k++)
207  d[k]=src[k];
208  return *this;
209  }
210  template<typename Iterator>
211  pinned_array(Iterator first, Iterator end) {
212  d=0; init(end-first);
213  for (int k = 0; first != end; ++first) {
214  d[k++] = *first;
215  }
216  }
217  template<typename T>
219  d=0; init(src.size()); src.copyToHost(d,false);
220  }
221 
222  int size() const { return n; }
223  T* begin() { return d; }
224  T* end() { return d+n; }
225  const T* begin() const { return d; }
226  const T* end() const { return d+n; }
227  T* data() { return d; }
228  void free() {
229  cudaFreeHost(d);
230  d=0;n=0;
231  }
232  void init(int n) {
233  if (d) free();
234  this->n = n;
235  if (cudaMallocHost(&d, sizeof(T)*n, flags) != cudaSuccess) {
236  throw std::bad_alloc(SPrintf("%s init %d elements failed", typeid(*this).name(), n).c_str());
237  }
238  }
239  T& operator[](int i) { return d[i]; }
240  const T&operator[](int i) const { return d[i];}
241  size_t memsize() { return n*sizeof(T); }
242 
243 protected:
244  T* d;
245  size_t n;
246 };
247 
248 #ifdef GPU_DEBUG
249 inline void DbgCopyResult(device_vec<float2>& src, std::vector< std::complex<float> >& dst) {
250  cudaDeviceSynchronize();
251  std::vector<float2> x(src.size);
252  src.copyToHost(x,false,0);
253  dst.resize(src.size);
254  for(int i=0;i<x.size();i++)
255  dst[i]=std::complex<float>(x[i].x,x[i].y);
256 }
257 inline void DbgCopyResult(device_vec<float>& src, std::vector< float >& dst) {
258  cudaDeviceSynchronize();
259  src.copyToHost(dst,false,0);
260 }
261 inline void DbgOutputVectorToFile(std::string loc, device_vec<float>& src, bool append = true) {
262  std::vector<float> dbg_output(src.size);
263  DbgCopyResult(src, dbg_output);
264  WriteVectorAsCSVRow(loc.c_str(), dbg_output, append);
265  dbg_output.clear();
266 }
267 #else
268 inline void DbgCopyResult(device_vec<float2> src, std::vector< std::complex<float> >& dst) {}
269 inline void DbgCopyResult(device_vec<float> src, std::vector<float>& dst) {}
270 inline void DbgOutputVectorToFile(std::string loc, device_vec<float>& src, bool append) {}
271 #endif
pinned_array(size_t n)
Definition: gpu_utils.h:194
T * data()
Definition: gpu_utils.h:227
int size() const
Definition: gpu_utils.h:222
device_vec(const device_vec< T > &src)
Definition: gpu_utils.h:80
void copyToHost(T *dst, bool async, cudaStream_t s=0)
Definition: gpu_utils.h:126
void free()
Definition: gpu_utils.h:104
void free()
Definition: gpu_utils.h:228
void WriteVectorAsCSVRow(const char *file, std::vector< float > d, bool append)
Definition: utils.cpp:523
pinned_array(const device_vec< T > &src)
Definition: gpu_utils.h:218
void init(size_t s)
Definition: gpu_utils.h:93
MeasureTime(const char *name)
Definition: gpu_utils.h:165
void DbgOutputVectorToFile(std::string loc, device_vec< float > &src, bool append)
Definition: gpu_utils.h:270
void copyToHost(std::vector< T > &dst, bool async, cudaStream_t s=0)
Definition: gpu_utils.h:132
device_vec()
Definition: gpu_utils.h:70
size_t memsize()
Definition: gpu_utils.h:241
device_vec(const std::vector< T > &src)
Definition: gpu_utils.h:85
pinned_array(const pinned_array< TOther, f > &src)
Definition: gpu_utils.h:198
pinned_array & operator=(const pinned_array< TOther, F > &src)
Definition: gpu_utils.h:204
void CheckCUDAError(cufftResult_t err)
Definition: gpu_utils.h:36
T & operator[](int i)
Definition: gpu_utils.h:239
T * begin()
Definition: gpu_utils.h:223
void copyToDevice(const std::vector< T > &src, bool async=false, cudaStream_t s=0)
Definition: gpu_utils.h:137
void outputTotalGPUMemUse(std::string info="")
Definition: gpu_utils.h:18
uint64_t time
Definition: gpu_utils.h:163
std::vector< T > toVector()
Definition: gpu_utils.h:149
size_t size
Definition: gpu_utils.h:155
pinned_array(Iterator first, Iterator end)
Definition: gpu_utils.h:211
void dbgCUDAErrorCheck(cudaError_t e)
Definition: gpu_utils.h:63
const T & operator[](int i) const
Definition: gpu_utils.h:240
device_vec< T > & operator=(const std::vector< T > &src)
Definition: gpu_utils.h:115
void copyToDevice(const T *first, size_t size, bool async=false, cudaStream_t s=0)
Definition: gpu_utils.h:140
const T * end() const
Definition: gpu_utils.h:226
void init(int n)
Definition: gpu_utils.h:232
size_t n
Definition: gpu_utils.h:245
void dbgprintf(const char *fmt,...)
Definition: utils.cpp:149
void DbgCopyResult(device_vec< float2 > src, std::vector< std::complex< float > > &dst)
Definition: gpu_utils.h:268
size_t memsize()
Definition: gpu_utils.h:154
device_vec< T > & operator=(const device_vec< T > &src)
Definition: gpu_utils.h:120
~device_vec()
Definition: gpu_utils.h:90
const char * name
Definition: gpu_utils.h:164
const T * begin() const
Definition: gpu_utils.h:225
device_vec(size_t N)
Definition: gpu_utils.h:75
std::string SPrintf(const char *fmt,...)
Definition: utils.cpp:132