QTrk: cudaImageList.h
#pragma once

#include "gpu_utils.h"

// Stores a large number of small images into a single large memory space.
// Optimizes GPU memory copies.
template<typename T>
struct cudaImageList {
    T* data;
    size_t pitch;
    int w, h;
    int count;

    CUBOTH int fullwidth() { return w; }
    CUBOTH int fullheight() { return h*count; }

    enum { MaxImageWidth = 8192 };

    CUBOTH int capacity() { return count; }
    CUBOTH int numpixels() { return w*h*count; }

    static cudaImageList<T> emptyList() {
        cudaImageList imgl;
        imgl.data = 0;
        imgl.pitch = 0;
        imgl.w = imgl.h = 0;
        imgl.count = 0;
        return imgl;
    }

    CUBOTH bool isEmpty() { return data == 0; }

    static cudaImageList<T> alloc(int w, int h, int amount) {
        cudaImageList imgl;
        imgl.w = w; imgl.h = h;
        imgl.count = amount;

        // note: bad_alloc(const char*) is an MSVC extension
        if (cudaMallocPitch(&imgl.data, &imgl.pitch, sizeof(T)*imgl.fullwidth(), imgl.fullheight()) != cudaSuccess) {
            throw std::bad_alloc(SPrintf("cudaImageList<%s> alloc %dx%dx%d failed", typeid(T).name(), w, h, amount).c_str());
        }
        return imgl;
    }

    // Allocate a pinned host buffer large enough to hold all images in the list
    template<int Flags>
    void allocateHostImageBuffer(pinned_array<T, Flags>& hostImgBuf) {
        hostImgBuf.init( numpixels() );
    }

    CUBOTH T* get(int i) {
        return (T*)(((char*)data) + pitch*h*i);
    }

    CUBOTH T pixel_oobcheck(int x, int y, int imgIndex, T border=0.0f) {
        if (x < 0 || x >= w || y < 0 || y >= h)
            return border;

        computeImagePos(x, y, imgIndex);
        T* row = (T*)( (char*)data + y*pitch );
        return row[x];
    }

    CUBOTH T& pixel(int x, int y, int imgIndex) {
        computeImagePos(x, y, imgIndex);
        T* row = (T*)( (char*)data + y*pitch );
        return row[x];
    }

    CUBOTH T* pixelAddress(int x, int y, int imgIndex) {
        computeImagePos(x, y, imgIndex);
        T* row = (T*)( (char*)data + y*pitch );
        return row + x;
    }

    // Returns true if the circle (center, radius) crosses the image bounds
    CUBOTH bool boundaryHit(float2 center, float radius)
    {
        return center.x + radius >= w ||
            center.x - radius < 0 ||
            center.y + radius >= h ||
            center.y - radius < 0;
    }

    void free()
    {
        if (data) {
            cudaFree(data);
            data = 0;
        }
    }

    // Copy a single subimage to the host
    void copyImageToHost(int img, T* dst, bool async=false, cudaStream_t s=0) {
        T* src = pixelAddress(0, 0, img);

        if (async)
            cudaMemcpy2DAsync(dst, sizeof(T)*w, src, pitch, w*sizeof(T), h, cudaMemcpyDeviceToHost, s);
        else
            cudaMemcpy2D(dst, sizeof(T)*w, src, pitch, w*sizeof(T), h, cudaMemcpyDeviceToHost);
    }

    // Copy a single subimage to the device
    void copyImageToDevice(int img, T* src, bool async=false, cudaStream_t s=0) {
        T* dst = pixelAddress(0, 0, img);

        if (async)
            cudaMemcpy2DAsync(dst, pitch, src, w*sizeof(T), w*sizeof(T), h, cudaMemcpyHostToDevice, s);
        else
            cudaMemcpy2D(dst, pitch, src, w*sizeof(T), w*sizeof(T), h, cudaMemcpyHostToDevice);
    }

    void copyToHost(T* dst, bool async=false, cudaStream_t s=0) {
        if (async)
            cudaMemcpy2DAsync(dst, sizeof(T)*w, data, pitch, w*sizeof(T), count*h, cudaMemcpyDeviceToHost, s);
        else
            cudaMemcpy2D(dst, sizeof(T)*w, data, pitch, w*sizeof(T), count*h, cudaMemcpyDeviceToHost);
    }

    void copyToDevice(T* src, bool async=false, cudaStream_t s=0) {
        if (async)
            cudaMemcpy2DAsync(data, pitch, src, w*sizeof(T), w*sizeof(T), count*h, cudaMemcpyHostToDevice, s);
        else
            cudaMemcpy2D(data, pitch, src, w*sizeof(T), w*sizeof(T), count*h, cudaMemcpyHostToDevice);
    }

    void copyToDevice(T* src, int numImages, bool async=false, cudaStream_t s=0) {
        if (async)
            cudaMemcpy2DAsync(data, pitch, src, w*sizeof(T), w*sizeof(T), numImages*h, cudaMemcpyHostToDevice, s);
        else
            cudaMemcpy2D(data, pitch, src, w*sizeof(T), w*sizeof(T), numImages*h, cudaMemcpyHostToDevice);
    }

    void clear() {
        if (data) cudaMemset2D(data, pitch, 0, w*sizeof(T), count*h);
    }
    CUBOTH int totalNumPixels() { return w*h*count; }
    CUBOTH int totalNumBytes() { return w*h*count*sizeof(T); }

    CUBOTH static inline T interp(T a, T b, float x) { return a + (b-a)*x; }

    // Bilinear interpolation; sets 'outside' when (x, y) falls outside the image
    CUBOTH T interpolate(float x, float y, int idx, bool& outside)
    {
        int rx = x, ry = y;

        if (rx < 0 || ry < 0 || rx >= w-1 || ry >= h-1) {
            outside = true;
            return 0.0f;
        }

        T v00 = pixel(rx, ry, idx);
        T v10 = pixel(rx+1, ry, idx);
        T v01 = pixel(rx, ry+1, idx);
        T v11 = pixel(rx+1, ry+1, idx);

        T v0 = interp(v00, v10, x-rx);
        T v1 = interp(v01, v11, x-rx);

        outside = false;
        return interp(v0, v1, y-ry);
    }

    void bind(texture<T, cudaTextureType2D, cudaReadModeElementType>& texref) {
        cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
        cudaBindTexture2D(NULL, &texref, data, &desc, w, h*count, pitch);
    }
    void unbind(texture<T, cudaTextureType2D, cudaReadModeElementType>& texref) {
        cudaUnbindTexture(&texref);
    }

    CUBOTH void computeImagePos(int& x, int& y, int idx)
    {
        y += idx * h;
    }
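
    // Layout note: images are stacked vertically in one pitched allocation, so
    // computeImagePos maps (x, y) of image 'idx' to row idx*h + y. For example,
    // with h = 64, pixel (3, 2) of image 5 lands on row 5*64 + 2 = 322.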

    // Using the texture cache can result in significant speedups
    __device__ T interpolateFromTexture(texture<T, cudaTextureType2D, cudaReadModeElementType> texref, float x, float y, int idx, bool& outside)
    {
        int rx = x, ry = y;

        if (rx < 0 || ry < 0 || rx >= w-1 || ry >= h-1) {
            outside = true;
            return 0.0f;
        }

        computeImagePos(rx, ry, idx);

        float fx = x - floor(x), fy = y - floor(y);
        float u = rx + 0.5f;
        float v = ry + 0.5f;

        T v00 = tex2D(texref, u, v);
        T v10 = tex2D(texref, u+1, v);
        T v01 = tex2D(texref, u, v+1);
        T v11 = tex2D(texref, u+1, v+1);

        T v0 = interp(v00, v10, fx);
        T v1 = interp(v01, v11, fx);

        outside = false;
        return interp(v0, v1, fy);
    }
};
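
// Usage sketch (hypothetical host code, not part of the original header): allocate
// a list of 100 64x64 float images, upload them from a host buffer, and free them.
//
//   std::vector<float> host(64*64*100);
//   cudaImageList<float> images = cudaImageList<float>::alloc(64, 64, 100);
//   images.copyToDevice(host.data());   // one 2D copy covers the whole list
//   ...
//   images.free();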

// 4D image, implemented as a set of layers, each containing a grid of 2D images.
template<typename T>
struct Image4DCudaArray
{
    cudaArray_t array;
    int imgw, imgh;
    int layerw, layerh; // layer width/height in images (in pixels: layerw*imgw by layerh*imgh)
    int nlayers;
    int numImg; // images per layer

    // CUDA 3D arrays take their width in elements; linear memory uses width in bytes.
    // http://stackoverflow.com/questions/10611451/how-to-use-make-cudaextent-to-define-a-cudaextent-correctly
    cudaExtent getExtent() {
        return make_cudaExtent(imgw * layerw, imgh * layerh, nlayers);
    }

    // Properties to be passed to kernels
    struct KernelInst {
        int imgw, imgh;
        int layerw;

        CUBOTH int2 getImagePos(int image)
        {
            return make_int2(imgw * (image % layerw), imgh * (image / layerw));
        }

        __device__ T readSurfacePixel(surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int z)
        {
            T r;
            surf2DLayeredread(&r, surf, sizeof(T)*x, y, z, cudaBoundaryModeTrap);
            return r;
        }

        __device__ void writeSurfacePixel(surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int z, T value)
        {
            surf2DLayeredwrite(value, surf, sizeof(T)*x, y, z, cudaBoundaryModeTrap);
        }
    };
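
    // Kernel-side sketch (an assumption, not part of the original file): reads one
    // pixel of image 'img' at layer 'layer' through KernelInst. 'my_surface' is a
    // hypothetical surface reference bound beforehand with bind(); the project
    // declares a similar 'image_lut_surface' in Kernels.h.
    //
    //   surface<void, cudaSurfaceType2DLayered> my_surface;
    //
    //   __global__ void samplePixel(Image4DCudaArray<float>::KernelInst inst,
    //                               int img, int layer, float* out)
    //   {
    //       int2 p = inst.getImagePos(img); // top-left corner of 'img' in its layer
    //       *out = inst.readSurfacePixel(my_surface, p.x, p.y, layer);
    //   }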

    KernelInst kernelInst() {
        KernelInst inst;
        inst.imgw = imgw; inst.imgh = imgh;
        inst.layerw = layerw;
        return inst;
    }

    void bind(texture<T, cudaTextureType2DLayered, cudaReadModeElementType>& texref) {
        cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
        CheckCUDAError( cudaBindTextureToArray(texref, array, &desc) );
    }
    void unbind(texture<T, cudaTextureType2DLayered, cudaReadModeElementType>& texref) {
        cudaUnbindTexture(texref);
    }

    void bind(surface<void, cudaSurfaceType2DLayered>& surf) {
        cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
        CheckCUDAError( cudaBindSurfaceToArray(surf, array) );
    }
    // there is no unbind for surfaces
    // void unbind(surface<void, cudaSurfaceType2DLayered>& surf) { }

    Image4DCudaArray(int sx, int sy, int numImg, int sL) {
        array = 0;
        int d;
        cudaGetDevice(&d);
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, d);

        imgw = sx;
        imgh = sy;
        this->numImg = numImg;

        // layerh = (int)(prop.maxSurface2DLayered[1] / imgh);
        layerh = 2048 / imgh;                    // image rows per layer
        layerw = (numImg + layerh - 1) / layerh; // image columns per layer
        nlayers = sL;

        dbgprintf("creating image4D: %d layers of %d x %d images of %d x %d (%dx%dx%d)\n",
            sL, layerw, layerh, imgw, imgh, getExtent().width, getExtent().height, getExtent().depth);

        cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
        cudaError_t err = cudaMalloc3DArray(&array, &desc, getExtent(), cudaArrayLayered | cudaArraySurfaceLoadStore);
        //cudaError_t err = cudaMalloc3DArray(&array, &desc, getExtent(), cudaArraySurfaceLoadStore);
        if (err != cudaSuccess) {
            throw std::bad_alloc(SPrintf("CUDA error during cudaSurf2DList(): %s", cudaGetErrorString(err)).c_str());
        }
    }
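
    // Grid example for the computation above: with 64x64 images, layerh = 2048/64 = 32
    // images fit vertically per layer; for numImg = 100 that gives
    // layerw = (100+31)/32 = 4, so each layer spans 4*64 x 32*64 = 256 x 2048 pixels.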

    int2 getImagePos(int image) {
        int2 pos = { imgw * (image % layerw), imgh * (image / layerw) };
        return pos;
    }

    ~Image4DCudaArray() {
        free();
    }

    void copyToDevice(T* src, bool async=false, cudaStream_t s=0)
    {
        for (int L=0; L<nlayers; L++) {
            for (int i=0; i<numImg; i++)
                copyImageToDevice(i, L, &src[ imgw * imgh * (numImg * L + i) ], async, s);
        }
    }

    void copyToHost(T* dst, bool async=false, cudaStream_t s=0)
    {
        for (int L=0; L<nlayers; L++) {
            for (int i=0; i<numImg; i++)
                copyImageToHost(i, L, &dst[ imgw * imgh * (numImg * L + i) ], async, s);
        }
    }

    void clear()
    {
        // create a black image in device memory and use it to clear all the layers
        T* d;
        size_t srcpitch;
        CheckCUDAError( cudaMallocPitch(&d, &srcpitch, sizeof(T)*imgw, imgh) );
        CheckCUDAError( cudaMemset2D(d, srcpitch, 0, sizeof(T)*imgw, imgh) );

        cudaMemcpy3DParms p = {0};
        p.dstArray = array;
        p.extent = make_cudaExtent(imgw, imgh, 1);
        p.kind = cudaMemcpyDeviceToDevice;
        p.srcPtr = make_cudaPitchedPtr(d, srcpitch, sizeof(T)*imgw, imgh);
        for (int l=0; l<nlayers; l++)
            for (int img=0; img<numImg; img++) {
                int2 imgpos = getImagePos(img);
                p.dstPos.z = l;
                p.dstPos.x = imgpos.x;
                p.dstPos.y = imgpos.y;
                CheckCUDAError( cudaMemcpy3D(&p) );
            }
        CheckCUDAError( cudaFree(d) );
    }

    // Copy a single subimage to the host
    void copyImageToHost(int img, int layer, T* dst, bool async=false, cudaStream_t s=0)
    {
        // According to the CUDA docs:
        // The extent field defines the dimensions of the transferred area in elements.
        // If a CUDA array is participating in the copy, the extent is defined in terms of that array's elements.
        // If no CUDA array is participating in the copy then the extents are defined in elements of unsigned char.
        cudaMemcpy3DParms p = {0};
        p.srcArray = array;
        p.extent = make_cudaExtent(imgw, imgh, 1);
        p.kind = cudaMemcpyDeviceToHost;
        p.srcPos.z = layer;
        int2 imgpos = getImagePos(img);
        p.srcPos.x = imgpos.x;
        p.srcPos.y = imgpos.y;
        p.dstPtr = make_cudaPitchedPtr(dst, sizeof(T)*imgw, sizeof(T)*imgw, imgh);
        if (async)
            CheckCUDAError( cudaMemcpy3DAsync(&p, s) );
        else
            CheckCUDAError( cudaMemcpy3D(&p) );
    }

    // Copy a single subimage to the device
    void copyImageToDevice(int img, int layer, T* src, bool async=false, cudaStream_t s=0)
    {
        cudaMemcpy3DParms p = {0};
        p.dstArray = array;
        int2 imgpos = getImagePos(img);

        // srcPos and dstPos are optional offsets into the source and destination objects,
        // defined in units of each object's elements. The element for a host or device
        // pointer is assumed to be unsigned char. For CUDA arrays, positions must be in
        // the range [0, 2048) for any dimension.
        p.dstPos.z = layer;
        p.dstPos.x = imgpos.x;
        p.dstPos.y = imgpos.y;
        p.extent = make_cudaExtent(imgw, imgh, 1);
        p.kind = cudaMemcpyHostToDevice;
        p.srcPtr = make_cudaPitchedPtr(src, sizeof(T)*imgw, sizeof(T)*imgw, imgh);
        if (async)
            CheckCUDAError( cudaMemcpy3DAsync(&p, s) );
        else
            CheckCUDAError( cudaMemcpy3D(&p) );
    }

    void free() {
        if (array) {
            CheckCUDAError( cudaFreeArray(array) );
            array = 0;
        }
    }
};
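
// Usage sketch (hypothetical, assumes an initialized CUDA device): a 4D stack of
// 10 layers, each holding 50 images of 64x64 floats, cleared on the device and
// then filled from host memory.
//
//   Image4DCudaArray<float> img4d(64, 64, 50, 10);
//   img4d.clear();
//   std::vector<float> host(64*64*50*10, 1.0f);
//   img4d.copyToDevice(host.data());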

// 4D image stored in regular pitched device memory, so no cudaArray or surface binding is needed.
template<typename T>
class Image4DMemory
{
public:
    struct KernelParams {
        T* d_data;
        size_t pitch;
        int cols;
        int depth;
        int imgw, imgh;

        CUBOTH int2 GetImagePos(int z, int l) {
            int img = z + depth*l;
            return make_int2( (img % cols) * imgw, (img / cols) * imgh );
        }
    };

    KernelParams kp;
    int layers, totalImg;
    int rows;

    Image4DMemory(int w, int h, int d, int L) {
        kp.imgh = h;
        kp.imgw = w;
        kp.depth = d;
        layers = L;
        totalImg = d*L;

        rows = 2048 / kp.imgh;
        kp.cols = (totalImg + rows - 1) / rows;

        CheckCUDAError( cudaMallocPitch(&kp.d_data, &kp.pitch, sizeof(T) * kp.cols * kp.imgw, rows * kp.imgh) );
    }

    ~Image4DMemory() {
        free();
    }
    void free() {
        if (kp.d_data) cudaFree(kp.d_data);
        kp.d_data = 0;
    }

    void copyToDevice(T* src, bool async=false, cudaStream_t s=0)
    {
        for (int L=0; L<layers; L++) {
            for (int i=0; i<kp.depth; i++)
                copyImageToDevice(i, L, &src[ kp.imgw * kp.imgh * (kp.depth * L + i) ], async, s);
        }
    }

    void copyToHost(T* dst, bool async=false, cudaStream_t s=0)
    {
        for (int L=0; L<layers; L++) {
            for (int i=0; i<kp.depth; i++)
                copyImageToHost(i, L, &dst[ kp.imgw * kp.imgh * (kp.depth * L + i) ], async, s);
        }
    }

    void clear()
    {
        cudaMemset2D(kp.d_data, kp.pitch, 0, sizeof(T)*(kp.cols*kp.imgw), rows*kp.imgh);
    }

    T* getImgAddr(int2 imgpos)
    {
        char* d = (char*)kp.d_data;
        d += imgpos.y * kp.pitch;
        return &((T*)d)[imgpos.x];
    }

    // Copy a single subimage to the host
    void copyImageToHost(int z, int l, T* dst, bool async=false, cudaStream_t s=0)
    {
        int2 imgpos = kp.GetImagePos(z, l);
        if (async)
            cudaMemcpy2DAsync(dst, sizeof(T)*kp.imgw, getImgAddr(imgpos), kp.pitch, kp.imgw * sizeof(T), kp.imgh, cudaMemcpyDeviceToHost, s);
        else
            cudaMemcpy2D(dst, sizeof(T)*kp.imgw, getImgAddr(imgpos), kp.pitch, kp.imgw * sizeof(T), kp.imgh, cudaMemcpyDeviceToHost);
    }

    // Copy a single subimage to the device
    void copyImageToDevice(int z, int l, T* src, bool async=false, cudaStream_t s=0)
    {
        int2 imgpos = kp.GetImagePos(z, l);
        if (async)
            cudaMemcpy2DAsync(getImgAddr(imgpos), kp.pitch, src, sizeof(T)*kp.imgw, sizeof(T)*kp.imgw, kp.imgh, cudaMemcpyHostToDevice, s);
        else
            cudaMemcpy2D(getImgAddr(imgpos), kp.pitch, src, sizeof(T)*kp.imgw, sizeof(T)*kp.imgw, kp.imgh, cudaMemcpyHostToDevice);
    }

    // no binding required
    KernelParams bind() { return kp; }
    void unbind() {}

    static __device__ T read(const KernelParams& kp, int x, int y, int2 imgpos) {
        return ((T*)( (char*)kp.d_data + (y + imgpos.y) * kp.pitch ))[ x + imgpos.x ];
    }
    static __device__ void write(T value, const KernelParams& kp, int x, int y, int2 imgpos) {
        ((T*)( (char*)kp.d_data + (y + imgpos.y) * kp.pitch ))[ x + imgpos.x ] = value;
    }
};
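
// Device-side sketch (an assumption, not from this file): KernelParams is passed
// by value to a kernel, which addresses image (z, l) via GetImagePos and the
// static read/write helpers.
//
//   __global__ void addOne(Image4DMemory<float>::KernelParams kp, int z, int l)
//   {
//       int x = threadIdx.x, y = threadIdx.y;
//       int2 pos = kp.GetImagePos(z, l);
//       float v = Image4DMemory<float>::read(kp, x, y, pos);
//       Image4DMemory<float>::write(v + 1.0f, kp, x, y, pos);
//   }
//
// A host would launch it as, e.g.: addOne<<<1, dim3(w, h)>>>(img4d.bind(), 0, 0);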