37 imgl.
w =
w; imgl.
h =
h;
41 throw std::bad_alloc(
SPrintf(
"cudaImageListf<%s> alloc %dx%dx%d failed",
typeid(T).name(), w, h, amount).c_str());
52 return (T*)(((
char*)data) + pitch*h*i);
56 if (x < 0 || x >= w || y < 0 || y >= h)
60 T* row = (T*) ( (
char*)data + y*
pitch );
66 T* row = (T*) ( (
char*)data + y*
pitch );
73 T* row = (T*) ( (
char*)data + y*
pitch );
81 return center.x + radius >= w ||
82 center.x - radius < 0 ||
83 center.y + radius >= h ||
84 center.y - radius < 0;
101 cudaMemcpy2DAsync(dst,
sizeof(T)*w, src, pitch, w*
sizeof(T), h, cudaMemcpyDeviceToHost, s);
103 cudaMemcpy2D(dst,
sizeof(T)*w, src, pitch, w*
sizeof(T), h, cudaMemcpyDeviceToHost);
110 cudaMemcpy2DAsync(dst, pitch, src, w*
sizeof(T), w*
sizeof(T), h, cudaMemcpyHostToDevice, s);
112 cudaMemcpy2D(dst, pitch, src, w*
sizeof(T), w*
sizeof(T), h, cudaMemcpyHostToDevice);
115 void copyToHost(T* dst,
bool async=
false, cudaStream_t s=0) {
117 cudaMemcpy2DAsync(dst,
sizeof(T)*w, data, pitch, w*
sizeof(T), count*h, cudaMemcpyDeviceToHost, s);
119 cudaMemcpy2D(dst,
sizeof(T)*w, data, pitch, w*
sizeof(T), count*h, cudaMemcpyDeviceToHost);
124 cudaMemcpy2DAsync(data, pitch, src, w*
sizeof(T), w*
sizeof(T), count*h, cudaMemcpyHostToDevice, s);
126 cudaMemcpy2D(data, pitch, src, w*
sizeof(T), w*
sizeof(T), count*h, cudaMemcpyHostToDevice);
129 void copyToDevice(T* src,
int numImages,
bool async=
false, cudaStream_t s=0) {
131 cudaMemcpy2DAsync(data, pitch, src, w*
sizeof(T), w*
sizeof(T), numImages*h, cudaMemcpyHostToDevice, s);
133 cudaMemcpy2D(data, pitch, src, w*
sizeof(T), w*
sizeof(T), numImages*h, cudaMemcpyHostToDevice);
137 if(data) cudaMemset2D(data, pitch, 0, w*
sizeof(T), count*h);
143 CUBOTH static inline T
interp(T a, T b,
float x) {
return a + (b-a)*x; }
149 if (rx < 0 || ry < 0 || rx >= w-1 || ry >= h-1) {
154 T v00 =
pixel(rx, ry, idx);
155 T v10 =
pixel(rx+1, ry, idx);
156 T v01 =
pixel(rx, ry+1, idx);
157 T v11 =
pixel(rx+1, ry+1, idx);
159 T v0 =
interp (v00, v10, x-rx);
160 T v1 =
interp (v01, v11, x-rx);
163 return interp (v0, v1, y-ry);
166 void bind(texture<T, cudaTextureType2D, cudaReadModeElementType>& texref) {
167 cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
168 cudaBindTexture2D(NULL, &texref, data, &desc, w, h*count, pitch);
170 void unbind(texture<T, cudaTextureType2D, cudaReadModeElementType>& texref) {
171 cudaUnbindTexture(&texref);
180 __device__ T
interpolateFromTexture(texture<T, cudaTextureType2D, cudaReadModeElementType> texref,
float x,
float y,
int idx,
bool& outside)
184 if (rx < 0 || ry < 0 || rx >= w-1 || ry >= h-1) {
191 float fx=x-floor(x), fy = y-floor(y);
195 T v00 = tex2D(texref, u, v);
196 T v10 = tex2D(texref, u+1, v);
197 T v01 = tex2D(texref, u, v+1);
198 T v11 = tex2D(texref, u+1, v+1);
200 T v0 =
interp (v00, v10, fx);
201 T v1 =
interp (v01, v11, fx);
204 return interp (v0, v1, fy);
223 return make_cudaExtent(imgw * layerw, imgh * layerh, nlayers);
233 return make_int2(imgw * (image % layerw), imgh * (image / layerw));
236 __device__ T
readSurfacePixel(surface<void, cudaSurfaceType2DLayered> surf,
int x,
int y,
int z)
239 surf2DLayeredread (&r,
image_lut_surface,
sizeof(T)*x, y, z, cudaBoundaryModeTrap);
243 __device__
void writeSurfacePixel(surface<void, cudaSurfaceType2DLayered> surf,
int x,
int y,
int z, T value)
245 surf2DLayeredwrite(value, surf,
sizeof(T)*x, y, z, cudaBoundaryModeTrap);
256 void bind(texture<T, cudaTextureType2DLayered, cudaReadModeElementType>& texref) {
257 cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
260 void unbind(texture<T, cudaTextureType2DLayered, cudaReadModeElementType>& texref) {
261 cudaUnbindTexture(texref);
264 void bind(surface<void, cudaSurfaceType2DLayered>& surf) {
265 cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
276 cudaGetDeviceProperties(&prop, d);
280 this->numImg = numImg;
283 layerh = 2048 / imgh;
284 layerw = (numImg + layerh - 1) / layerh;
287 dbgprintf(
"creating image4D: %d layers of %d x %d images of %d x %d (%dx%dx%d)\n",
288 sL, layerw, layerh, imgw, imgh, getExtent().width,getExtent().height,getExtent().depth);
290 cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
291 cudaError_t err = cudaMalloc3DArray(&array, &desc, getExtent(), cudaArrayLayered | cudaArraySurfaceLoadStore);
293 if (err != cudaSuccess) {
294 throw std::bad_alloc(
SPrintf(
"CUDA error during cudaSurf2DList(): %s", cudaGetErrorString(err)).c_str());
299 int2 pos = { imgw * ( image % layerw ), imgh * ( image / layerw ) };
309 for (
int L=0;L<nlayers;L++) {
310 for (
int i=0;i<numImg;i++)
317 for (
int L=0;L<nlayers;L++) {
318 for (
int i=0;i<numImg;i++)
319 copyImageToHost(i, L, &dst[ imgw * imgh * ( numImg * L + i ) ], async, s);
328 CheckCUDAError( cudaMallocPitch(&d, &srcpitch,
sizeof(T)*imgw, imgh) );
329 CheckCUDAError( cudaMemset2D(d, srcpitch, 0,
sizeof(T)*imgw, imgh) );
331 cudaMemcpy3DParms p = {0};
333 p.extent = make_cudaExtent(imgw,imgh,1);
334 p.kind = cudaMemcpyDeviceToDevice;
335 p.srcPtr = make_cudaPitchedPtr(d, srcpitch,
sizeof(T)*imgw, imgh);
336 for (
int l=0;l<nlayers;l++)
337 for (
int img=0;img<numImg;img++) {
338 int2 imgpos = getImagePos(img);
340 p.dstPos.x = imgpos.x;
341 p.dstPos.y = imgpos.y;
355 cudaMemcpy3DParms p = {0};
357 p.extent = make_cudaExtent(imgw,imgh,1);
358 p.kind = cudaMemcpyDeviceToHost;
360 int2 imgpos = getImagePos(img);
361 p.srcPos.x = imgpos.x;
362 p.srcPos.y = imgpos.y;
363 p.dstPtr = make_cudaPitchedPtr(dst,
sizeof(T)*imgw,
sizeof(T)*imgw, imgh);
375 cudaMemcpy3DParms p = {0};
377 int2 imgpos = getImagePos(img);
382 p.dstPos.x = imgpos.x;
383 p.dstPos.y = imgpos.y;
384 p.extent = make_cudaExtent(imgw,imgh,1);
385 p.kind = cudaMemcpyHostToDevice;
386 p.srcPtr = make_cudaPitchedPtr(src,
sizeof(T)*imgw,
sizeof(T)*imgw, imgh);
416 return make_int2( (img % cols) * imgw, (img / cols) * imgh);
431 rows = 2048 / kp.
imgh;
432 kp.
cols = (totalImg + rows - 1) / rows;
447 for (
int L=0;L<layers;L++) {
448 for (
int i=0;i<kp.
depth;i++)
455 for (
int L=0;L<layers;L++) {
456 for (
int i=0;i<kp.
depth;i++)
469 char* d = (
char*)kp.
d_data;
470 d += imgpos.y * kp.
pitch;
471 return &((
float*)d)[imgpos.x];
479 cudaMemcpy2DAsync(dst,
sizeof(T)*kp.
imgw, getImgAddr(imgpos), kp.
pitch, kp.
imgw *
sizeof(T), kp.
imgh, cudaMemcpyDeviceToHost, s);
481 cudaMemcpy2D(dst,
sizeof(T)*kp.
imgw, getImgAddr(imgpos), kp.
pitch, kp.
imgw *
sizeof(T), kp.
imgh, cudaMemcpyDeviceToHost);
488 cudaMemcpy2DAsync(getImgAddr(imgpos), kp.
pitch, src,
sizeof(T)*kp.
imgw,
sizeof(T)*kp.
imgw, kp.
imgh, cudaMemcpyHostToDevice, s);
490 cudaMemcpy2D(getImgAddr(imgpos), kp.
pitch, src,
sizeof(T)*kp.
imgw,
sizeof(T)*kp.
imgw, kp.
imgh, cudaMemcpyHostToDevice);
498 return ((T*)( (
char*)kp.
d_data + (y + imgpos.y) * kp.
pitch))[ x + imgpos.x ];
501 ((T*)( (
char*)kp.
d_data + (y + imgpos.y) * kp.
pitch)) [ x + imgpos.x ] = value;
static __device__ T read(const KernelParams &kp, int x, int y, int2 imgpos)
void copyImageToHost(int img, T *dst, bool async=false, cudaStream_t s=0)
void copyToDevice(T *src, bool async=false, cudaStream_t s=0)
void copyToDevice(T *src, bool async=false, cudaStream_t s=0)
float * getImgAddr(int2 imgpos)
static __device__ void write(T value, const KernelParams &kp, int x, int y, int2 imgpos)
void unbind(texture< T, cudaTextureType2DLayered, cudaReadModeElementType > &texref)
static cudaImageList< T > alloc(int w, int h, int amount)
void copyImageToDevice(int img, int layer, T *src, bool async=false, cudaStream_t s=0)
CUBOTH int totalNumPixels()
Stores a large number of small images into a single large memory space. Optimizes GPU memory copies...
CUBOTH void computeImagePos(int &x, int &y, int idx)
void copyImageToDevice(int img, T *src, bool async=false, cudaStream_t s=0)
void bind(texture< T, cudaTextureType2D, cudaReadModeElementType > &texref)
void bind(texture< T, cudaTextureType2DLayered, cudaReadModeElementType > &texref)
void copyImageToDevice(int z, int l, T *src, bool async=false, cudaStream_t s=0)
void CheckCUDAError(cufftResult_t err)
void copyToDevice(T *src, int numImages, bool async=false, cudaStream_t s=0)
__device__ T readSurfacePixel(surface< void, cudaSurfaceType2DLayered > surf, int x, int y, int z)
void copyImageToHost(int img, int layer, T *dst, bool async=false, cudaStream_t s=0)
surface< void, cudaSurfaceType2DLayered > image_lut_surface
void bind(surface< void, cudaSurfaceType2DLayered > &surf)
int2 getImagePos(int image)
CUBOTH int totalNumBytes()
void allocateHostImageBuffer(pinned_array< T, Flags > &hostImgBuf)
#define CUBOTH
Kernel is callable from both device and host code.
__device__ void writeSurfacePixel(surface< void, cudaSurfaceType2DLayered > surf, int x, int y, int z, T value)
void copyToHost(T *dst, bool async=false, cudaStream_t s=0)
void dbgprintf(const char *fmt,...)
CUBOTH T pixel_oobcheck(int x, int y, int imgIndex, T border=0.0f)
CUBOTH T interpolate(float x, float y, int idx, bool &outside)
CUBOTH T & pixel(int x, int y, int imgIndex)
CUBOTH T * pixelAddress(int x, int y, int imgIndex)
CUBOTH int2 GetImagePos(int z, int l)
static CUBOTH T interp(T a, T b, float x)
void copyToDevice(T *src, bool async=false, cudaStream_t s=0)
void unbind(texture< T, cudaTextureType2D, cudaReadModeElementType > &texref)
void copyImageToHost(int z, int l, T *dst, bool async=false, cudaStream_t s=0)
void copyToHost(T *dst, bool async=false, cudaStream_t s=0)
__device__ T interpolateFromTexture(texture< T, cudaTextureType2D, cudaReadModeElementType > texref, float x, float y, int idx, bool &outside)
Image4DCudaArray(int sx, int sy, int numImg, int sL)
CUBOTH bool boundaryHit(float2 center, float radius)
CUBOTH int2 getImagePos(int image)
void copyToHost(T *dst, bool async=false, cudaStream_t s=0)
std::string SPrintf(const char *fmt,...)
Image4DMemory(int w, int h, int d, int L)
static cudaImageList< T > emptyList()