37         imgl.
w = 
w; imgl.
h = 
h;
    41             throw std::bad_alloc(
SPrintf(
"cudaImageListf<%s> alloc %dx%dx%d failed", 
typeid(T).name(), w, h, amount).c_str());
    52         return (T*)(((
char*)data) + pitch*h*i);
    56         if (x < 0 || x >= w || y < 0 || y >= h)
    60         T* row = (T*) ( (
char*)data + y*
pitch );
    66         T* row = (T*) ( (
char*)data + y*
pitch );
    73         T* row = (T*) ( (
char*)data + y*
pitch );
    81         return center.x + radius >= w ||
    82             center.x - radius < 0 ||
    83             center.y + radius >= h ||
    84             center.y - radius < 0;
   101             cudaMemcpy2DAsync(dst, 
sizeof(T)*w, src, pitch, w*
sizeof(T), h, cudaMemcpyDeviceToHost, s);
   103             cudaMemcpy2D(dst, 
sizeof(T)*w, src, pitch, w*
sizeof(T), h, cudaMemcpyDeviceToHost);
   110             cudaMemcpy2DAsync(dst, pitch, src, w*
sizeof(T), w*
sizeof(T), h, cudaMemcpyHostToDevice, s);
   112             cudaMemcpy2D(dst, pitch, src, w*
sizeof(T), w*
sizeof(T), h, cudaMemcpyHostToDevice);
   115     void copyToHost(T* dst, 
bool async=
false, cudaStream_t s=0) {
   117             cudaMemcpy2DAsync(dst, 
sizeof(T)*w, data, pitch, w*
sizeof(T), count*h, cudaMemcpyDeviceToHost, s);
   119             cudaMemcpy2D(dst, 
sizeof(T)*w, data, pitch, w*
sizeof(T), count*h, cudaMemcpyDeviceToHost);
   124             cudaMemcpy2DAsync(data, pitch, src, w*
sizeof(T), w*
sizeof(T), count*h, cudaMemcpyHostToDevice, s);
   126             cudaMemcpy2D(data, pitch, src, w*
sizeof(T), w*
sizeof(T), count*h, cudaMemcpyHostToDevice);
   129     void copyToDevice(T* src, 
int numImages, 
bool async=
false, cudaStream_t s=0) {
   131             cudaMemcpy2DAsync(data, pitch, src, w*
sizeof(T), w*
sizeof(T), numImages*h, cudaMemcpyHostToDevice, s);
   133             cudaMemcpy2D(data, pitch, src, w*
sizeof(T), w*
sizeof(T), numImages*h, cudaMemcpyHostToDevice);
   137         if(data) cudaMemset2D(data, pitch, 0, w*
sizeof(T), count*h);
   143     CUBOTH static inline T 
interp(T a, T b, 
float x) { 
return a + (b-a)*x; }
   149         if (rx < 0 || ry < 0 || rx >= w-1 || ry >= h-1) {
   154         T v00 = 
pixel(rx, ry, idx);
   155         T v10 = 
pixel(rx+1, ry, idx);
   156         T v01 = 
pixel(rx, ry+1, idx);
   157         T v11 = 
pixel(rx+1, ry+1, idx);
   159         T v0 = 
interp (v00, v10, x-rx);
   160         T v1 = 
interp (v01, v11, x-rx);
   163         return interp (v0, v1, y-ry);
   166     void bind(texture<T, cudaTextureType2D, cudaReadModeElementType>& texref) {
   167         cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
   168         cudaBindTexture2D(NULL, &texref, data, &desc, w, h*count, pitch);
   170     void unbind(texture<T, cudaTextureType2D, cudaReadModeElementType>& texref) {
   171         cudaUnbindTexture(&texref);
   180     __device__ T 
interpolateFromTexture(texture<T, cudaTextureType2D, cudaReadModeElementType> texref, 
float x,
float y, 
int idx, 
bool& outside)
   184         if (rx < 0 || ry < 0 || rx >= w-1 || ry >= h-1) {
   191         float fx=x-floor(x), fy = y-floor(y);
   195         T v00 = tex2D(texref, u, v);
   196         T v10 = tex2D(texref, u+1, v);
   197         T v01 = tex2D(texref, u, v+1);
   198         T v11 = tex2D(texref, u+1, v+1);
   200         T v0 = 
interp (v00, v10, fx);
   201         T v1 = 
interp (v01, v11, fx);
   204         return interp (v0, v1, fy);
   223         return make_cudaExtent(imgw * layerw, imgh * layerh, nlayers);
   233             return make_int2(imgw * (image % layerw), imgh * (image / layerw));
   236         __device__ T 
readSurfacePixel(surface<void, cudaSurfaceType2DLayered> surf, 
int x, 
int y,
int z)
   239             surf2DLayeredread (&r, 
image_lut_surface, 
sizeof(T)*x, y, z, cudaBoundaryModeTrap);
   243         __device__ 
void writeSurfacePixel(surface<void, cudaSurfaceType2DLayered> surf, 
int x,
int y,
int z, T value)
   245             surf2DLayeredwrite(value, surf, 
sizeof(T)*x, y, z, cudaBoundaryModeTrap);
   256     void bind(texture<T, cudaTextureType2DLayered, cudaReadModeElementType>& texref) {
   257         cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
   260     void unbind(texture<T, cudaTextureType2DLayered, cudaReadModeElementType>& texref) {
   261         cudaUnbindTexture(texref);
   264     void bind(surface<void, cudaSurfaceType2DLayered>& surf) {
   265         cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
   276         cudaGetDeviceProperties(&prop, d);
   280         this->numImg = numImg;
   283         layerh = 2048 / imgh;
   284         layerw = (numImg + layerh - 1) / layerh;
   287         dbgprintf(
"creating image4D: %d layers of %d x %d images of %d x %d (%dx%dx%d)\n", 
   288             sL, layerw, layerh, imgw, imgh, getExtent().width,getExtent().height,getExtent().depth);
   290         cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
   291         cudaError_t err = cudaMalloc3DArray(&array, &desc, getExtent(), cudaArrayLayered | cudaArraySurfaceLoadStore);
   293         if (err != cudaSuccess) {
   294             throw std::bad_alloc(
SPrintf(
"CUDA error during cudaSurf2DList(): %s", cudaGetErrorString(err)).c_str());
   299         int2 pos = { imgw * ( image % layerw ), imgh * ( image / layerw ) };
   309         for (
int L=0;L<nlayers;L++) {
   310             for (
int i=0;i<numImg;i++)
   317         for (
int L=0;L<nlayers;L++) {
   318             for (
int i=0;i<numImg;i++)
   319                 copyImageToHost(i, L, &dst[ imgw * imgh * ( numImg * L + i ) ], async, s);
   328         CheckCUDAError( cudaMallocPitch(&d, &srcpitch, 
sizeof(T)*imgw, imgh) );
   329         CheckCUDAError( cudaMemset2D(d, srcpitch, 0, 
sizeof(T)*imgw, imgh) );
   331         cudaMemcpy3DParms p = {0};
   333         p.extent = make_cudaExtent(imgw,imgh,1);
   334         p.kind = cudaMemcpyDeviceToDevice;
   335         p.srcPtr = make_cudaPitchedPtr(d, srcpitch, 
sizeof(T)*imgw, imgh);
   336         for (
int l=0;l<nlayers;l++)
   337             for (
int img=0;img<numImg;img++) {
   338                 int2 imgpos = getImagePos(img);
   340                 p.dstPos.x = imgpos.x;
   341                 p.dstPos.y = imgpos.y;
   355         cudaMemcpy3DParms p = {0};
   357         p.extent = make_cudaExtent(imgw,imgh,1);
   358         p.kind = cudaMemcpyDeviceToHost;
   360         int2 imgpos = getImagePos(img);
   361         p.srcPos.x = imgpos.x;
   362         p.srcPos.y = imgpos.y;
   363         p.dstPtr = make_cudaPitchedPtr(dst, 
sizeof(T)*imgw, 
sizeof(T)*imgw, imgh);
   375         cudaMemcpy3DParms p = {0};
   377         int2 imgpos = getImagePos(img);
   382         p.dstPos.x = imgpos.x;
   383         p.dstPos.y = imgpos.y;
   384         p.extent = make_cudaExtent(imgw,imgh,1);
   385         p.kind = cudaMemcpyHostToDevice;
   386         p.srcPtr = make_cudaPitchedPtr(src, 
sizeof(T)*imgw, 
sizeof(T)*imgw, imgh);
   416             return make_int2( (img % cols) * imgw, (img / cols) * imgh);
   431         rows = 2048 / kp.
imgh;
   432         kp.
cols = (totalImg + rows - 1) / rows;
   447         for (
int L=0;L<layers;L++) {
   448             for (
int i=0;i<kp.
depth;i++)
   455         for (
int L=0;L<layers;L++) {
   456             for (
int i=0;i<kp.
depth;i++)
   469         char* d = (
char*)kp.
d_data;
   470         d += imgpos.y * kp.
pitch;
   471         return &((
float*)d)[imgpos.x];
   479             cudaMemcpy2DAsync(dst, 
sizeof(T)*kp.
imgw, getImgAddr(imgpos), kp.
pitch, kp.
imgw * 
sizeof(T), kp.
imgh, cudaMemcpyDeviceToHost, s);
   481             cudaMemcpy2D(dst, 
sizeof(T)*kp.
imgw, getImgAddr(imgpos), kp.
pitch, kp.
imgw * 
sizeof(T), kp.
imgh, cudaMemcpyDeviceToHost);
   488             cudaMemcpy2DAsync(getImgAddr(imgpos), kp.
pitch, src, 
sizeof(T)*kp.
imgw, 
sizeof(T)*kp.
imgw, kp.
imgh, cudaMemcpyHostToDevice, s);
   490             cudaMemcpy2D(getImgAddr(imgpos), kp.
pitch, src, 
sizeof(T)*kp.
imgw, 
sizeof(T)*kp.
imgw, kp.
imgh, cudaMemcpyHostToDevice);
   498         return ((T*)( (
char*)kp.
d_data + (y + imgpos.y) * kp.
pitch))[ x + imgpos.x ];
   501         ((T*)( (
char*)kp.
d_data + (y + imgpos.y) * kp.
pitch)) [ x + imgpos.x ] = value;
 
static __device__ T read(const KernelParams &kp, int x, int y, int2 imgpos)
 
void copyImageToHost(int img, T *dst, bool async=false, cudaStream_t s=0)
 
void copyToDevice(T *src, bool async=false, cudaStream_t s=0)
 
void copyToDevice(T *src, bool async=false, cudaStream_t s=0)
 
float * getImgAddr(int2 imgpos)
 
static __device__ void write(T value, const KernelParams &kp, int x, int y, int2 imgpos)
 
void unbind(texture< T, cudaTextureType2DLayered, cudaReadModeElementType > &texref)
 
static cudaImageList< T > alloc(int w, int h, int amount)
 
void copyImageToDevice(int img, int layer, T *src, bool async=false, cudaStream_t s=0)
 
CUBOTH int totalNumPixels()
 
Stores a large number of small images into a single large memory space. Optimizes GPU memory copies...
 
CUBOTH void computeImagePos(int &x, int &y, int idx)
 
void copyImageToDevice(int img, T *src, bool async=false, cudaStream_t s=0)
 
void bind(texture< T, cudaTextureType2D, cudaReadModeElementType > &texref)
 
void bind(texture< T, cudaTextureType2DLayered, cudaReadModeElementType > &texref)
 
void copyImageToDevice(int z, int l, T *src, bool async=false, cudaStream_t s=0)
 
void CheckCUDAError(cufftResult_t err)
 
void copyToDevice(T *src, int numImages, bool async=false, cudaStream_t s=0)
 
__device__ T readSurfacePixel(surface< void, cudaSurfaceType2DLayered > surf, int x, int y, int z)
 
void copyImageToHost(int img, int layer, T *dst, bool async=false, cudaStream_t s=0)
 
surface< void, cudaSurfaceType2DLayered > image_lut_surface
 
void bind(surface< void, cudaSurfaceType2DLayered > &surf)
 
int2 getImagePos(int image)
 
CUBOTH int totalNumBytes()
 
void allocateHostImageBuffer(pinned_array< T, Flags > &hostImgBuf)
 
#define CUBOTH
Kernel is callable from both device and host code. 
 
__device__ void writeSurfacePixel(surface< void, cudaSurfaceType2DLayered > surf, int x, int y, int z, T value)
 
void copyToHost(T *dst, bool async=false, cudaStream_t s=0)
 
void dbgprintf(const char *fmt,...)
 
CUBOTH T pixel_oobcheck(int x, int y, int imgIndex, T border=0.0f)
 
CUBOTH T interpolate(float x, float y, int idx, bool &outside)
 
CUBOTH T & pixel(int x, int y, int imgIndex)
 
CUBOTH T * pixelAddress(int x, int y, int imgIndex)
 
CUBOTH int2 GetImagePos(int z, int l)
 
static CUBOTH T interp(T a, T b, float x)
 
void copyToDevice(T *src, bool async=false, cudaStream_t s=0)
 
void unbind(texture< T, cudaTextureType2D, cudaReadModeElementType > &texref)
 
void copyImageToHost(int z, int l, T *dst, bool async=false, cudaStream_t s=0)
 
void copyToHost(T *dst, bool async=false, cudaStream_t s=0)
 
__device__ T interpolateFromTexture(texture< T, cudaTextureType2D, cudaReadModeElementType > texref, float x, float y, int idx, bool &outside)
 
Image4DCudaArray(int sx, int sy, int numImg, int sL)
 
CUBOTH bool boundaryHit(float2 center, float radius)
 
CUBOTH int2 getImagePos(int image)
 
void copyToHost(T *dst, bool async=false, cudaStream_t s=0)
 
std::string SPrintf(const char *fmt,...)
 
Image4DMemory(int w, int h, int d, int L)
 
static cudaImageList< T > emptyList()