Class to specifically perform quadrant interpolation calculations on the GPU. More...

#include <QI.h>

Classes
struct	DeviceInstance
	Contains all QI memory space that is allocated per device (shared between streams). More...

struct	StreamInstance
	Contains all QI memory space that is allocated per stream. More...

Public Member Functions
template<typename TImageSampler >
void	Execute (BaseKernelParams &p, const QTrkComputedConfig &cfg, StreamInstance *s, DeviceInstance *d, device_vec< float3 > *initial, device_vec< float3 > *output)
	Execute a batch of QI calculations. Runs Iterate the correct number of times and recalculates the number of radial spokes for every iteration, based on QTrkComputedConfig::qi_angstep_factor. More...

void	InitDevice (DeviceInstance *d, QTrkComputedConfig &cc)
	Ready a device for QI calculations. More...

void	InitStream (StreamInstance *s, QTrkComputedConfig &cc, cudaStream_t stream, int batchSize)
	Ready a stream for QI calculations. More...

void	Init (QTrkComputedConfig &cfg, int batchSize)
	Initialize this QI instance. More...

Private Member Functions
template<typename TImageSampler >
void	Iterate (BaseKernelParams &p, device_vec< float3 > *initial, device_vec< float3 > *output, StreamInstance *s, DeviceInstance *d, int angularSteps)
	Perform one QI iteration. More...

dim3	blocks (int njobs)
	Same function as QueuedCUDATracker::blocks. More...

dim3	threads ()
	Same function as QueuedCUDATracker::threads. More...

Private Attributes
QIParams	params
	Structure with settings relevant to quadrant interpolation. More...

int	qi_FFT_length
	Parameter for length required for arrays going into FFT. 2 * radial steps. More...

int	batchSize
	See QueuedCUDATracker::batchSize. Local copy. More...

int	numThreads
	See QueuedCUDATracker::numThreads. More...

Detailed Description

Class to specifically perform quadrant interpolation calculations on the GPU.

Maintains all memory and functions related to QI, to keep QueuedCUDATracker cleaner.

Definition at line 20 of file QI.h.

Member Function Documentation

§ blocks()

dim3 QI::blocks ( int njobs )

inline private

Same function as QueuedCUDATracker::blocks.

Definition at line 153 of file QI.h.

153 { return dim3((njobs+numThreads-1)/numThreads); }

QI::numThreads

int numThreads

See QueuedCUDATracker::numThreads.

Definition: QI.h:150

§ Execute()

template<typename TImageSampler >

void QI::Execute	(	BaseKernelParams &	p,
		const QTrkComputedConfig &	cfg,
		QI::StreamInstance *	s,
		QI::DeviceInstance *	d,
		device_vec< float3 > *	initial,
		device_vec< float3 > *	output
	)

Execute a batch of QI calculations. Runs Iterate the correct number of times and recalculates the number of radial spokes for every iteration, based on QTrkComputedConfig::qi_angstep_factor.

Parameters

[in]	p	Reference to BaseKernelParams with parameters for this call.
[in]	cfg	Reference to the settings to use.
[in]	s	The QI::StreamInstance to use.
[in]	d	The QI::DeviceInstance to use.
[in]	initial	Vector with the initial positions from which to start the radial samplings on the first iteration.
[out]	output	Vector that will be filled with the final results.

Definition at line 268 of file QI_impl.h.

 {
     float angsteps = cfg.qi_angstepspq / powf(cfg.qi_angstep_factor, cfg.qi_iterations);
 
     for (int a=0;a<cfg.qi_iterations;a++) {
         device_vec<float3>* dst = a==0 ? initial : output;
         Iterate< TImageSampler > (p, dst, output, s, d, std::max(MIN_RADPROFILE_SMP_COUNT, (int)angsteps) );
         angsteps *= cfg.qi_angstep_factor;
     }
 }

§ Init()

void QI::Init	(	QTrkComputedConfig &	cfg,
		int	batchSize
	)

Initialize this QI instance.

Copy relevant settings to local datastructures.

Parameters

[in]	cfg	The configuration used for the algorithm.
[in]	batchSize	The calculation batch size. See batchSize.

Definition at line 387 of file QI_impl.h.

 {
     QIParams& qi = params;
     qi.maxAngularSteps = cfg.qi_angstepspq;
     qi.iterations = cfg.qi_iterations;
     qi.maxRadius = cfg.qi_maxradius;
     qi.minRadius = cfg.qi_minradius;
     qi.radialSteps = cfg.qi_radialsteps;
 
     cudaDeviceProp prop;
     cudaGetDeviceProperties(&prop, 0);
     numThreads = prop.warpSize;
 
     this->batchSize = batchSize;
 }

§ InitDevice()

void QI::InitDevice	(	DeviceInstance *	d,
		QTrkComputedConfig &	cc
	)

Ready a device for QI calculations.

Calculate required parameters and fill a DeviceInstance.

Parameters

[in,out]	d	The instance to be initialized.
[in]	cc	The configuration used for the algorithm.

Definition at line 344 of file QI_impl.h.

 {
     std::vector<float2> qi_radialgrid(cc.qi_angstepspq);
     for (int i=0;i<cc.qi_angstepspq;i++)  {
         float ang = 0.5f*3.141593f*(i+0.5f)/(float)cc.qi_angstepspq;
         qi_radialgrid[i]=make_float2(cos(ang), sin(ang));
     }
     d->qi_trigtable = qi_radialgrid;
 
     std::vector<float> rweights = ComputeRadialBinWindow(cc.qi_radialsteps);
     d->d_radialweights = rweights;
 
     QIParams dp = params;
     dp.cos_sin_table = d->qi_trigtable.data;
 
     cudaMalloc(&d->d_qiparams, sizeof(QIParams));
     cudaMemcpy(d->d_qiparams, &dp, sizeof(QIParams), cudaMemcpyHostToDevice);
 }

§ InitStream()

void QI::InitStream	(	StreamInstance *	s,
		QTrkComputedConfig &	cc,
		cudaStream_t	stream,
		int	batchSize
	)

Ready a stream for QI calculations.

Calculate required parameters and fill a StreamInstance.

Parameters

[in,out]	s	The instance to be initialized.
[in]	cc	The configuration used for the algorithm.
[in]	stream	The normal CUDA stream this QI Stream will relate to.
[in]	batchSize	The calculation batch size. See batchSize.

Definition at line 363 of file QI_impl.h.

 {
     int fftlen = cc.qi_radialsteps*2;
     s->stream = stream;
     s->d_quadrants.init(fftlen*batchSize*2); // 4 quadrants * radialSteps * batchSize
     s->d_QIprofiles.init(batchSize*2*fftlen); // (2 axis) * (2 radialsteps) = 4 * nr = 2 * fftlen
     s->d_QIprofiles_reverse.init(batchSize*2*fftlen);
     s->d_shiftbuffer.init(fftlen * batchSize);
 
     // 2* batchSize, since X & Y both need an FFT transform
     // cufftPlanMany and 1d with batch argument are equivalent in memory usage and speed
     // Using Many because the batch argument for 1d is strictly speaking deprecated (see cufft.h)
     cufftResult_t r = cufftPlanMany(&s->fftPlan, 1, &fftlen, 0, 1, fftlen, 0, 1, fftlen, CUFFT_C2C, batchSize*2);
     //cufftResult_t r = cufftPlan1d(&s->fftPlan, fftlen, CUFFT_C2C, batchSize*2);
 
     if(r != CUFFT_SUCCESS) {
         throw std::runtime_error( SPrintf("CUFFT plan creation failed. FFT len: %d. Batchsize: %d\n", fftlen, batchSize*2));
     }
     CheckCUDAError(cufftSetCompatibilityMode(s->fftPlan, CUFFT_COMPATIBILITY_NATIVE));
     CheckCUDAError(cufftSetStream(s->fftPlan, stream));
 
     this->qi_FFT_length = fftlen;
 }

§ Iterate()

template<typename TImageSampler >

void QI::Iterate	(	BaseKernelParams &	p,
		device_vec< float3 > *	initial,
		device_vec< float3 > *	output,
		StreamInstance *	s,
		DeviceInstance *	d,
		int	angularSteps
	)

private

Perform one QI iteration.

This is where the actual algorithm is executed. See [3] for details.

The kernels are called in the following order:

QI_ComputeQuadrants - Calculate the 4 quadrant profiles
QI_QuadrantsToProfiles - Convert the quadrant profiles into the concatenated profiles and their respective reversed profiles
Forward FFT (CUFFT) - Calculate the fourier transforms of the profiles and reverse profiles
QI_MultiplyWithConjugate - Multiply the transforms to calculate their autocorrelation
Reverse FFT (CUFFT) - Transform the multiplied transforms back to the time-domain autocorrelation
QI_OffsetPositions - Calculate and apply the X and Y shifts from the autocorrelations

Parameters

[in]	p	Kernel parameters to use.
[in]	initial	The initial positions to use as sampling centers.
[out]	output	Vector with the resulting positions.
[in]	s	The stream to execute.
[in]	d	The device on which to execute.
[in]	angularSteps	The number of angular steps to use.

Huge speedup achieved by calculating each quadrant's radial bin in a separate thread. This leads to more, smaller kernels running concurrently so more memory latency hiding can be achieved through higher occupancy. For a visual display of this, try running the NVIDIA Visual Profiler with both versions of quadrant profile kernels.

Definition at line 280 of file QI_impl.h.

 {
     // One QI iteration. Every kernel below is launched on s->stream, in the
     // order: quadrants -> profiles -> forward FFTs -> multiply -> inverse FFT
     // -> position offsets.
     // NOTE(review): none of the kernel launches in this body is followed by a
     // cudaGetLastError() check, so a bad launch configuration would go unnoticed
     // here -- confirm whether the caller checks.

     /* Old method of calculating quadrant profiles.
     QIParams dp = params;
     dp.cos_sin_table = d->qi_trigtable.data;
 
     QI_ComputeProfile <TImageSampler> <<< blocks(p.njobs), threads(), 0, s->stream >>> (p, initial->data, 
         s->d_quadrants.data, s->d_QIprofiles.data, s->d_QIprofiles_reverse.data, dp, d->d_radialweights.data, angularSteps);
     // DbgOutputVectorToFile("D:\\TestImages\\gpu_qi_prof_old.csv", s->d_quadrants, true);
     */
 
     // Launch geometry for the quadrant kernel: blocks of 16 x 8 x 4 threads.
     // The grid covers p.njobs along x and params.radialSteps along y (both by
     // ceil-division); the grid z dimension is left at its default.
     // Presumably the 4 threads along z correspond to the 4 quadrants -- confirm
     // in QI_ComputeQuadrants.
     dim3 qdrThreads(16, 8, 4);
     dim3 qdrDim( (p.njobs + qdrThreads.x - 1) / qdrThreads.x, (params.radialSteps + qdrThreads.y - 1) / qdrThreads.y);
 
     // Step 1: compute the quadrant profiles around the `initial` positions.
     QI_ComputeQuadrants<TImageSampler> <<< qdrDim , qdrThreads, 0, s->stream >>> 
         (p, initial->data, s->d_quadrants.data, d->d_qiparams, angularSteps);
         
     // Step 2: turn the quadrant profiles into the profiles and reverse profiles,
     // using the radial weights; launched with the per-job blocks()/threads() geometry.
     QI_QuadrantsToProfiles <<< blocks(p.njobs), threads(), 0, s->stream >>> 
         (p, s->d_quadrants.data, s->d_QIprofiles.data, s->d_QIprofiles_reverse.data, d->d_radialweights.data, d->d_qiparams);
     // DbgOutputVectorToFile("D:\\TestImages\\gpu_qi_prof_new.csv", s->d_quadrants, true);
 
 #ifdef QI_DEBUG
     DbgCopyResult(s->d_QIprofiles, cmp_gpu_qi_prof);
 #endif
 
     // Step 3: forward FFT of both profile buffers. Input and output pointers are
     // the same, so each transform is done in place.
     cufftComplex* prof = (cufftComplex*)s->d_QIprofiles.data;
     cufftComplex* revprof = (cufftComplex*)s->d_QIprofiles_reverse.data;
     CheckCUDAError(cufftExecC2C(s->fftPlan, prof, prof, CUFFT_FORWARD));
     CheckCUDAError(cufftExecC2C(s->fftPlan, revprof, revprof, CUFFT_FORWARD));
 
     // Step 4: element-wise multiply over the whole profile buffer. nval is based
     // on batchSize rather than p.njobs, so it equals the element count with which
     // d_QIprofiles is initialised in InitStream (batchSize * 2 * FFT length).
     // Presumably the product is written into `prof`, since only `prof` is
     // inverse-transformed below -- confirm in QI_MultiplyWithConjugate.
     int nval = qi_FFT_length * 2 * batchSize;
     int nthread = 256;
     QI_MultiplyWithConjugate<<< dim3( (nval + nthread - 1)/nthread ), dim3(nthread), 0, s->stream >>>(nval, prof, revprof);
     /* 
     Experiment to see the effect of different block sizes.
     Seems that if the whole parameter space is covered, block sizes don't matter for execution speed.
 
     dim3 threadsDim = dim3(16, 16);
     int numBlocks = nval / (threadsDim.x * threadsDim.y);
     dim3 blocksDim = dim3( sqrt((double)numBlocks)+1, sqrt((double)numBlocks)+1 );
         
     if(nval > threadsDim.x * threadsDim.y * blocksDim.x * blocksDim.y){
         printf("\nNot whole space covered: nval > total threads\n");
         printf("nval: %d, nthread: %d, block (%d,%d), threads (%d,%d)\n", nval, nthread, blocksDim.x, blocksDim.y, threadsDim.x, threadsDim.y);
     }
     QI_MultiplyWithConjugate<<< blocksDim, threadsDim, 0, s->stream >>> (nval, prof, revprof);
     */
     
     // Step 5: inverse FFT, again in place on `prof`.
     CheckCUDAError(cufftExecC2C(s->fftPlan, prof, prof, CUFFT_INVERSE));
 
 #ifdef QI_DEBUG
     DbgCopyResult(s->d_QIprofiles, cmp_gpu_qi_fft_out);
 #endif
 
     // Step 6: compute and apply the position offsets.
     // d_offsets is always passed as a null pointer here; presumably it is an
     // optional output of QI_OffsetPositions -- confirm in the kernel.
     float2* d_offsets=0;
     // Image pixels spanned by one radial step.
     // NOTE(review): if maxRadius/minRadius/radialSteps are all integral in
     // QIParams, this is an integer division -- confirm the field types.
     float pixelsPerProfLen = (params.maxRadius-params.minRadius)/params.radialSteps;
     dim3 nBlocks=blocks(p.njobs), nThreads=threads();
     // NOTE(review): `newpos` is not one of the parameter names shown in the
     // documented signature (which lists `output`); presumably the definition in
     // QI_impl.h names its third parameter `newpos` -- confirm.
     QI_OffsetPositions<<<nBlocks, nThreads, 0, s->stream>>>
         (p.njobs, initial->data, newpos->data, prof, qi_FFT_length, d_offsets, pixelsPerProfLen, s->d_shiftbuffer.data); 
 }

§ threads()

dim3 QI::threads ( )

inline private

Same function as QueuedCUDATracker::threads.

Definition at line 155 of file QI.h.

155 { return dim3(numThreads); }

QI::numThreads

int numThreads

See QueuedCUDATracker::numThreads.

Definition: QI.h:150

Member Data Documentation

§ batchSize

int QI::batchSize

private

See QueuedCUDATracker::batchSize. Local copy.

Definition at line 149 of file QI.h.

§ numThreads

int QI::numThreads

private

See QueuedCUDATracker::numThreads.

Definition at line 150 of file QI.h.

§ params

QIParams QI::params

private

Structure with settings relevant to quadrant interpolation.

Definition at line 142 of file QI.h.

§ qi_FFT_length

int QI::qi_FFT_length

private

Parameter for length required for arrays going into FFT. 2 * radial steps.

Used to be nearest power of two, but since switching to cuFFT, this is not needed for speed optimization anymore.

Definition at line 148 of file QI.h.

The documentation for this class was generated from the following files:

cudatrack/QI.h
cudatrack/QI_impl.h

Classes

Public Member Functions

Private Member Functions

Private Attributes

Detailed Description

Member Function Documentation

§ blocks()

§ Execute()

§ Init()

§ InitDevice()

§ InitStream()

§ Iterate()

§ threads()

Member Data Documentation

§ batchSize

§ numThreads

§ params

§ qi_FFT_length