Perform one QI iteration.
A large speedup is achieved by calculating each quadrant's radial bin in a separate thread. This leads to more, smaller kernels running concurrently, so more memory latency can be hidden through higher occupancy. To see this visually, run the NVIDIA Visual Profiler with both versions of the quadrant profile kernels.
294 dim3 qdrThreads(16, 8, 4);
295 dim3 qdrDim( (p.
njobs + qdrThreads.x - 1) / qdrThreads.x, (
params.
radialSteps + qdrThreads.y - 1) / qdrThreads.y);
297 QI_ComputeQuadrants<TImageSampler> <<< qdrDim , qdrThreads, 0, s->stream >>>
298 (p, initial->
data, s->d_quadrants.data, d->d_qiparams, angularSteps);
301 (p, s->d_quadrants.data, s->d_QIprofiles.data, s->d_QIprofiles_reverse.data, d->d_radialweights.data, d->d_qiparams);
308 cufftComplex* prof = (cufftComplex*)s->d_QIprofiles.data;
309 cufftComplex* revprof = (cufftComplex*)s->d_QIprofiles_reverse.data;
310 CheckCUDAError(cufftExecC2C(s->fftPlan, prof, prof, CUFFT_FORWARD));
311 CheckCUDAError(cufftExecC2C(s->fftPlan, revprof, revprof, CUFFT_FORWARD));
315 QI_MultiplyWithConjugate<<< dim3( (nval + nthread - 1)/nthread ), dim3(nthread), 0, s->stream >>>(nval, prof, revprof);
331 CheckCUDAError(cufftExecC2C(s->fftPlan, prof, prof, CUFFT_INVERSE));
341 (p.
njobs, initial->
data, newpos->data, prof,
qi_FFT_length, d_offsets, pixelsPerProfLen, s->d_shiftbuffer.data);
std::vector< std::complex< float > > cmp_gpu_qi_fft_out
int qi_FFT_length
Length required for the arrays going into the FFT; equal to 2 * radial steps.
__global__ void QI_QuadrantsToProfiles(BaseKernelParams kp, float *quadrants, float2 *profiles, float2 *reverseProfiles, float *d_radialweights, const QIParams *params)
QIParams params
Structure with settings relevant to quadrant interpolation.
void CheckCUDAError(cufftResult_t err)
__global__ void QI_MultiplyWithConjugate(int n, cufftComplex *a, cufftComplex *b)
dim3 threads()
Same function as QueuedCUDATracker::threads.
__global__ void QI_OffsetPositions(int njobs, float3 *current, float3 *dst, cufftComplex *autoconv, int fftLength, float2 *offsets, float pixelsPerProfLen, float *shiftbuf)
int batchSize
See QueuedCUDATracker::batchSize. Local copy.
void DbgCopyResult(device_vec< float2 > src, std::vector< std::complex< float > > &dst)
std::vector< float > cmp_gpu_qi_prof
int njobs
Number of jobs in the batch.
dim3 blocks(int njobs)
Same function as QueuedCUDATracker::blocks.