1 #include "cuda_runtime.h" 2 #include "device_launch_parameters.h" 21 #include "../cputrack-test/SharedTests.h" 24 #include <thrust/host_vector.h> 25 #include <thrust/device_vector.h> 26 #include <thrust/generate.h> 27 #include <thrust/reduce.h> 28 #include <thrust/functional.h> 35 #include "ExtractBeadImages.h" 42 int pos = s.length()-1;
43 while (pos>0 && s[pos]!=
'\\' && s[pos]!=
'/' )
46 return s.substr(0, pos);
52 r.x = a.x*b.x + a.y*b.y;
53 r.y = a.y*b.x - a.x*b.y;
58 cudaError_t err = cudaGetLastError();
59 dbgprintf(
"Cuda error: %s\n", cudaGetErrorString(err));
64 __device__
float compute(
int idx,
float* buf,
int s)
68 for (
int x=0;x<s;x++ ){
72 for (
int x=0;x<s/2;x++){
76 for (
int x=s-1;x>=1;x--) {
77 sum += buf[x-1]/(fabsf(buf[x])+0.1f);
83 int idx = threadIdx.x + blockIdx.x * blockDim.x;
85 result [idx] =
compute(idx, &buf [idx * s],s);
90 int idx = threadIdx.x + blockIdx.x * blockDim.x;
99 dim3 nthreads(32), nblocks( (n+nthreads.x-1)/nthreads.x);
104 testWithGlobal<<<nblocks,nthreads>>>(n,s,result_g.
data,buf.
data);
105 cudaDeviceSynchronize();
107 testWithShared <<<nblocks,nthreads,s*
sizeof(float)*nthreads.x>>>(n,s,result_s.data);
108 cudaDeviceSynchronize();
111 std::vector<float> rs = result_s, rg = result_g;
112 for (
int x=0;x<n;x++) {
113 dbgprintf(
"result_s[%d]=%f. result_g[%d]=%f\n", x,rs[x], x,rg[x]);
116 dbgprintf(
"Speed of shared comp: %f, speed of global comp: %f\n", n/(t2-t1), n/(t1-t0));
128 bool haveZLUT =
false;
152 float zmin=0.5,zmax=3;
156 for (
int x=0;x<zplanes;x++) {
158 float s = zmin + (zmax-zmin) * x/(
float)(zplanes-1);
171 if (rc == zplanes)
break;
198 int rc = 0, displayrc=0;
202 for (
int n=0;n<total;n++) {
210 while (displayrc<rc) {
211 if( displayrc%(total/10)==0)
dbgprintf(
"Done: %d / %d\n", displayrc, total);
216 if (cpucmp) qtrkcpu.
Flush();
223 dbgprintf(
"waiting for cpu results..\n");
231 const int NumResults = 20;
233 int rcount = std::min(NumResults,total);
234 for (
int i=0;i<rcount;i++) {
242 for (
int i=0;i<rcount;i++) {
252 dbgprintf(
"Localization Speed: %d (img/s)\n", (
int)( total/(tend-tstart) ));
259 cudaGetDeviceCount(&dc);
260 for (
int k=0;k<dc;k++) {
261 cudaGetDeviceProperties(&prop, k);
262 dbgprintf(
"Device[%d] = %s\n", k, prop.name);
263 dbgprintf(
"\tMax texture width: %d\n" ,prop.maxTexture2D[0]);
269 int idx = blockIdx.x * blockDim.x + threadIdx.x;
271 for (
int x=0;x<1000;x++)
272 a[idx] = asin(a[idx]+x);
289 cudaStreamCreate(&s0);
290 cudaEventCreate(&done,0);
292 for (
int x=0;x<N;x++)
295 for (
int x=0;x<1;x++) {
302 cudaEventRecord(done);
305 MeasureTime(
"sync...");
while (cudaEventQuery(done) != cudaSuccess);
308 cudaStreamDestroy(s0);
309 cudaEventDestroy(done);
322 float zmin=0.5,zmax=3;
326 for (
int x=0;x<zplanes;x++) {
328 float s = zmin + (zmax-zmin) * x/(
float)(zplanes-1);
340 int rc = 0, displayrc=0;
341 double maxScheduleTime = 0.0f;
342 double sumScheduleTime2 = 0.0f;
343 double sumScheduleTime = 0.0f;
345 for (
int n=0;n<count;n++) {
352 maxScheduleTime = std::max(maxScheduleTime, dt);
353 sumScheduleTime += dt;
354 sumScheduleTime2 += dt*dt;
358 while (displayrc<rc) {
359 if( displayrc%(count/10)==0)
dbgprintf(
"Done: %d / %d\n", displayrc, count);
370 float mean = sumScheduleTime / count;
371 float stdev =
sqrt(sumScheduleTime2 / count - mean * mean);
372 dbgprintf(
"Scheduletime: Avg=%f, Max=%f, Stdev=%f\n", mean*1000, maxScheduleTime*1000, stdev*1000);
373 *scheduleTime = mean;
375 return count/(tend-tstart);
383 if ( fabsf(r-v) < fabsf(r/2-v) )
403 int cudaBatchSize = 1024;
446 std::vector<float> values;
448 for (
int roi=20;roi<=180;roi+=10) {
450 values.push_back( roi);
455 const char *labels[] = {
"ROI",
"CPU",
"CUDA" };
466 auto cpu = RunTracker<QueuedCPUTracker> (lutfile, &cfg,
false,
"cpu",
LT_QI);
467 auto gpu = RunTracker<QueuedCUDATracker>(lutfile, &cfg,
false,
"gpu",
LT_QI);
471 for (
int i=0;i<std::min((
int)cpu.output.size(),20);i++) {
472 dbgprintf(
"CPU-GPU: %f, %f\n", cpu.output[i].x-gpu.output[i].x,cpu.output[i].y-gpu.output[i].y);
545 float pos_x = cc.
width/2 - 5;
546 float pos_y = cc.
height/2 + 3;
555 for (
int i=0;i<N;i++)
559 if(i%std::max(1,(
int)(N*0.1))==0)
dbgprintf(
"Queued: %d / %d\n", i, N);
569 if( res.
pos.
x > pos_x + 0.01f || res.
pos.
x < pos_x - 0.01f || res.
pos.
y > pos_y + 0.01f || res.
pos.
y < pos_y - 0.01f ){
576 dbgprintf(
"Errors: %d/%d (%f%%)\n", count, N, (
float)100*count/N);
592 float pos_x = cc.
width/2 - 5;
593 float pos_y = cc.
height/2 + 3;
611 std::vector<std::string> colnames;
613 colnames.push_back(
SPrintf(
"%d",ii));
620 SPrintf(
"%s\\RMFrameInfo.txt",output.
folder.c_str()).c_str(),
625 for (
int i=0;i<N;i++)
631 printf(
"\nDone queueing!\n");
636 while(RM.GetFrameCounters().localizationsDone < N);
641 while(RM.GetFrameCounters().lastSaveFrame != N);
654 std::vector<vector3f> rcpu = Gauss2DTest<QueuedCPUTracker>(N, R, calib);
655 std::vector<vector3f> rgpu = Gauss2DTest<QueuedCUDATracker>(N, R, calib);
657 for (
int i=0;i<std::min(20,N);i++) {
658 dbgprintf(
"[%d] CPU: X:%.5f, Y:%.5f\t;\tGPU: X:%.5f, Y:%.5f. \tDiff: X:%.5f, Y:%.5f\n",
659 i, rcpu[i].x, rcpu[i].y, rgpu[i].x, rgpu[i].y, rcpu[i].x-rgpu[i].x, rcpu[i].y-rgpu[i].y);
691 for (
int i=0;i<N;i++) {
694 pos.
x += rand_uniform<float>();
695 pos.
y += rand_uniform<float>();
708 for (
int i=0;i<N;i++) {
710 dbgprintf(
"[%d]: CPU: x=%f, y=%f. GPU: x=%f, y=%f.\tGPU-CPU: x:%f, y:%f\n", i, rcpu[i].x, rcpu[i].y, rgpu[i].x, rgpu[i].y, d.
x,d.
y);
714 for(
uint i=0;i<cmp_cpu_qi_prof.size();i++) {
715 dbgprintf(
"QIPROF[%d]. CPU=%f, GPU=%f, Diff: %f\n", i, cmp_cpu_qi_prof[i], cmp_gpu_qi_prof[i], cmp_gpu_qi_prof[i]-cmp_cpu_qi_prof[i]);
718 for(
uint i=0;i<cmp_cpu_qi_fft_out.size();i++) {
719 dbgprintf(
"fft-out[%d]. CPU=%f, GPU=%f, Diff: %f\n", i, cmp_cpu_qi_fft_out[i].real(), cmp_gpu_qi_fft_out[i].real(), cmp_gpu_qi_fft_out[i].real()-cmp_cpu_qi_fft_out[i].real());
743 void check_arg(
const std::vector<std::string>& args,
const char *name, T *param)
745 for (
uint i=0;i<args.size();i++) {
746 if (args[i] == name) {
747 *param = (T)atof(args[i+1].c_str());
753 void check_strarg(
const std::vector<std::string>& args,
const char *name, std::string* param)
755 for (
uint i=0;i<args.size();i++) {
756 if (args[i] == name) {
766 std::vector<std::string> args(argc-1);
767 for (
int i=0;i<argc-1;i++)
776 std::string outputfile, fixlutfile, inputposfile, bmlutfile, rescaledlutfile;
777 std::string radialWeightsFile;
785 std::string crlboutput;
788 std::vector< vector3f > inputPos;
789 if (!inputposfile.empty()) {
791 count = inputPos.size();
808 check_arg(args,
"zlutalign", &zlutAlign);
810 float pixelmax = 28 * 255;
813 std::string lutsmpfile;
826 if (!fixlutfile.empty())
830 if(!rescaledlutfile.empty()) {
833 ResampleLUT(qtrk, &lut, lut.
h, &newlut, rescaledlutfile.c_str());
848 if (bmlutfile.empty()) {
854 bmlut.
Load(bmlutfile.c_str());
858 if (!rescaledlutfile.empty())
864 if (inputPos.empty()) {
865 inputPos.resize(count);
866 for (
int i=0;i<count;i++){
871 if (!radialWeightsFile.empty())
873 auto rwd =
ReadCSV(radialWeightsFile.c_str());
874 std::vector<float> rw(rwd.size());
883 std::vector<ImageData> imgs (inputPos.size());
885 std::vector<vector3f> crlb(inputPos.size());
887 for (
uint i=0;i<inputPos.size();i++) {
891 auto p = inputPos[i];
894 if (!crlboutput.empty()) {
902 if(i==0 && !lutsmpfile.empty())
WriteJPEGFile(lutsmpfile.c_str(), imgs[i]);
914 for (
uint i=0;i<inputPos.size();i++)
923 std::vector<vector3f> results(inputPos.size());
924 for (
uint i=0;i<inputPos.size();i++) {
931 dbgprintf(
"Mean err X=%f,Z=%f. St deviation: X=%f,Z=%f\n", meanErr.
x,meanErr.
y,stdevErr.
x,stdevErr.
z);
933 if (!crlboutput.empty())
934 WriteTrace(crlboutput, &crlb[0], crlb.size());
936 WriteTrace(outputfile, &results[0], inputPos.size());
947 std::vector<BeadPos> beads = read_beadlist(
SPrintf(
"%sbeadlist.txt",folder.c_str()));
950 int numImgInStack = 1218;
951 int numPositions = 1001;
953 float umPerImg = range/numImgInStack;
983 int pxPerBead = ROISize*ROISize;
984 int memSizePerBead = pxPerBead*
sizeof(float);
985 int startFrame = 400;
986 for(
int plane = 0; plane < zplanes; plane++){
988 int frameNum = startFrame+(int)(numImgInStack-startFrame)*((float)plane/zplanes);
989 std::string file =
SPrintf(
"%s\img%05d.jpg",folder.c_str(),frameNum);
993 float* data =
new float[beads.size()*pxPerBead];
995 for(
uint ii = 0; ii < beads.size(); ii++){
997 pos.
x = beads.at(ii).x - ROISize/2;
998 pos.
y = beads.at(ii).y - ROISize/2;
1001 memcpy(data+ii*pxPerBead,crop.
data,memSizePerBead);
1023 for(
int ii = 0; ii < beads.size(); ii++){
1040 printf(
"%d, %d\n",
sizeof(
long),
sizeof(
int));
1086 }
catch (
const std::exception& e) {
int GetResultCount() override
Get the number of finished localization jobs (=results) available in memory.
std::vector< std::complex< float > > cmp_gpu_qi_fft_out
void GenerateLUT(ImageData *lut)
__global__ void SimpleKernel(int N, float *a)
int CmdLineRun(int argc, char *argv[])
virtual int GetResultCount()=0
Get the number of finished localization jobs (=results) available in memory.
float zlut_minradius
Distance in pixels from the bead center from which to start sampling profiles. Default 1...
virtual int ScheduleFrame(void *imgptr, int pitch, int width, int height, ROIPosition *positions, int numROI, QTRK_PixelDataType pdt, const LocalizationJob *jobInfo)
Schedule an entire frame at once, allowing for further optimizations.
virtual void BuildLUT(void *data, int pitch, QTRK_PixelDataType pdt, int plane, vector2f *known_pos=0)=0
Add a new lookup table plane.
void GenerateTestImage(ImageData &img, float xp, float yp, float size, float SNratio)
void ScheduleLocalization(void *data, int pitch, QTRK_PixelDataType pdt, const LocalizationJob *jobInfo) override
Add a job to the queue to be processed. A job entails running the required algorithms on a single reg...
void GenerateSample(ImageData *image, vector3f pos, float minRadius, float maxRadius)
__global__ void emptyKernel()
float SpeedTest(const QTrkSettings &cfg, QueuedTracker *qtrk, int count, bool haveZLUT, LocMode_t locType, float *scheduleTime, bool gaincorrection=false)
int numBeads
Number of beads for which to grab results. Should always equal the amount of beads in a single frame...
static double WaitForFinish(QueuedTracker *qtrk, int N)
void Flush() override
Stop waiting for more jobs to do, and just process the current batch.
void copyToHost(T *dst, bool async, cudaStream_t s=0)
void FinalizeLUT() override
Finalize the lookup tables in memory.
SpeedInfo SpeedCompareTest(int w, LocalizeModeEnum locMode, bool haveZLUT, int qi_iterations=5)
void SetLocalizationMode(LocMode_t locType) override
Select which algorithm is to be used.
void outputImage(ImageData img, std::string filename="UsedImage")
float qi_minradius
Distance in pixels from the bead center from which to start sampling profiles. Default 1...
int width
Width of regions of interest to be handled. Typically equals height (square ROI). ...
vector3f scaling
Scaling factor for each of the three dimensions.
std::vector< float > cmp_cpu_qi_prof
uint maxFramesInMemory
Number of frames for which to keep the data in memory. 0 for infinite.
__device__ float2 mul_conjugate(float2 a, float2 b)
ImageData DebugImage(int ID)
Get the debug image as an ImageData object.
static void EnableGainCorrection(QueuedTracker *qtrk)
void QICompare(const char *lutfile)
void BuildLUT(void *data, int pitch, QTRK_PixelDataType pdt, int plane, vector2f *known_pos=0) override
Add a new lookup table plane.
void outputString(std::string out, bool ConsoleOnly=false)
void ResampleLUT(T *qtrk, ImageData *lut, int zplanes, ImageData *newlut, const char *jpgfile=0, uint buildLUTFlags=0)
int downsample
Image downsampling factor. Applied before anything else. 0 = original, 1 = 1x (W=W/2,H=H/2).
Struct used to define the top-left corner position of an ROI within a frame. ROI is [ x ...
int zlutIndex
Bead number of this ROI. Used to get the right ZLUT from memory.
std::vector< std::vector< float > > ReadCSV(const char *filename, char sep)
void Flush() override
Stop waiting for more jobs to do, and just process the current batch.
vector3f offset
Offset value for each of the three dimensions.
void ClearResults() override
Clear results.
void GenerateImageFromLUT(ImageData *image, ImageData *zlut, float minradius, float maxradius, vector3f pos, bool splineInterp, int oversampleSubdiv)
std::vector< std::complex< float > > cmp_cpu_qi_fft_out
void WriteJPEGFile(uchar *data, int w, int h, const char *filename, int quality)
int FetchResults(LocalizationResult *dstResult, int maxResults) override
Fetch available results.
Structure for job results.
__device__ float compute(int idx, float *buf, int s)
int height
Height of regions of interest to be handled. Typically equals width (square ROI). ...
void GetRadialZLUT(float *data) override
Get the radial lookup tables used for z tracking.
void WriteTrace(std::string filename, vector3f *results, int nResults)
int GetResultCount() override
Get the number of finished localization jobs (=results) available in memory.
int NearestPowerOfTwo(int v)
void ClearResults() override
Clear results.
Structure for the settings used by the algorithms implemented in QueuedTracker.
int zlut_radialsteps
Number of radial steps to sample on.
vector3< T > sqrt(const vector3< T > &a)
__global__ void testWithGlobal(int n, int s, float *result, float *buf)
std::vector< vector3f > ReadVector3CSV(const char *file, char sep)
virtual void FinalizeLUT()=0
Finalize the lookup tables in memory.
Class that handles data gathering and saving from QueuedTracker instances.
std::string getPath(const char *file)
virtual void SetLocalizationMode(LocMode_t locType)=0
Select which algorithm is to be used.
uint frame
Frame number this ROI belongs to.
int numFrameInfoColumns
Number of columns in the frame info metadata file. Additional columns can be added to save more data ...
static TImageData alloc(int w, int h)
LocalizeModeEnum
Flags for selecting localization type.
virtual void SetRadialZLUT(float *data, int count, int planes)=0
Set the radial lookup tables to be used for z tracking.
__global__ void testWithShared(int n, int s, float *result)
Matrix3X3 Inverse() const
int SmallestPowerOfTwo(int minval)
void Update()
Compute the derived settings.
void FinalizeLUT() override
Finalize the lookup tables in memory.
void EnableTextureCache(bool useTextureCache)
Enable or disable the use of textures.
vector3f pos
Final 3D position found. If no z localization was performed, the value of z will be 0...
int FetchResults(LocalizationResult *results, int maxResults) override
Fetch available results.
float com_bgcorrection
Background correction factor for COM. Defines the number of standard deviations data needs to be away...
void TestGauss2D(bool calib)
void copyToDevice(const std::vector< T > &src, bool async=false, cudaStream_t s=0)
std::string GetProfileReport() override
Get the output of performance profiling.
ImageData CropImage(ImageData img, int x, int y, int w, int h)
__shared__ float cudaSharedMem[]
float qi_maxradius
Max radius in pixels of the sampling circle.
Normalize found radial profiles.
void SetRadialZLUT(float *data, int num_zluts, int planes) override
Set the radial lookup tables to be used for z tracking.
void BuildZLUT(std::string folder, outputter *output)
LocalizationJob job
Job metadata. See LocalizationJob.
CPU implementation of the QueuedTracker interface.
Structure for settings used by ResultManager.
QTrkComputedConfig cfg
The settings used by this instance of QueuedTracker.
void SetTracker(QueuedTracker *qtrk)
Set the tracker from which to fetch results.
void BeginLUT(uint flags) override
Setup to begin building a lookup table.
void SetLocalizationMode(LocMode_t lt) override
Select which algorithm is to be used.
int ReadJPEGFile(uchar *srcbuf, int srclen, uchar **data, int *width, int *height)
float zlut_roi_coverage
Factor of the ROI to include in sampling. Between 0 and 1, default 1. Maxradius = ROI/2*roi_coverage...
void TestRadialLUTGradientMethod()
static void MeanStDevError(const std::vector< vector3f > &truepos, const std::vector< vector3f > &v, vector3f &meanErr, vector3f &stdev)
int cuda_device
CUDA only. Flag for device selection.
void ProfileSpeedVsROI(LocalizeModeEnum locMode, const char *outputcsv, bool haveZLUT, int qi_iterations)
float qi_radial_coverage
Sampling points per radial pixel. Default 3.0.
Structure for derived settings computed from base settings in QTrkSettings.
float zlut_angular_coverage
Factor of the sampling perimeter to cover with angular sampling steps. Between 0 and 1...
int qi_radialsteps
Number of radial steps to sample on.
static std::vector< vector3f > FetchResults(QueuedTracker *trk)
void BuildLUT(void *data, int pitch, QTRK_PixelDataType pdt, int plane, vector2f *known_pos=0) override
Add a new lookup table plane.
vector2f firstGuess
(x,y) position found by the COM localization. Used as initial position for the subsequent algorithms...
void check_arg(const std::vector< std::string > &args, const char *name, T *param)
void SetRadialZLUT(float *data, int numLUTs, int planes) override
Set the radial lookup tables to be used for z tracking.
float qi_roi_coverage
Factor of the ROI to include in sampling. Between 0 and 1, default 1. Maxradius = ROI/2*roi_coverage...
vector3< float > vector3f
void dbgprintf(const char *fmt,...)
void check_strarg(const std::vector< std::string > &args, const char *name, std::string *param)
int qi_iterations
Number of times to run the QI algorithm, sampling around the last found position. ...
CUDA implementation of the QueuedTracker interface.
std::vector< float > cmp_gpu_qi_prof
int xc1_iterations
Number of times to run the cross correlation algorithm.
int writeInterval
Interval of number of gathered frames at which to write the data.
float qi_angstep_factor
Factor to reduce angular steps on lower iterations. Default 1.0 (no effect).
void FloatToJPEGFile(const char *name, const float *d, int w, int h)
void ScheduleLocalization(void *data, int pitch, QTRK_PixelDataType pdt, const LocalizationJob *jobInfo) override
Add a job to the queue to be processed. A job entails running the required algorithms on a single reg...
uint8_t binaryOutput
Flag (boolean) to output a binary file instead of a text file.
int xc1_profileWidth
Profile width for the cross correlation.
void Load(ImageData *lut)
int main(int argc, char *argv[])
virtual int FetchResults(LocalizationResult *results, int maxResults)=0
Fetch available results.
void WriteToFile()
Write all settings to specified output file (Jordi, to combine with QTrkSettings.testRun) ...
virtual void SetRadialWeights(float *zcmp)=0
Set radial weights used for comparing LUT profiles.
virtual void ClearResults()=0
Clear results.
void WriteImageAsCSV(const char *file, float *d, int w, int h, const char *labels[])
Abstract tracker interface, implemented by QueuedCUDATracker and QueuedCPUTracker.
void ScheduleImageData(ImageData *data, const LocalizationJob *jobInfo)
Quick function to schedule a single ROI from an ImageData object.
Matrix3X3 Compute(vector3f pos, vector3f delta, ImageData &lut, int w, int h, float zlutMinRadius, float zlutMaxRadius)
void GetRadialZLUT(float *zlut) override
Get the radial lookup tables used for z tracking.
int numThreads
Number of threads/streams to use. Defaults differ between CPU and GPU implementations.
float zlut_radial_coverage
Sampling points per radial pixel. Default 3.0.
float qi_angular_coverage
Factor of the sampling perimeter to cover with angular sampling steps. Between 0 and 1...
Structure for region of interest metadata.
int xc1_profileLength
Profile length for the cross correlation.
float zlut_maxradius
Max radius in pixels of the sampling circle.
void CompareAccuracy(const char *lutfile)
std::string SPrintf(const char *fmt,...)
Do a ZLUT lookup with adjusted weights, for testing purposes.
void ApplyPoissonNoise(ImageData &img, float poissonMax, float maxval)