Non HD5xxx kernels: __kernel void FindAutoCorrelation_reduce0_kernel1_cl(__global float2* AutoCorrelation, __global float* PartialSumMaxIdx){ const int tid=get_global_id(0); const y=get_global_id(1)*(4*128*1024); float partial_sum=0.0f; float partial_max=0.0; float idx=0.0; __global float2* auto_corr=AutoCorrelation+y; for(int i=tid*256;i<(tid+1)*256;i++){ float tmp=auto_corr.x; tmp*=tmp; partial_sum+=tmp; if(i!=0 && tmp>partial_max){ partial_max=tmp; idx=(float)i; } } int did=3*(256*get_global_id(1)+tid); PartialSumMaxIdx[did]=partial_sum; PartialSumMaxIdx[did+1]=partial_max; PartialSumMaxIdx[did+2]=idx; } __kernel void FindAutoCorrelation_reduce1_kernel_cl(__global float* PartialSumMaxIdx, __global float2* result){ const sid=256*get_global_id(0);//only 8 workitems here float full_max=0.0f; float idx=0.0f; float Sum=0.0f; for(int i=0;i<256;i++){ Sum+=PartialSumMaxIdx[3*(sid+i)]; float tmp=PartialSumMaxIdx[(sid+i)*3+1]; if(tmp>full_max){ full_max=tmp; idx=PartialSumMaxIdx[(sid+i)*3+2]; } } result[get_global_id(0)].x=full_max*(128.0f*1024.0f)/Sum;//peak value result[get_global_id(0)].y=idx;//bin } host code: {size_t localThreads[2]={256,1}; size_t globalThreads[2]={256,8};//R: hardwired number of FFTs for now int i=0; err = clSetKernelArg(FindAutoCorrelation_reduce0_kernel_cl,i++,sizeof(cl_mem),(void *)&gpu_WorkData); err |= clSetKernelArg(FindAutoCorrelation_reduce0_kernel_cl,i++,sizeof(cl_mem),(void *)&FFTbuf); if(err) fprintf(stderr,"ERROR: Setting kernel argument:FindAutoCorrelation_reduce0_kernel_cl: %d\n",err); err = clEnqueueNDRangeKernel(cq,FindAutoCorrelation_reduce0_kernel_cl, 2, NULL,globalThreads, localThreads, 0,NULL,NULL); if(err) fprintf(stderr,"ERROR: FindAutoCorrelation_reduce0_kernel_cl: %d\n",err); } { int i=0; size_t localThreads[1]={1}; size_t globalThreads[1]={8};//R: hardwired number of FFTs for now err = clSetKernelArg(FindAutoCorrelation_reduce1_kernel_cl,i++,sizeof(cl_mem),(void *)&FFTbuf); err |= clSetKernelArg(FindAutoCorrelation_reduce1_kernel_cl,i++,sizeof(cl_mem),(void *)&gpu_AutoCorrelationResults); if(err) fprintf(stderr,"ERROR: Setting kernel argument:FindAutoCorrelation_reduce1_kernel_cl: %d\n",err); err = clEnqueueNDRangeKernel(cq,FindAutoCorrelation_reduce1_kernel_cl, 1, NULL,globalThreads, localThreads, 0,NULL,NULL); if(err) fprintf(stderr,"ERROR: FindAutoCorrelation_reduce1_kernel_cl: %d\n",err); } HD5xxx kernel: __kernel void FindAutoCorrelation_kernel1_cl(__global float2* AutoCorrelation, __global float2* result){ __local float* PartialSumMaxIdx[256*3]; const int tid=get_local_id(0); const y=get_global_id(1)*(4*128*1024); float partial_sum=0.0f; float partial_max=0.0; float idx=0.0; __global float2* auto_corr=AutoCorrelation+y; for(int i=tid*256;i<(tid+1)*256;i++){ float tmp=auto_corr.x; tmp*=tmp; partial_sum+=tmp; if(i!=0 && tmp>partial_max){ partial_max=tmp; idx=(float)i; } } int did=3*tid; PartialSumMaxIdx[did]=partial_sum; PartialSumMaxIdx[did+1]=partial_max; PartialSumMaxIdx[did+2]=idx; if(tid==0){ float full_max=0.0f; float idx=0.0f; float Sum=0.0f; for(int i=0;i<256;i++){ Sum+=PartialSumMaxIdx[3*i]; if(PartialSumMaxIdx[i*3+1]>full_max){ full_max=PartialSumMaxIdx[i*3+1]; idx=PartialSumMaxIdx[i*3+2]; } } result[get_global_id(1)].x=full_max*(128.0f*1024.0f)/Sum;//peak value result[get_global_id(1)].y=idx;//bin } } host code: {size_t localThreads[2]={256,1}; size_t globalThreads[2]={256,8};//R: hardwired number of FFTs for now int i=0; err = clSetKernelArg(FindAutoCorrelation_kernel1_cl,i++,sizeof(cl_mem),(void *)&gpu_WorkData); err |= clSetKernelArg(FindAutoCorrelation_kernel1_cl,i++,sizeof(cl_mem),(void *)&gpu_AutoCorrelationResults); if(err) fprintf(stderr,"ERROR: Setting kernel argument:FindAutoCorrelation_kernel1_cl: %d\n",err); err = clEnqueueNDRangeKernel(cq,FindAutoCorrelation_kernel1_cl, 2, NULL,globalThreads, localThreads, 0,NULL,NULL); if(err) fprintf(stderr,"ERROR: FindAutoCorrelation_reduce0_kernel_cl: %d\n",err); } profiler data: Method ExecutionOrder GlobalWorkSize GroupWorkSize Time LocalMemSize DataTransferSize GPRs ScratchRegs FCStacks Wavefronts ALUInsts FetchInsts WriteInsts LDSFetchInsts LDSWriteInsts ALUBusy ALUFetchRatio ALUPacking FetchSize CacheHit FetchUnitBusy FetchUnitStalled WriteUnitStalled FastPath CompletePath PathUtilization LDSBankConflict FindAutoCorrelation_reduce0_kernel1_cl__k53_Cypress1 259 { 256 8 1} { 256 1 1} 0,39588 0 11 0 1 32,00 1106,00 256,00 1,00 0,00 0,00 1,90 4,32 51,32 4096,00 0,00 0,00 0,00 0,00 23,00 0,00 100,00 0,00 FindAutoCorrelation_kernel1_cl__k53_Cypress1 259 { 256 8 1} { 256 1 1} 1,87470 3072 8 0 2 32,00 2265,50 256,50 0,50 96,25 2,00 1,05 8,83 35,99 16385,00 0,00 0,00 0,00 0,00 0,00 0,00 0,00 0,00