Non HD5xxx kernels: __kernel void FindAutoCorrelation_reduce0_kernel1_cl(__global float2* AutoCorrelation, __global float* PartialSumMaxIdx){ const int tid=get_global_id(0); const y=get_global_id(1)*(4*128*1024); float partial_sum=0.0f; float partial_max=0.0; float idx=0.0; __global float2* auto_corr=AutoCorrelation+y; for(int i=tid*256;i<(tid+1)*256;i++){ float tmp=auto_corr.x; tmp*=tmp; partial_sum+=tmp; if(i!=0 && tmp>partial_max){ partial_max=tmp; idx=(float)i; } } int did=3*(256*get_global_id(1)+tid); PartialSumMaxIdx[did]=partial_sum; PartialSumMaxIdx[did+1]=partial_max; PartialSumMaxIdx[did+2]=idx; } __kernel void FindAutoCorrelation_reduce1_kernel_cl(__global float* PartialSumMaxIdx, __global float2* result){ const sid=256*get_global_id(0);//only 8 workitems here float full_max=0.0f; float idx=0.0f; float Sum=0.0f; for(int i=0;i<256;i++){ Sum+=PartialSumMaxIdx[3*(sid+i)]; float tmp=PartialSumMaxIdx[(sid+i)*3+1]; if(tmp>full_max){ full_max=tmp; idx=PartialSumMaxIdx[(sid+i)*3+2]; } } result[get_global_id(0)].x=full_max*(128.0f*1024.0f)/Sum;//peak value result[get_global_id(0)].y=idx;//bin } host code: {size_t localThreads[2]={256,1}; size_t globalThreads[2]={256,8};//R: hardwired number of FFTs for now int i=0; err = clSetKernelArg(FindAutoCorrelation_reduce0_kernel_cl,i++,sizeof(cl_mem),(void *)&gpu_WorkData); err |= clSetKernelArg(FindAutoCorrelation_reduce0_kernel_cl,i++,sizeof(cl_mem),(void *)&FFTbuf); if(err) fprintf(stderr,"ERROR: Setting kernel argument:FindAutoCorrelation_reduce0_kernel_cl: %d\n",err); err = clEnqueueNDRangeKernel(cq,FindAutoCorrelation_reduce0_kernel_cl, 2, NULL,globalThreads, localThreads, 0,NULL,NULL); if(err) fprintf(stderr,"ERROR: FindAutoCorrelation_reduce0_kernel_cl: %d\n",err); } { int i=0; size_t localThreads[1]={1}; size_t globalThreads[1]={8};//R: hardwired number of FFTs for now err = clSetKernelArg(FindAutoCorrelation_reduce1_kernel_cl,i++,sizeof(cl_mem),(void *)&FFTbuf); err |= clSetKernelArg(FindAutoCorrelation_reduce1_kernel_cl,i++,sizeof(cl_mem),(void *)&gpu_AutoCorrelationResults); if(err) fprintf(stderr,"ERROR: Setting kernel argument:FindAutoCorrelation_reduce1_kernel_cl: %d\n",err); err = clEnqueueNDRangeKernel(cq,FindAutoCorrelation_reduce1_kernel_cl, 1, NULL,globalThreads, localThreads, 0,NULL,NULL); if(err) fprintf(stderr,"ERROR: FindAutoCorrelation_reduce1_kernel_cl: %d\n",err); } HD5xxx kernel: __kernel void FindAutoCorrelation_kernel1_cl(__global float2* AutoCorrelation, __global float2* result){ __local float* PartialSumMaxIdx[256*3]; const int tid=get_local_id(0); const y=get_global_id(1)*(4*128*1024); float partial_sum=0.0f; float partial_max=0.0; float idx=0.0; __global float2* auto_corr=AutoCorrelation+y; for(int i=tid*256;i<(tid+1)*256;i++){ float tmp=auto_corr.x; tmp*=tmp; partial_sum+=tmp; if(i!=0 && tmp>partial_max){ partial_max=tmp; idx=(float)i; } } int did=3*tid; PartialSumMaxIdx[did]=partial_sum; PartialSumMaxIdx[did+1]=partial_max; PartialSumMaxIdx[did+2]=idx; if(tid==0){ float full_max=0.0f; float idx=0.0f; float Sum=0.0f; for(int i=0;i<256;i++){ Sum+=PartialSumMaxIdx[3*i]; if(PartialSumMaxIdx[i*3+1]>full_max){ full_max=PartialSumMaxIdx[i*3+1]; idx=PartialSumMaxIdx[i*3+2]; } } result[get_global_id(1)].x=full_max*(128.0f*1024.0f)/Sum;//peak value result[get_global_id(1)].y=idx;//bin } } host code: {size_t localThreads[2]={256,1}; size_t globalThreads[2]={256,8};//R: hardwired number of FFTs for now int i=0; err = clSetKernelArg(FindAutoCorrelation_kernel1_cl,i++,sizeof(cl_mem),(void *)&gpu_WorkData); err |= clSetKernelArg(FindAutoCorrelation_kernel1_cl,i++,sizeof(cl_mem),(void *)&gpu_AutoCorrelationResults); if(err) fprintf(stderr,"ERROR: Setting kernel argument:FindAutoCorrelation_kernel1_cl: %d\n",err); err = clEnqueueNDRangeKernel(cq,FindAutoCorrelation_kernel1_cl, 2, NULL,globalThreads, localThreads, 0,NULL,NULL); if(err) fprintf(stderr,"ERROR: FindAutoCorrelation_reduce0_kernel_cl: %d\n",err); } profiler data: Method ExecutionOrder GlobalWorkSize GroupWorkSize Time LocalMemSize DataTransferSize GPRs ScratchRegs FCStacks Wavefronts ALUInsts FetchInsts WriteInsts LDSFetchInsts LDSWriteInsts ALUBusy ALUFetchRatio ALUPacking FetchSize CacheHit FetchUnitBusy FetchUnitStalled WriteUnitStalled FastPath CompletePath PathUtilization LDSBankConflict FindAutoCorrelation_reduce0_kernel1_cl__k53_Cypress1 259 { 256 8 1} { 256 1 1} 0,39588 0 11 0 1 32,00 1106,00 256,00 1,00 0,00 0,00 1,90 4,32 51,32 4096,00 0,00 0,00 0,00 0,00 23,00 0,00 100,00 0,00 FindAutoCorrelation_kernel1_cl__k53_Cypress1 259 { 256 8 1} { 256 1 1} 1,87470 3072 8 0 2 32,00 2265,50 256,50 0,50 96,25 2,00 1,05 8,83 35,99 16385,00 0,00 0,00 0,00 0,00 0,00 0,00 0,00 0,00
Originally posted by: MicahVillmow
I know there were some performance problems on SDK 2.3 that were fixed in SDK 2.4 and further improved in the upcoming SDK 2.5(you can get a pre-release from a beta driver for the debugger). Is there any specific reason why you are not able to upgrade?
Originally posted by: MicahVillmow
Also, have you tried 11.5? This should be better behaved than 11.6.
To the end of this month must go out
well APP 2.5 is part of Windows catalyst 11.7. wonder when we will see APP SDK 2.5 for windows/linux.