Archives Discussions

Raistmer · ‎07-10-2011

fetched size differs 4 times between 2 similar kernels - why?

I wrote 2 kernels for HD4xxx GPUs w/o local memory and anothe one that uses local memory.
It happened to be slower. When I looked through profiler data I noticed significant discrepance in FetchSize column. They differs in 4 times! But number of fetch instructions is the same.

Why? Please, look on kernels, host code and profiler reports....
(also, number of ALU instructions differs too much IMHO too)

Non HD5xxx kernels: __kernel void FindAutoCorrelation_reduce0_kernel1_cl(__global float2* AutoCorrelation, __global float* PartialSumMaxIdx){ const int tid=get_global_id(0); const y=get_global_id(1)*(4*128*1024); float partial_sum=0.0f; float partial_max=0.0; float idx=0.0; __global float2* auto_corr=AutoCorrelation+y; for(int i=tid*256;i<(tid+1)*256;i++){ float tmp=auto_corr.x; tmp*=tmp; partial_sum+=tmp; if(i!=0 && tmp>partial_max){ partial_max=tmp; idx=(float)i; } } int did=3*(256*get_global_id(1)+tid); PartialSumMaxIdx[did]=partial_sum; PartialSumMaxIdx[did+1]=partial_max; PartialSumMaxIdx[did+2]=idx; } __kernel void FindAutoCorrelation_reduce1_kernel_cl(__global float* PartialSumMaxIdx, __global float2* result){ const sid=256*get_global_id(0);//only 8 workitems here float full_max=0.0f; float idx=0.0f; float Sum=0.0f; for(int i=0;i<256;i++){ Sum+=PartialSumMaxIdx[3*(sid+i)]; float tmp=PartialSumMaxIdx[(sid+i)*3+1]; if(tmp>full_max){ full_max=tmp; idx=PartialSumMaxIdx[(sid+i)*3+2]; } } result[get_global_id(0)].x=full_max*(128.0f*1024.0f)/Sum;//peak value result[get_global_id(0)].y=idx;//bin } host code: {size_t localThreads[2]={256,1}; size_t globalThreads[2]={256,8};//R: hardwired number of FFTs for now int i=0; err = clSetKernelArg(FindAutoCorrelation_reduce0_kernel_cl,i++,sizeof(cl_mem),(void *)&gpu_WorkData); err |= clSetKernelArg(FindAutoCorrelation_reduce0_kernel_cl,i++,sizeof(cl_mem),(void *)&FFTbuf); if(err) fprintf(stderr,"ERROR: Setting kernel argument:FindAutoCorrelation_reduce0_kernel_cl: %d\n",err); err = clEnqueueNDRangeKernel(cq,FindAutoCorrelation_reduce0_kernel_cl, 2, NULL,globalThreads, localThreads, 0,NULL,NULL); if(err) fprintf(stderr,"ERROR: FindAutoCorrelation_reduce0_kernel_cl: %d\n",err); } { int i=0; size_t localThreads[1]={1}; size_t globalThreads[1]={8};//R: hardwired number of FFTs for now err = clSetKernelArg(FindAutoCorrelation_reduce1_kernel_cl,i++,sizeof(cl_mem),(void *)&FFTbuf); err |= clSetKernelArg(FindAutoCorrelation_reduce1_kernel_cl,i++,sizeof(cl_mem),(void *)&gpu_AutoCorrelationResults); if(err) fprintf(stderr,"ERROR: Setting kernel argument:FindAutoCorrelation_reduce1_kernel_cl: %d\n",err); err = clEnqueueNDRangeKernel(cq,FindAutoCorrelation_reduce1_kernel_cl, 1, NULL,globalThreads, localThreads, 0,NULL,NULL); if(err) fprintf(stderr,"ERROR: FindAutoCorrelation_reduce1_kernel_cl: %d\n",err); } HD5xxx kernel: __kernel void FindAutoCorrelation_kernel1_cl(__global float2* AutoCorrelation, __global float2* result){ __local float* PartialSumMaxIdx[256*3]; const int tid=get_local_id(0); const y=get_global_id(1)*(4*128*1024); float partial_sum=0.0f; float partial_max=0.0; float idx=0.0; __global float2* auto_corr=AutoCorrelation+y; for(int i=tid*256;i<(tid+1)*256;i++){ float tmp=auto_corr.x; tmp*=tmp; partial_sum+=tmp; if(i!=0 && tmp>partial_max){ partial_max=tmp; idx=(float)i; } } int did=3*tid; PartialSumMaxIdx[did]=partial_sum; PartialSumMaxIdx[did+1]=partial_max; PartialSumMaxIdx[did+2]=idx; if(tid==0){ float full_max=0.0f; float idx=0.0f; float Sum=0.0f; for(int i=0;i<256;i++){ Sum+=PartialSumMaxIdx[3*i]; if(PartialSumMaxIdx[i*3+1]>full_max){ full_max=PartialSumMaxIdx[i*3+1]; idx=PartialSumMaxIdx[i*3+2]; } } result[get_global_id(1)].x=full_max*(128.0f*1024.0f)/Sum;//peak value result[get_global_id(1)].y=idx;//bin } } host code: {size_t localThreads[2]={256,1}; size_t globalThreads[2]={256,8};//R: hardwired number of FFTs for now int i=0; err = clSetKernelArg(FindAutoCorrelation_kernel1_cl,i++,sizeof(cl_mem),(void *)&gpu_WorkData); err |= clSetKernelArg(FindAutoCorrelation_kernel1_cl,i++,sizeof(cl_mem),(void *)&gpu_AutoCorrelationResults); if(err) fprintf(stderr,"ERROR: Setting kernel argument:FindAutoCorrelation_kernel1_cl: %d\n",err); err = clEnqueueNDRangeKernel(cq,FindAutoCorrelation_kernel1_cl, 2, NULL,globalThreads, localThreads, 0,NULL,NULL); if(err) fprintf(stderr,"ERROR: FindAutoCorrelation_reduce0_kernel_cl: %d\n",err); } profiler data: Method ExecutionOrder GlobalWorkSize GroupWorkSize Time LocalMemSize DataTransferSize GPRs ScratchRegs FCStacks Wavefronts ALUInsts FetchInsts WriteInsts LDSFetchInsts LDSWriteInsts ALUBusy ALUFetchRatio ALUPacking FetchSize CacheHit FetchUnitBusy FetchUnitStalled WriteUnitStalled FastPath CompletePath PathUtilization LDSBankConflict FindAutoCorrelation_reduce0_kernel1_cl__k53_Cypress1 259 { 256 8 1} { 256 1 1} 0,39588 0 11 0 1 32,00 1106,00 256,00 1,00 0,00 0,00 1,90 4,32 51,32 4096,00 0,00 0,00 0,00 0,00 23,00 0,00 100,00 0,00 FindAutoCorrelation_kernel1_cl__k53_Cypress1 259 { 256 8 1} { 256 1 1} 1,87470 3072 8 0 2 32,00 2265,50 256,50 0,50 96,25 2,00 1,05 8,83 35,99 16385,00 0,00 0,00 0,00 0,00 0,00 0,00 0,00 0,00

Archives Discussions

Different fetch size for same work