22 Replies Latest reply on Aug 8, 2011 9:18 AM by Raistmer

    why different fetch size?

    Raistmer
      (repost in more suitable forum)

      I wrote 2 kernels for HD4xxx GPUs without local memory, and another one that uses local memory.
      It happened to be slower. When I looked through the profiler data I noticed a significant discrepancy in the FetchSize column: the values differ by a factor of 4! But the number of fetch instructions is the same.

      Why? Please look at the kernels, host code and profiler reports below.
      (The number of ALU instructions also differs too much, IMHO.)

      Non HD5xxx kernels: __kernel void FindAutoCorrelation_reduce0_kernel1_cl(__global float2* AutoCorrelation, __global float* PartialSumMaxIdx){ const int tid=get_global_id(0); const y=get_global_id(1)*(4*128*1024); float partial_sum=0.0f; float partial_max=0.0; float idx=0.0; __global float2* auto_corr=AutoCorrelation+y; for(int i=tid*256;i<(tid+1)*256;i++){ float tmp=auto_corr[i].x; tmp*=tmp; partial_sum+=tmp; if(i!=0 && tmp>partial_max){ partial_max=tmp; idx=(float)i; } } int did=3*(256*get_global_id(1)+tid); PartialSumMaxIdx[did]=partial_sum; PartialSumMaxIdx[did+1]=partial_max; PartialSumMaxIdx[did+2]=idx; } __kernel void FindAutoCorrelation_reduce1_kernel_cl(__global float* PartialSumMaxIdx, __global float2* result){ const sid=256*get_global_id(0);//only 8 workitems here float full_max=0.0f; float idx=0.0f; float Sum=0.0f; for(int i=0;i<256;i++){ Sum+=PartialSumMaxIdx[3*(sid+i)]; float tmp=PartialSumMaxIdx[(sid+i)*3+1]; if(tmp>full_max){ full_max=tmp; idx=PartialSumMaxIdx[(sid+i)*3+2]; } } result[get_global_id(0)].x=full_max*(128.0f*1024.0f)/Sum;//peak value result[get_global_id(0)].y=idx;//bin } host code: {size_t localThreads[2]={256,1}; size_t globalThreads[2]={256,8};//R: hardwired number of FFTs for now int i=0; err = clSetKernelArg(FindAutoCorrelation_reduce0_kernel_cl,i++,sizeof(cl_mem),(void *)&gpu_WorkData); err |= clSetKernelArg(FindAutoCorrelation_reduce0_kernel_cl,i++,sizeof(cl_mem),(void *)&FFTbuf); if(err) fprintf(stderr,"ERROR: Setting kernel argument:FindAutoCorrelation_reduce0_kernel_cl: %d\n",err); err = clEnqueueNDRangeKernel(cq,FindAutoCorrelation_reduce0_kernel_cl, 2, NULL,globalThreads, localThreads, 0,NULL,NULL); if(err) fprintf(stderr,"ERROR: FindAutoCorrelation_reduce0_kernel_cl: %d\n",err); } { int i=0; size_t localThreads[1]={1}; size_t globalThreads[1]={8};//R: hardwired number of FFTs for now err = clSetKernelArg(FindAutoCorrelation_reduce1_kernel_cl,i++,sizeof(cl_mem),(void *)&FFTbuf); err |= 
clSetKernelArg(FindAutoCorrelation_reduce1_kernel_cl,i++,sizeof(cl_mem),(void *)&gpu_AutoCorrelationResults); if(err) fprintf(stderr,"ERROR: Setting kernel argument:FindAutoCorrelation_reduce1_kernel_cl: %d\n",err); err = clEnqueueNDRangeKernel(cq,FindAutoCorrelation_reduce1_kernel_cl, 1, NULL,globalThreads, localThreads, 0,NULL,NULL); if(err) fprintf(stderr,"ERROR: FindAutoCorrelation_reduce1_kernel_cl: %d\n",err); } HD5xxx kernel: __kernel void FindAutoCorrelation_kernel1_cl(__global float2* AutoCorrelation, __global float2* result){ __local float* PartialSumMaxIdx[256*3]; const int tid=get_local_id(0); const y=get_global_id(1)*(4*128*1024); float partial_sum=0.0f; float partial_max=0.0; float idx=0.0; __global float2* auto_corr=AutoCorrelation+y; for(int i=tid*256;i<(tid+1)*256;i++){ float tmp=auto_corr[i].x; tmp*=tmp; partial_sum+=tmp; if(i!=0 && tmp>partial_max){ partial_max=tmp; idx=(float)i; } } int did=3*tid; PartialSumMaxIdx[did]=partial_sum; PartialSumMaxIdx[did+1]=partial_max; PartialSumMaxIdx[did+2]=idx; if(tid==0){ float full_max=0.0f; float idx=0.0f; float Sum=0.0f; for(int i=0;i<256;i++){ Sum+=PartialSumMaxIdx[3*i]; if(PartialSumMaxIdx[i*3+1]>full_max){ full_max=PartialSumMaxIdx[i*3+1]; idx=PartialSumMaxIdx[i*3+2]; } } result[get_global_id(1)].x=full_max*(128.0f*1024.0f)/Sum;//peak value result[get_global_id(1)].y=idx;//bin } } host code: {size_t localThreads[2]={256,1}; size_t globalThreads[2]={256,8};//R: hardwired number of FFTs for now int i=0; err = clSetKernelArg(FindAutoCorrelation_kernel1_cl,i++,sizeof(cl_mem),(void *)&gpu_WorkData); err |= clSetKernelArg(FindAutoCorrelation_kernel1_cl,i++,sizeof(cl_mem),(void *)&gpu_AutoCorrelationResults); if(err) fprintf(stderr,"ERROR: Setting kernel argument:FindAutoCorrelation_kernel1_cl: %d\n",err); err = clEnqueueNDRangeKernel(cq,FindAutoCorrelation_kernel1_cl, 2, NULL,globalThreads, localThreads, 0,NULL,NULL); if(err) fprintf(stderr,"ERROR: FindAutoCorrelation_reduce0_kernel_cl: %d\n",err); } profiler 
data: Method ExecutionOrder GlobalWorkSize GroupWorkSize Time LocalMemSize DataTransferSize GPRs ScratchRegs FCStacks Wavefronts ALUInsts FetchInsts WriteInsts LDSFetchInsts LDSWriteInsts ALUBusy ALUFetchRatio ALUPacking FetchSize CacheHit FetchUnitBusy FetchUnitStalled WriteUnitStalled FastPath CompletePath PathUtilization LDSBankConflict FindAutoCorrelation_reduce0_kernel1_cl__k53_Cypress1 259 { 256 8 1} { 256 1 1} 0,39588 0 11 0 1 32,00 [b]1106,00[/b] 256,00 1,00 0,00 0,00 1,90 4,32 51,32 [b]4096,00[/b] 0,00 0,00 0,00 0,00 23,00 0,00 100,00 0,00 FindAutoCorrelation_kernel1_cl__k53_Cypress1 259 { 256 8 1} { 256 1 1} 1,87470 3072 8 0 2 32,00 [b]2265,50[/b] 256,50 0,50 96,25 2,00 1,05 8,83 35,99 [b]16385,00[/b] 0,00 0,00 0,00 0,00 0,00 0,00 0,00 0,00

        • why different fetch size?
          MicahVillmow
          Local on HD4XXX is emulated in global! So you are going out to ram with each write/read to local.
          • why different fetch size?
            Raistmer
            Kernel was profiled on HD5xxx AFAIK. I can't profile it on my HD6950 by myself cause HD6xxx unsupported in profiler I have.
            That is, it was profiled on GPU with true local memory.
            • why different fetch size?
              MicahVillmow
              I know there were some performance problems on SDK 2.3 that were fixed in SDK 2.4 and further improved in the upcoming SDK 2.5(you can get a pre-release from a beta driver for the debugger). Is there any specific reason why you are not able to upgrade?
                • why different fetch size?
                  Raistmer
                  Originally posted by: MicahVillmow

                  I know there were some performance problems on SDK 2.3 that were fixed in SDK 2.4 and further improved in the upcoming SDK 2.5(you can get a pre-release from a beta driver for the debugger). Is there any specific reason why you are not able to upgrade?



                  Well, yes, there is big specific reason - I see just HUGE slowdown when upgrading from 11.2 to 11.6 or 11.7 catalyst.
                  I wrote about it in another thread already.
                  Just today I tried Cat 11.6 under secondary OS, Win 7 x64 (initial attempt was under Vista x86). The same. Performance degradation just huge. Also, it's impossible to run 2 app instances together - I get driver restarts very soon.

                  What can be done to fix these issues ?
                • why different fetch size?
                  Raistmer
                  To be more specific:
                  Profiler data was received on:


                  OpenCL Platform Name: ATI Stream
                  Number of devices: 1
                  Max compute units: 18
                  Max work group size: 256
                  Max clock frequency: 870Mhz
                  Max memory allocation: 134217728
                  Cache type: None
                  Cache line size: 0
                  Cache size: 0
                  Global memory size: 536870912
                  Constant buffer size: 65536
                  Max number of constant args: 8
                  Local memory type: Scratchpad
                  Local memory size: 32768
                  Queue properties:
                  Out-of-Order: No
                  Name: Cypress
                  Vendor: Advanced Micro Devices, Inc.
                  Driver version: CAL 1.4.900
                  Version: OpenCL 1.1 ATI-Stream-v2.3 (451)
                  Extensions: cl_amd_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_gl_sharing cl_amd_device_attribute_query cl_amd_printf cl_amd_media_ops cl_amd_popcnt cl_khr_d3d10_sharing


                  Also, HD5 kernel slower on HD6950 too.
                  • why different fetch size?
                    Raistmer
                    Do I need to recompile my host code with SDK 2.4 libraries to get rid of this slowdown under new drivers?
                    • why different fetch size?
                      MicahVillmow
                      The official 11.7 release with SDK 2.5 should behave properly, if not 11.8 will have the fix. I know there was some stability issue that was fixed, but I don't know what catalyst it is getting released under. I know there was a performance regression with barriers that should be fixed in 2.5 SDK.
                      • why different fetch size?
                        MicahVillmow
                        Also, have you tried 11.5? This should be better behaved than 11.6.
                        • why different fetch size?
                          Raistmer
                          And while I do upgrade I would cite one report here:
                          "
                          I'm wondering if Cat 11.6/SDK_2.4 has broken one of the apps kernels, I'm intending to drop down to Cat 11.2/SDK2.3 and redo some of the benches,

                          Downgraded to Cat 11.2/SDK2.3 from Cat 11.6/SDK2.4, and reran bench of r331, app now strongly similar to 6.91 and 6.95"
                          That is, application performed incorrect computations under 11.6 drivers....
                          • why different fetch size?
                            Raistmer
                            No luck. Under 11.5 at least one app reports invalid results that even more unacceptable than low performance.
                            Unroll to 11.2 again....
                            • why different fetch size?
                              Raistmer
                              BTW, why Cat 11.5 (and up) reports
                              Max memory allocation: 209715200
                              Global memory size: 838860800

                              on a 2GB HD6950 GPU. That's a considerably smaller amount, and such a decrease in single-buffer size most probably leads to the errors I see. It's even less than the 256MB available in the initial SDK releases!!!
                              • why different fetch size?
                                MicahVillmow
                                I know there have been some driver changes in the 11.5/11.6 time frame to allow for dual-gpu's and I believe headless monitor support, this might be causing some of the instability as they are large changes.
                                • why different fetch size?
                                  Raistmer
                                  Well, I should discourage you then. According to my info, the HD5970 still can't work properly as a dual GPU. It requires a special sync mode, set via an environment variable, to be used as 2 OpenCL devices, but this leads to 100% CPU consumption.
                                  Sad that AMD hardware so outruns their own driver support.

                                  This is that report:
                                  "
                                  I'm running 2xHD5970, and had to add (unofficial/unsupported) Windows system environment variable GPU_USE_SYNC_OBJETCS=1 for the 2 GPUs to even crunch properly. AMD seems to ignore this dual GPU card completely in their driver updates/fix list, all release notes clearly stated HD5970 supported in single GPU mode only(!!!) So the only stable setup for me is CAL 11.3 and this variable. Either version prior to or after will result in way slower crunch times and/or very unstable system.
                                  "
                                  • why different fetch size?
                                    MicahVillmow
                                    Well, mainly because you will need SDK 2.5 for these features to be enabled, they won't work with SDK 2.4 or 2.3.
                                    • why different fetch size?
                                      Raistmer
                                      So when SDK 2.5 will be available? And any chance to get it earlier?
                                      • why different fetch size?
                                        Raistmer
                                        Today is 2 August — has it been released, or where can the beta be downloaded?
                                        • why different fetch size?
                                          Raistmer
                                          I tried the Cat 11.7 preview; it didn't fix the issues I have with drivers later than 11.2.
                                          Is there any chance that the Cat 11.7 release will be better than the 11.7 preview?
                                          • why different fetch size?
                                            Raistmer
                                            double post
                                            • why different fetch size?
                                              Raistmer
                                              I didn't try it on my own host, but I have already got 2 reports that the 11.8 preview still has increased CPU usage compared to pre-11.7 drivers. This increased CPU usage also negatively affects total runtime.
                                              Any chance that this problem will be addressed in 11.8 release? Or maybe some changes to app required to make it "VM-driver compatible" ?