22 Replies Latest reply on Aug 8, 2011 9:18 AM by Raistmer

    why different fetch size?

    Raistmer
      (repost in more suitable forum)

      I wrote 2 kernels for HD4xxx GPUs without local memory, and another one that uses local memory.
      It happened to be slower. When I looked through the profiler data I noticed a significant discrepancy in the FetchSize column: the values differ by a factor of 4! But the number of fetch instructions is the same.

      Why? Please look at the kernels, host code and profiler reports below.
      (The number of ALU instructions also differs too much, IMHO.)

      Non HD5xxx kernels: __kernel void FindAutoCorrelation_reduce0_kernel1_cl(__global float2* AutoCorrelation, __global float* PartialSumMaxIdx){ const int tid=get_global_id(0); const y=get_global_id(1)*(4*128*1024); float partial_sum=0.0f; float partial_max=0.0; float idx=0.0; __global float2* auto_corr=AutoCorrelation+y; for(int i=tid*256;i<(tid+1)*256;i++){ float tmp=auto_corr[i].x; tmp*=tmp; partial_sum+=tmp; if(i!=0 && tmp>partial_max){ partial_max=tmp; idx=(float)i; } } int did=3*(256*get_global_id(1)+tid); PartialSumMaxIdx[did]=partial_sum; PartialSumMaxIdx[did+1]=partial_max; PartialSumMaxIdx[did+2]=idx; } __kernel void FindAutoCorrelation_reduce1_kernel_cl(__global float* PartialSumMaxIdx, __global float2* result){ const sid=256*get_global_id(0);//only 8 workitems here float full_max=0.0f; float idx=0.0f; float Sum=0.0f; for(int i=0;i<256;i++){ Sum+=PartialSumMaxIdx[3*(sid+i)]; float tmp=PartialSumMaxIdx[(sid+i)*3+1]; if(tmp>full_max){ full_max=tmp; idx=PartialSumMaxIdx[(sid+i)*3+2]; } } result[get_global_id(0)].x=full_max*(128.0f*1024.0f)/Sum;//peak value result[get_global_id(0)].y=idx;//bin } host code: {size_t localThreads[2]={256,1}; size_t globalThreads[2]={256,8};//R: hardwired number of FFTs for now int i=0; err = clSetKernelArg(FindAutoCorrelation_reduce0_kernel_cl,i++,sizeof(cl_mem),(void *)&gpu_WorkData); err |= clSetKernelArg(FindAutoCorrelation_reduce0_kernel_cl,i++,sizeof(cl_mem),(void *)&FFTbuf); if(err) fprintf(stderr,"ERROR: Setting kernel argument:FindAutoCorrelation_reduce0_kernel_cl: %d\n",err); err = clEnqueueNDRangeKernel(cq,FindAutoCorrelation_reduce0_kernel_cl, 2, NULL,globalThreads, localThreads, 0,NULL,NULL); if(err) fprintf(stderr,"ERROR: FindAutoCorrelation_reduce0_kernel_cl: %d\n",err); } { int i=0; size_t localThreads[1]={1}; size_t globalThreads[1]={8};//R: hardwired number of FFTs for now err = clSetKernelArg(FindAutoCorrelation_reduce1_kernel_cl,i++,sizeof(cl_mem),(void *)&FFTbuf); err |= 
clSetKernelArg(FindAutoCorrelation_reduce1_kernel_cl,i++,sizeof(cl_mem),(void *)&gpu_AutoCorrelationResults); if(err) fprintf(stderr,"ERROR: Setting kernel argument:FindAutoCorrelation_reduce1_kernel_cl: %d\n",err); err = clEnqueueNDRangeKernel(cq,FindAutoCorrelation_reduce1_kernel_cl, 1, NULL,globalThreads, localThreads, 0,NULL,NULL); if(err) fprintf(stderr,"ERROR: FindAutoCorrelation_reduce1_kernel_cl: %d\n",err); } HD5xxx kernel: __kernel void FindAutoCorrelation_kernel1_cl(__global float2* AutoCorrelation, __global float2* result){ __local float* PartialSumMaxIdx[256*3]; const int tid=get_local_id(0); const y=get_global_id(1)*(4*128*1024); float partial_sum=0.0f; float partial_max=0.0; float idx=0.0; __global float2* auto_corr=AutoCorrelation+y; for(int i=tid*256;i<(tid+1)*256;i++){ float tmp=auto_corr[i].x; tmp*=tmp; partial_sum+=tmp; if(i!=0 && tmp>partial_max){ partial_max=tmp; idx=(float)i; } } int did=3*tid; PartialSumMaxIdx[did]=partial_sum; PartialSumMaxIdx[did+1]=partial_max; PartialSumMaxIdx[did+2]=idx; if(tid==0){ float full_max=0.0f; float idx=0.0f; float Sum=0.0f; for(int i=0;i<256;i++){ Sum+=PartialSumMaxIdx[3*i]; if(PartialSumMaxIdx[i*3+1]>full_max){ full_max=PartialSumMaxIdx[i*3+1]; idx=PartialSumMaxIdx[i*3+2]; } } result[get_global_id(1)].x=full_max*(128.0f*1024.0f)/Sum;//peak value result[get_global_id(1)].y=idx;//bin } } host code: {size_t localThreads[2]={256,1}; size_t globalThreads[2]={256,8};//R: hardwired number of FFTs for now int i=0; err = clSetKernelArg(FindAutoCorrelation_kernel1_cl,i++,sizeof(cl_mem),(void *)&gpu_WorkData); err |= clSetKernelArg(FindAutoCorrelation_kernel1_cl,i++,sizeof(cl_mem),(void *)&gpu_AutoCorrelationResults); if(err) fprintf(stderr,"ERROR: Setting kernel argument:FindAutoCorrelation_kernel1_cl: %d\n",err); err = clEnqueueNDRangeKernel(cq,FindAutoCorrelation_kernel1_cl, 2, NULL,globalThreads, localThreads, 0,NULL,NULL); if(err) fprintf(stderr,"ERROR: FindAutoCorrelation_reduce0_kernel_cl: %d\n",err); } profiler 
data: Method ExecutionOrder GlobalWorkSize GroupWorkSize Time LocalMemSize DataTransferSize GPRs ScratchRegs FCStacks Wavefronts ALUInsts FetchInsts WriteInsts LDSFetchInsts LDSWriteInsts ALUBusy ALUFetchRatio ALUPacking FetchSize CacheHit FetchUnitBusy FetchUnitStalled WriteUnitStalled FastPath CompletePath PathUtilization LDSBankConflict FindAutoCorrelation_reduce0_kernel1_cl__k53_Cypress1 259 { 256 8 1} { 256 1 1} 0,39588 0 11 0 1 32,00 [b]1106,00[/b] 256,00 1,00 0,00 0,00 1,90 4,32 51,32 [b]4096,00[/b] 0,00 0,00 0,00 0,00 23,00 0,00 100,00 0,00 FindAutoCorrelation_kernel1_cl__k53_Cypress1 259 { 256 8 1} { 256 1 1} 1,87470 3072 8 0 2 32,00 [b]2265,50[/b] 256,50 0,50 96,25 2,00 1,05 8,83 35,99 [b]16385,00[/b] 0,00 0,00 0,00 0,00 0,00 0,00 0,00 0,00

        • why different fetch size?
          MicahVillmow
          Local on HD4XXX is emulated in global! So you are going out to ram with each write/read to local.
          • why different fetch size?
            Raistmer
            Kernel was profiled on HD5xxx AFAIK. I can't profile it on my HD6950 by myself cause HD6xxx unsupported in profiler I have.
            That is, it was profiled on GPU with true local memory.
            • why different fetch size?
              MicahVillmow
              I know there were some performance problems on SDK 2.3 that were fixed in SDK 2.4 and further improved in the upcoming SDK 2.5(you can get a pre-release from a beta driver for the debugger). Is there any specific reason why you are not able to upgrade?
                • why different fetch size?
                  Raistmer
                  Originally posted by: MicahVillmow

                  I know there were some performance problems on SDK 2.3 that were fixed in SDK 2.4 and further improved in the upcoming SDK 2.5(you can get a pre-release from a beta driver for the debugger). Is there any specific reason why you are not able to upgrade?



                  Well, yes, there is big specific reason - I see just HUGE slowdown when upgrading from 11.2 to 11.6 or 11.7 catalyst.
                  I wrote about it in another thread already.
                  Just today I tried Cat 11.6 under secondary OS, Win 7 x64 (initial attempt was under Vista x86). The same. Performance degradation just huge. Also, it's impossible to run 2 app instances together - I get driver restarts very soon.

                  What can be done to fix these issues ?
                • why different fetch size?
                  Raistmer
                  To be more specific:
                  Profiler data was received on:


                  OpenCL Platform Name: ATI Stream
                  Number of devices: 1
                  Max compute units: 18
                  Max work group size: 256
                  Max clock frequency: 870Mhz
                  Max memory allocation: 134217728
                  Cache type: None
                  Cache line size: 0
                  Cache size: 0
                  Global memory size: 536870912
                  Constant buffer size: 65536
                  Max number of constant args: 8
                  Local memory type: Scratchpad
                  Local memory size: 32768
                  Queue properties:
                  Out-of-Order: No
                  Name: Cypress
                  Vendor: Advanced Micro Devices, Inc.
                  Driver version: CAL 1.4.900
                  Version: OpenCL 1.1 ATI-Stream-v2.3 (451)
                  Extensions: cl_amd_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_gl_sharing cl_amd_device_attribute_query cl_amd_printf cl_amd_media_ops cl_amd_popcnt cl_khr_d3d10_sharing


                  Also, HD5 kernel slower on HD6950 too.
                  • why different fetch size?
                    Raistmer
                    Do I need to recompile my host code with SDK 2.4 libraries to get rid of this slowdown under new drivers?
                    • why different fetch size?
                      MicahVillmow
                      The official 11.7 release with SDK 2.5 should behave properly, if not 11.8 will have the fix. I know there was some stability issue that was fixed, but I don't know what catalyst it is getting released under. I know there was a performance regression with barriers that should be fixed in 2.5 SDK.
                      • why different fetch size?
                        MicahVillmow
                        Also, have you tried 11.5? This should be better behaved than 11.6.
                        • why different fetch size?
                          Raistmer
                          And while I do upgrade I would cite one report here:
                          "
                          I'm wondering if Cat 11.6/SDK_2.4 has broken one of the apps kernels, I'm intending to drop down to Cat 11.2/SDK2.3 and redo some of the benches,

                          Downgraded to Cat 11.2/SDK2.3 from Cat 11.6/SDK2.4, and reran bench of r331, app now strongly similar to 6.91 and 6.95"
                          That is, application performed incorrect computations under 11.6 drivers....
                          • why different fetch size?
                            Raistmer
                            No luck. Under 11.5 at least one app reports invalid results that even more unacceptable than low performance.
                            Unroll to 11.2 again....
                            • why different fetch size?
                              Raistmer
                              BTW, why Cat 11.5 (and up) reports
                              Max memory allocation: 209715200
                              Global memory size: 838860800

                              on a 2GB HD6950 GPU. That's a considerably smaller amount, and such a decrease in single-buffer size most probably leads to the errors I see. It's even less than the 256MB available in the initial SDK releases!!!
                              • why different fetch size?
                                MicahVillmow
                                I know there have been some driver changes in the 11.5/11.6 time frame to allow for dual-gpu's and I believe headless monitor support, this might be causing some of the instability as they are large changes.
                                • why different fetch size?
                                  Raistmer
                                  Well, I should discourage you then. According to my info, the HD5970 still can't work properly as a dual GPU. It requires a special sync mode, set via an environment variable, to be used as 2 OpenCL devices, but this leads to 100% CPU consumption.
                                  Sad that AMD hardware so outruns their own driver support.

                                  This is that report:
                                  "
                                  I'm running 2xHD5970, and had to add (unofficial/unsupported) Windows system environment variable GPU_USE_SYNC_OBJETCS=1 for the 2 GPUs to even crunch properly. AMD seems to ignore this dual GPU card completely in their driver updates/fix list, all release notes clearly stated HD5970 supported in single GPU mode only(!!!) So the only stable setup for me is CAL 11.3 and this variable. Either version prior to or after will result in way slower crunch times and/or very unstable system.
                                  "
                                  • why different fetch size?
                                    MicahVillmow
                                    Well, mainly because you will need SDK 2.5 for these features to be enabled, they won't work with SDK 2.4 or 2.3.
                                    • why different fetch size?
                                      Raistmer
                                      So when SDK 2.5 will be available? And any chance to get it earlier?
                                      • why different fetch size?
                                        Raistmer
                                        Today is 2 August — has it been released, or where can the beta be downloaded?
                                        • why different fetch size?
                                          Raistmer
                                          I tried the Cat 11.7 preview; it didn't fix the issues I have with drivers later than 11.2.
                                          Is there any chance that the Cat 11.7 release will be better than the 11.7 preview?
                                          • why different fetch size?
                                            Raistmer
                                            double post
                                            • why different fetch size?
                                              Raistmer
                                              I didn't try it on my own host, but I have already got 2 reports that the 11.8 preview still has increased CPU usage compared to pre-11.7 drivers. This increased CPU usage also negatively affects total runtime.
                                              Any chance that this problem will be addressed in 11.8 release? Or maybe some changes to app required to make it "VM-driver compatible" ?