Hi,
I have a problem using local memory in a kernel to optimize performance.
I am not getting any output from the kernel. Please correct me if I am doing anything weird with the __local qualifier. Output buffer size = 1280x720x3.
global thread=1280x720
local thread =16x16
GPU:ATI HD 5850,sdk 2.1, ubuntu 9.10.
#pragma OPENCL EXTENSION cl_amd_printf : enable #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable #define LOCAL_ROW_SIZE 16 #define LOCAL_COL_SIZE 16 __kernel void kernel(__global unsigned char * output, const int rc,const int gc,const int bc , const int ravg,const int gavg,const int bavg, const int ra,const int ga,const int ba) { int B[3]; uint j; __local unsigned char local_temp[LOCAL_ROW_SIZE][LOCAL_COL_SIZE*3]; uint tx = get_global_id(0) * 3; uint ty = get_global_id(1) * 3; uint lx = get_local_id(0); uint ly = get_local_id(1); uint gx = get_group_id(0); uint gy = get_group_id(1); local_temp[ly][lx * 3 + 0] = output[ty * 1280 + tx + 0]; local_temp[ly][lx * 3 + 1] = output[ty * 1280 + tx + 1]; local_temp[ly][lx * 3 + 2] = output[ty * 1280 + tx + 2]; barrier(CLK_LOCAL_MEM_FENCE); B[0] = ((local_temp[ly][lx * 3 + 0] - rc) * ravg) + ra; B[1] = ((local_temp[ly][lx * 3 + 1] - gc) * gavg) + ga; B[2] = ((local_temp[ly][lx * 3 + 2] - bc) * bavg) + ba; for(j=0;j<3;j++) { if(B
< 0) B = 0; if(B > 255) B = 255; output[ty * 1280 + tx + j ] = B ; } }
Hi
The kernel works on the CPU, but I am puzzled as to why it does not work on the GPU. Please let me know if I have to change something for the GPU.
Thanks in Advance
What do you mean by no output? Are you getting some error? or incorrect output?
Please make sure you are doing proper error checking.
Also, it is easier to track down the problem if you provide the runtime code too (a compilable test case).
Hi Omkar,
The output buffer from the kernel is all zeros on the GPU. It seems there is some issue with using __local, where data from local memory is not copied to the output buffer or vice versa.
This kernel is part of a big project, so I guess it may not be feasible to send the complete code and setup.
I have also tried the following code before barrier(), in place of the loads in the previous kernel, but the output is still all zeros. Are there any atomicity issues with calculate, read, write and so on? I am clueless as to why it is not working on the GPU.
local_temp[ly][lx * 3 + 0] = output[(gy * 1280 * 3 * WG_HEIGHT_SIZE ) + (gx * 3 * WG_WIDTH_SIZE ) + ((ly * 3 * 1280) + lx * 3 + 0) ];
local_temp[ly][lx * 3 + 1] = output[(gy * 1280 * 3 * WG_HEIGHT_SIZE ) + (gx * 3 * WG_WIDTH_SIZE ) + ((ly * 3 * 1280) + lx * 3 + 1) ];
local_temp[ly][lx * 3 + 2] = output[(gy * 1280 * 3 * WG_HEIGHT_SIZE ) + (gx * 3 * WG_WIDTH_SIZE ) + ((ly * 3 * 1280) + lx * 3 + 2) ];
Actually, I am trying to modify the kernel below to use __local memory in order to understand the concept, so that I can apply __local memory for performance in other parts of the program.
**********************************************
/*
 * Applies a per-channel linear adjustment, in place, to the three bytes
 * starting at get_global_id(0) * 3 in the output buffer.
 *
 * For each byte the result is ((value - c) * avg) + a, using
 * (rc, ravg, ra), (gc, gavg, ga) and (bc, bavg, ba) for byte 0, 1 and 2
 * respectively, clamped to the range 0..255 before being stored.
 */
__kernel void kernel(__global unsigned char * output,
                     const int rc, const int gc, const int bc,
                     const int ravg, const int gavg, const int bavg,
                     const int ra, const int ga, const int ba)
{
    int B[3];
    uint j;
    uint tx = get_global_id(0) * 3;

    B[0] = ((output[tx] - rc) * ravg) + ra;
    B[1] = ((output[tx + 1] - gc) * gavg) + ga;
    B[2] = ((output[tx + 2] - bc) * bavg) + ba;

    /* Clamp each channel to the unsigned-byte range and write it back. */
    for (j = 0; j < 3; j++) {
        if (B[j] < 0)
            B[j] = 0;
        if (B[j] > 255)
            B[j] = 255;
        output[tx + j] = B[j];
    }
}
***********************************
Thanks
Pavan
Did you check your buildlog? Any warnings or errors?
You are using extension cl_amd_printf, which is unsupported on GPU.
It's difficult to track down the issue without the runtime code. If you can create a small test case which reproduces the issue, it will help us understand the problem more easily.
Omkar,
Yes, you are right, and thanks. The cl_amd_printf extension was the issue. It is working now.
I guess I have used __local properly, right?
Thanks
Pavan