I have a problem with my kernel's output: the kernel works well when I use one work-group, but if I use more than one I can't make sense of the result. For example, if I use 2 work-groups and do the following:
this the kernel code: #define BLOCK_SIZE 16 #define BLOCK_COL 4 #define BLOCK_SIZE 16 #define BLOCK_COL 3 __kernel void exmple1( const __global float * C1, __global float * O, const int col, const int hard) { int ar = get_global_id(0); __local float C[BLOCK_SIZE][BLOCK_COL]; if(ar < col * hard) // col =4, hard=3 { C[ar/col][ar%col] = C1[ar]; // col=4 } barrier(CLK_LOCAL_MEM_FENCE); O[ar]= C[0][3] ; // I update this and C[0][3] = 1 } the result will differ between the 2 work groups, for example if I have 8 work items(4 work item in each work group) the the result will be as following: 0= 1 1= 1 2= 1 3= 1 4= 0 5= 0 6= 0 7= 0 the first 4 result is true but the other 4 is wrong since it should be '1'
I guess you are misunderstanding how barriers work in OpenCL.
Barriers only synchronize work-items inside the same work-group. No synchronization exists between different work-groups.
If that doesn't help you, it would be easier to understand your post if it was better formatted and you supplied some info on what you are trying to do...
My idea was to copy the C1 array into local memory so that each work-group has its own copy. So how can I copy C1 into each work-group and then use the local C in the computation within each work-group?
// Kernel body as posted (the __kernel signature is not part of this excerpt).
// Stages the global array C1 into the __local array C, then writes C[0][3]
// to the output buffer O for every work-item.
{
// Global work-item id in dimension 0 (unique across all work-groups).
int ar = get_global_id(0);
// __local storage: each work-group has its own independent copy of C.
__local float C[BLOCK_SIZE][BLOCK_COL];
if(ar < col * hard) // col =4, hard=3
{
// NOTE(review): C is indexed with the global id, so (assuming 4 work-items per
// group and col = 4, as described in the post) only the first group fills row 0;
// the second group writes C[1][0..3] in its own copy and never sets C[0][3].
C[ar/col][ar%col] = C1[ar]; // col=4; copy C1 to 2-dim array C
}
// Synchronizes only the work-items of the same work-group, not across groups.
barrier(CLK_LOCAL_MEM_FENCE);
O[ar]= C[0][3] ; // O is output buffer & C[0][3] = 1
}
// result
Here C1 is copied only for the first work-group; how do I make a copy for each work-group?
0= 1
1= 1
2= 1
3= 1
4= 0
5= 0
6= 0
7= 0
The first 4 results are correct, but the other 4 are wrong, since they should be '1'.
Originally posted by: tameem My idea was to copy C1 array to a local memory and to make each work-group has its own copy, so how I can copy C1 to each work-group and then use the local C in the computation in each work-group?.
{ int ar = get_global_id(0); __local float C[BLOCK_SIZE][BLOCK_COL]; if(ar < col * hard) // col =4, hard=3 { C[ar/col][ar%col] = C1[ar]; // col=4; copy C1 to 2-dim array C } barrier(CLK_LOCAL_MEM_FENCE); O[ar]= C[0][3] ; // O is output buffer & C[0][3] = 1 }
// result here the C1 is copied only for the first work-group how to make a copy for each work-group 0= 1 1= 1 2= 1 3= 1 4= 0 5= 0 6= 0 7= 0 the first 4 result is true but the other 4 is wrong since it should be '1'
It looks like everything is ok from your code. Could you please copy runtime code also here?
I think I need to do something for each work-group: when I replace the line O[ar] = C[0][3] with O[ar] = C1[3] (global memory), it works fine. But when I use the first assignment, which reads from local memory, it works for the first work-group and gives 0 for the second. If I increase the number of work-groups, it still gives the correct result only for the first work-group.
Buffer bufferC = Buffer(context,CL_MEM_READ_ONLY ,sizeof(cl_float) * col * hard); Buffer bufferO = Buffer(context,CL_MEM_WRITE_ONLY ,sizeof(cl_float) * row); queue.enqueueWriteBuffer(bufferC, CL_TRUE, 0, sizeof(float) * col * hard, C); kernel.setArg(1, bufferC); kernel.setArg(2, bufferO); kernel.setArg(3, col); kernel.setArg(4, hard); Event event; NDRange globalNDRange(8); //Total number of work items NDRange localNDRange(4); //Work items in each work-group queue.enqueueNDRangeKernel(kernel, NDRange(), globalNDRange, localNDRange, NULL, &event); float *O = new float[8]; queue.enqueueReadBuffer(bufferO, CL_TRUE, 0, sizeof(float) * 8, O ); for(int i=0; i<8; i++) { cout << i << "= " << O << endl; } ---------------------------------------------------------------------------------------------------------- //kernel code #define BLOCK_SIZE 16 #define BLOCK_COL 4 __kernel void exmple1( const __global float * C1, __global float * O, const int col, const int hard) { int ar = get_global_id(0); __local float C[BLOCK_SIZE][BLOCK_COL]; //copy c array.............. if(ar < col * hard)// col =4, hard=3 { C[ar/col][ar%col] = C1[ar]; // col=4; copy C1 to 2-dim array C } barrier(CLK_LOCAL_MEM_FENCE); O[ar]= C[0][3];// O is output buffer & C[0][3] = 1 } ------------------------------------------------------- // result 0= 1 1= 1 2= 1 3= 1 4= 0 5= 0 6= 0 7= 0
In the line where you fill the local memory you fill its contents using the global_id as index:
C[ar/col][ar%col] = C1[ar];
It would make more sense to use the local id (get_local_id(0)) when indexing C, since this array has one independent "copy" for each work-group.
Thank you, it works now with the local id 🙂