I found a problem.
I tried to use a pointer into a __local buffer, as in MyFuncNG below.
But it doesn't work correctly.
I think the pointer is shared among the work items within a work group.
Is this behavior per the specification, or is it a bug?
I think pointer arithmetic is faster than index calculation.
/*
 * Index-based fill of a region of `work`.
 *
 * Each work item starts at row (get_local_id(0) * 2) of a 128-wide layout and
 * writes 128*2 consecutive elements from there, storing (row + offset) in
 * each one. The whole fill is repeated 128 times with identical values.
 *
 * work: __local buffer of 128*128 ushort elements.
 */
void MyFuncOK( __local ushort work[128*128] )
{
    const size_t first_row = get_local_id(0) * 2;
    const size_t base = first_row * 128;   /* index of this work item's first element */

    for( size_t pass = 0; pass != 128; ++pass ){
        size_t offset = 0;
        while( offset < 128*2 ){
            work[base + offset] = first_row + offset;
            ++offset;
        }
    }
}
/*
 * Pointer-based counterpart of MyFuncOK.
 *
 * Each work item starts at row (get_local_id(0) * 2) of a 128-wide layout and
 * writes 128*2 consecutive elements from there, storing (Row + i) in each
 * one -- the same region and values that MyFuncOK produces. The whole fill is
 * repeated 128 times with identical values.
 *
 * The highest index written is Row*128 + 255, so get_local_id(0) must not
 * exceed 63 for the writes to stay inside the 128*128 buffer.
 *
 * work: __local buffer of 128*128 ushort elements.
 */
void MyFuncNG( __local ushort work[128*128] )
{
    size_t Row = get_local_id(0) * 2;
    for( size_t m=0; m<128; ++m ){
        /*
         * `p` itself is an ordinary automatic variable, so every work item
         * has its own copy; only the memory it points to is __local.
         * It is re-seeded at the start of every pass.
         */
        __local ushort* p = &work[ Row * 128 ];
        /* Cover 128*2 elements, matching MyFuncOK (a bound of 128 would
           leave the second row untouched). */
        for( size_t i=0; i<128*2; ++i, ++p ){
            *p = Row + i;
        }
    }
}
// Kernel entry point, launched through clEnqueueNDRangeKernel().
// 64 work items are assigned to one 128 * 128 __local buffer, which is
// filled first by the index-based routine and then by the pointer-based one.
__kernel void MyKernel(){
    __local ushort scratch[128*128];

    MyFuncOK( scratch );
    MyFuncNG( scratch );
}