A bug shows up with the recent driver on a Fury X, Win10 64-bit (maybe in some earlier versions too, but it was definitely not there with 17.5.1).
Here is example prefix-sum code for a thread group size of 256; a size of 64 is also broken.
It reminds me of an older OpenCL bug that has since been fixed ("OpenCL Driver Bug FuryX 32bit"),
but unlike the chaotic behaviour of that one, this time the results are consistent and more predictable.
Test files have already been sent to the compiler team by dwitczak.
#version 450
// Number of invocations per work group; also the number of elements scanned.
#define WG_WIDTH 256
layout (local_size_x = WG_WIDTH) in;
// Output buffer: main() stores one float per invocation of the work group.
layout (std430, binding = 0) buffer bTEST { float _G_test[]; };
// Work-group shared scratch array that main() scans in place.
// NOTE(review): this declaration was missing from the posted snippet. The
// element type is assumed to be uint (main() stores the integer literal 1 and
// converts with float() on output) - confirm against the original test files.
shared uint _lds[WG_WIDTH];
// Driver-bug repro: in-place inclusive prefix sum over _lds.
//
// Every element starts at 1, so a correct scan over the whole work group
// yields 1, 2, ..., WG_WIDTH. Only the first WG_WIDTH/2 invocations do work in
// each step. In step s (s = 0, 1, 2, ...), _lds is viewed as blocks of
// 2^(s+1) elements; each active invocation picks one element in the upper half
// of a block and adds the last element of that block's lower half to it:
//   destination = ((lID >> s) << (s+1)) | (lID & (2^s - 1)) | 2^s
//   source      = ((lID >> s) << (s+1)) | (2^s - 1)
// A shared-memory barrier plus execution barrier separates the steps so each
// step sees the previous step's writes.
//
// The steps are deliberately left unrolled: this shader exists to reproduce a
// driver/compiler bug, so the generated code should stay as reported.
void main ()
{
    uint lID = gl_LocalInvocationID.x;
    uint index = lID;

    // Seed every element with 1 and make the writes visible to the whole group.
    _lds[lID] = 1;
    memoryBarrierShared(); barrier();

#if 1 // 8 steps (blocks of 2..256). Reported wrong result: (1...128), (1...128) instead of (1...256)
    if (lID<(WG_WIDTH>>1)) _lds[(((lID >> 0) << 1) | (lID &   0) |   1) ] += _lds[(((lID >> 0) << 1) |   0) ]; memoryBarrierShared(); barrier();
    if (lID<(WG_WIDTH>>1)) _lds[(((lID >> 1) << 2) | (lID &   1) |   2) ] += _lds[(((lID >> 1) << 2) |   1) ]; memoryBarrierShared(); barrier();
    if (lID<(WG_WIDTH>>1)) _lds[(((lID >> 2) << 3) | (lID &   3) |   4) ] += _lds[(((lID >> 2) << 3) |   3) ]; memoryBarrierShared(); barrier();
    if (lID<(WG_WIDTH>>1)) _lds[(((lID >> 3) << 4) | (lID &   7) |   8) ] += _lds[(((lID >> 3) << 4) |   7) ]; memoryBarrierShared(); barrier();
    if (lID<(WG_WIDTH>>1)) _lds[(((lID >> 4) << 5) | (lID &  15) |  16) ] += _lds[(((lID >> 4) << 5) |  15) ]; memoryBarrierShared(); barrier();
    if (lID<(WG_WIDTH>>1)) _lds[(((lID >> 5) << 6) | (lID &  31) |  32) ] += _lds[(((lID >> 5) << 6) |  31) ]; memoryBarrierShared(); barrier();
    if (lID<(WG_WIDTH>>1)) _lds[(((lID >> 6) << 7) | (lID &  63) |  64) ] += _lds[(((lID >> 6) << 7) |  63) ]; memoryBarrierShared(); barrier();
    if (lID<(WG_WIDTH>>1)) _lds[(((lID >> 7) << 8) | (lID & 127) | 128) ] += _lds[(((lID >> 7) << 8) | 127) ]; memoryBarrierShared(); barrier();
#else // 6 steps only (blocks of 2..64). Reported wrong result: (1...64), (1...64), (1...32), (1...32), (1...32), (1...32)
    // NOTE(review): presumably the variant used with WG_WIDTH set to 64 - confirm.
    if (lID<(WG_WIDTH>>1)) _lds[(((lID >> 0) << 1) | (lID &   0) |   1) ] += _lds[(((lID >> 0) << 1) |   0) ]; memoryBarrierShared(); barrier();
    if (lID<(WG_WIDTH>>1)) _lds[(((lID >> 1) << 2) | (lID &   1) |   2) ] += _lds[(((lID >> 1) << 2) |   1) ]; memoryBarrierShared(); barrier();
    if (lID<(WG_WIDTH>>1)) _lds[(((lID >> 2) << 3) | (lID &   3) |   4) ] += _lds[(((lID >> 2) << 3) |   3) ]; memoryBarrierShared(); barrier();
    if (lID<(WG_WIDTH>>1)) _lds[(((lID >> 3) << 4) | (lID &   7) |   8) ] += _lds[(((lID >> 3) << 4) |   7) ]; memoryBarrierShared(); barrier();
    if (lID<(WG_WIDTH>>1)) _lds[(((lID >> 4) << 5) | (lID &  15) |  16) ] += _lds[(((lID >> 4) << 5) |  15) ]; memoryBarrierShared(); barrier();
    if (lID<(WG_WIDTH>>1)) _lds[(((lID >> 5) << 6) | (lID &  31) |  32) ] += _lds[(((lID >> 5) << 6) |  31) ]; memoryBarrierShared(); barrier();
#endif

    // Export this invocation's scanned value for inspection on the host.
    _G_test[index] = float(_lds[lID]);
}