If I declare a __local array in global scope, it compiles, but doesn't work. I can write to the local array from the kernel, but when I read from it, I apparently just get back 0. This is not being flagged by the compiler. It does work with the NVIDIA driver. And of course this kind of semantic works with D3D11 groupshared. For example, here's a simple prefix sum shader. When you uncomment the __local uint line just outside of function scope, the shader doesn't work.
#define NUM_THREADS (1<< NUM_LEVELS)
void ThreadSum2(uint tid, __local uint sharedSum[2 * NUM_THREADS]) {
barrier(CLK_LOCAL_MEM_FENCE);
uint offset = 1;
for(int d = N>> 1; d > 0; d>>= 1) {
if(tid < d) {
uint ai = offset * (2 * tid + 1) - 1;
uint bi = offset * (2 * tid + 2) - 1;
sharedSum[bi] += sharedSum[ai];
}
offset<<= 1;
barrier(CLK_LOCAL_MEM_FENCE);
}
#pragma unroll
for(int d = 1; d < N; d<<= 1) {
offset>>= 1;
if(tid < d) {
uint ai = offset * (2 * tid + 1) - 1;
uint bi = offset * (2 * tid + 2) - 1;
uint t = sharedSum[ai];
sharedSum[ai] = sharedSum[bi];
sharedSum[bi] += t;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
// uncomment me and the code doesn't work. note that it does work with NV driver.
// __local uint sharedSum[2 * NUM_THREADS];
__kernel __attribute__((reqd_work_group_size(NUM_THREADS, 1, 1)))
void PrefixSumBlock_Pass1(
__global uint* pass1_values,
__global uint* pass1_partialSums) {
// uncomment me and the code DOES work.
// __local uint sharedSum[2 * NUM_THREADS];
uint tid = get_local_id(0);
uint gid = get_group_id(0);
uint index = 2 * NUM_THREADS * gid;
uint ai = tid;
uint bi = tid + NUM_THREADS;
uint a = pass1_values[index + ai];
uint b = pass1_values[index + bi];
sharedSum[ai] = a;
sharedSum[bi] = b;
ThreadSum2(tid, sharedSum);
uint total = sharedSum[0];
a = sharedSum[ai] - total;
b = sharedSum[bi] - total;
pass1_values[index + ai] = a;
pass1_values[index + bi] = b;
if(0 == tid)
pass1_partialSums[gid] = total;
}