Weird behavior of barrier()

"__kernel void square(                                                   \n" \
"   __global int* nbr,                                          \n" \
"   __global double* BC,                                         \n" \
"   const int ne,                                       \n" \
"   const int nv, \n" \
"   __global int* firstnbr, \n" \
"   __global double* sigma,\n" \
"   __global int* queue, \n" \
"   __global int* queue2, \n" \
"   __global int* nr_queue, \n" \
"   __global int* nr_queue2, \n" \
"   __global int* stack, \n" \
"   __global int* nr_stack, \n" \
"   __global int* succ, \n" \
"   __global int* level, \n" \
"   __global int* nr_level, \n" \
"   __global int *succ_index, \n" \
"   __global int* found, \n" \
"   __global double* delta, \n" \
"  __global int* d, \n" \
"   __global int* sem)\n" \
"{                                                                  \n" \
"   int i = get_local_id(0);                                      \n" \
"int glob = get_global_id(0); \n" \
"   int j = get_num_groups(0);\n" \
" __local int k; k = get_group_id(0); \n" \
"  __local int size; size = get_local_size(0); \n"\
"   sem[0] = 0; \n" \
" \n" \
"   *found = 1; \n" \
   " __local int nr_roots, count,rest,root,nr_neigh; \n " \
   " int neigh_per_thread,h,node,dw,f,gh,temp,nr=0; \n " \
   " queue[0] = 0; \n" \
   " //barrier(CLK_LOCAL_MEM_FENCE); \n" \
   " while ( *found != 0){ \n" \
    "    *found = 0;\n" \
    "  \n" \
     "   //barrier(CLK_LOCAL_MEM_FENCE); \n " \
     "BC[0] = 1; BC[1] = *nr_level%2;\n" \
   "     if(*nr_level%2 == 0) \n" \
    "    {      \n" \
    "            nr_roots = (*nr_queue+1)/j; count = 0; rest = (*nr_queue+1)%j; \n" \
     "           if(k<rest  && i==0) \n" \
      "                  nr_roots=nr_roots + 1; \n" \
       "         *nr_queue2 = 0; if(k==0&&i==0) BC[5] = nr_roots;\n" \
        "        barrier(CLK_LOCAL_MEM_FENCE);   if(k==0&&i==0) BC[6] = nr_roots; \n" \



So what happens is that I need each workgroup to take an equal number of elements from a queue that has nr_queue elements. In the first step the queue has 1 element, so only group 0 will get 1 element. Thus "nr_roots" becomes 1 and I put it in BC[5]. Then I do a barrier to make sure that all the work-items in the workgroup know that nr_roots is 1, but when I write nr_roots to BC[6] the result is 0. Does anyone have any idea why this might be? Thanks!
Maybe you can enable this,

#pragma OPENCL EXTENSION cl_amd_printf : enable

so that you can print nr_roots at the lines that confuse you. This may not be the smartest way, but it works.


#pragma OPENCL EXTENSION cl_amd_printf : enable

/*
 * Debug kernel: each work-item writes its linearized 2-D global index into
 * `out`, and work-item (0,0) prints its global ID next to an index recomputed
 * from the group ID, local size and local ID.
 *
 * in    - not referenced in the lines shown here.
 * out   - receives gidx at position gidx (gidx = yPos * xDim + xPos).
 * sdata - local buffer; not referenced in the lines shown here.
 *
 * NOTE(review): the braces delimiting the kernel body are absent from this
 * listing (likely lost when the code was pasted) - restore before compiling.
 */
__kernel void async_copy (__global int* in, __global int* out, __local int * sdata)


          /* CUDA-style names for the per-dimension local ID, group ID and local size. */
          int2 threadIdx;

          int2 blockIdx;

          int2 blockDim;

          threadIdx.x = (int)get_local_id(0);

          threadIdx.y = (int)get_local_id(1);

          blockIdx.x = (int)get_group_id(0);

          blockIdx.y = (int)get_group_id(1);

          blockDim.x = (int)get_local_size(0);

          blockDim.y = (int)get_local_size(1);

          /* Total number of work-items in each dimension. */
          int xDim = (int)get_global_size(0);

          int yDim = (int)get_global_size(1);

          /* Index rebuilt by hand from group/local values, for comparison with get_global_id(). */
          int2 idx;

          idx.x = blockDim.x*blockIdx.x + threadIdx.x;

          idx.y = blockDim.y*blockIdx.y + threadIdx.y;

          int xPos = (int)get_global_id(0);

          int yPos = (int)get_global_id(1);


          /* Only the work-item at global position (0,0) prints, to keep the output to one line. */
          if((xPos == 0) && (yPos==0))

                    printf("%d.%d %d.%d\n", xPos, yPos, idx.x, idx.y);

          /* Row-major linear index over the 2-D global range. */
          int gidx = yPos*xDim + xPos;

          out[gidx] = gidx;



The problem has to do with the fact that the variable needs to be declared with the "volatile" qualifier, so that threads read it from memory and not from registers (at least that's what I think). Adding volatile seems to solve it.


Might have something to do with the fact that BC is a global, but you're using a local barrier.

Nor can barriers be conditional.

Not to mention that every thread is writing to the same addresses.