cancel
Showing results for 
Search instead for 
Did you mean: 

Archives Discussions

t-man
Adept II

Weird behavior of barrier()

"__kernel void square(                                                   \n" \
"   __global int* nbr,                                          \n" \
"   __global double* BC,                                         \n" \
"   const int ne,                                       \n" \
"   const int nv, \n" \
"   __global int* firstnbr, \n" \
"   __global double* sigma,\n" \
"   __global int* queue, \n" \
"   __global int* queue2, \n" \
"   __global int* nr_queue, \n" \
"   __global int* nr_queue2, \n" \
"   __global int* stack, \n" \
"   __global int* nr_stack, \n" \
"   __global int* succ, \n" \
"   __global int* level, \n" \
"   __global int* nr_level, \n" \
"   __global int *succ_index, \n" \
"   __global int* found, \n" \
"   __global double* delta, \n" \
"  __global int* d, \n" \
"   __global int* sem)\n" \
"{                                                                  \n" \
"   int i = get_local_id(0);                                      \n" \
"int glob = get_global_id(0); \n" \
"   int j = get_num_groups(0);\n" \
" __local int k; k = get_group_id(0); \n" \
"  __local int size; size = get_local_size(0); \n"\
"   sem[0] = 0; \n" \
" \n" \
"   *found = 1; \n" \
   " __local int nr_roots, count,rest,root,nr_neigh; \n " \
   " int neigh_per_thread,h,node,dw,f,gh,temp,nr=0; \n " \
   " queue[0] = 0; \n" \
   " //barrier(CLK_LOCAL_MEM_FENCE); \n" \
   " while ( *found != 0){ \n" \
    "    *found = 0;\n" \
    "  \n" \
     "   //barrier(CLK_LOCAL_MEM_FENCE); \n " \
     "BC[0] = 1; BC[1] = *nr_level%2;\n" \
   "     if(*nr_level%2 == 0) \n" \
    "    {      \n" \
    "            nr_roots = (*nr_queue+1)/j; count = 0; rest = (*nr_queue+1)%j; \n" \
     "           if(k<rest  && i==0) \n" \
      "                  nr_roots=nr_roots + 1; \n" \
       "         *nr_queue2 = 0; if(k==0&&i==0) BC[5] = nr_roots;\n" \
        "        barrier(CLK_LOCAL_MEM_FENCE);   if(k==0&&i==0) BC[6] = nr_roots; \n" \

}

}

So what happens is that I need each workgroup to take an equal number of elements from a queue that has nr_queue elements. In the 1st step the queue has 1 element, so only group 0 will get 1 element. Thus "nr_roots" becomes 1 and I put it in BC[5]. Then I do a barrier to make sure that all the work-items in the workgroup know that nr_roots is 1, but when I put nr_roots in BC[6] the result is 0. Does anyone have any idea why this might be? Thanks!
0 Likes
3 Replies
binying
Challenger

Maybe you can enable this,

#pragma OPENCL EXTENSION cl_amd_printf : enable

so that you can print nr_roots at those lines that confuse you. Well, this

may not be the smartest way, but it works.

-----------------

#pragma OPENCL EXTENSION cl_amd_printf : enable

/*
 * Debugging example: gathers the work-item's local, group and global
 * coordinates in dimensions 0 and 1, prints them from a single work-item,
 * and then writes each work-item's linear global index into `out`.
 *
 * in    - global input buffer; not referenced in the visible part of the kernel.
 * out   - global output buffer; element gidx receives the value gidx.
 * sdata - local-memory buffer; not referenced in the visible part of the kernel.
 *
 * The printf call relies on the cl_amd_printf extension enabled by the
 * #pragma that precedes this kernel.
 */
__kernel void async_copy (__global int* in, __global int* out, __local int * sdata)

{

          /* CUDA-style names for the local id, group id and group size. */
          int2 threadIdx;

          int2 blockIdx;

          int2 blockDim;

          threadIdx.x = (int)get_local_id(0);

          threadIdx.y = (int)get_local_id(1);

          blockIdx.x = (int)get_group_id(0);

          blockIdx.y = (int)get_group_id(1);

          blockDim.x = (int)get_local_size(0);

          blockDim.y = (int)get_local_size(1);

          /* Total number of work-items in each of the two dimensions. */
          int xDim = (int)get_global_size(0);

          int yDim = (int)get_global_size(1);

          /* Global position rebuilt by hand as group_id * local_size + local_id. */
          /* NOTE(review): should equal xPos/yPos below when no global work offset is used - confirm. */
          int2 idx;

          idx.x = blockDim.x*blockIdx.x + threadIdx.x;

          idx.y = blockDim.y*blockIdx.y + threadIdx.y;

          /* Global position as reported directly by the runtime. */
          int xPos = (int)get_global_id(0);

          int yPos = (int)get_global_id(1);

 

          /* Only the work-item at global position (0,0) prints, so there is one line of output. */
          if((xPos == 0) && (yPos==0))

                    printf("%d.%d %d.%d\n", xPos, yPos, idx.x, idx.y);

          /* Linear index over the 2-D global range, with dimension 0 varying fastest. */
          int gidx = yPos*xDim + xPos;

          out[gidx] = gidx;

          /* The rest of the kernel was elided by the poster; the "..." below is a placeholder, not code. */
...

}

0 Likes
t-man
Adept II

The problem has to do with the fact that the variable needs to be declared with the "volatile" qualifier, so that threads read it from memory and not from registers (at least that's what I think). Adding volatile seems to solve it.

0 Likes
notzed
Challenger

Might have something to do with the fact that BC is a global, but you're using a local barrier.

Nor can barriers be conditional.

Not to mention that every thread is writing to the same addresses.

0 Likes