14 Replies Latest reply on Feb 10, 2011 2:54 PM by MicahVillmow

    Kernel build fails with Dangerous Barrier Opt Detected

    Hellblau
      Kernels group size gets limited to one wavefront.

      Hello,

      i tried the 2.1 SDK to check if my OpenCL code will also run on ATI platforms. Sadly i get the following warnings when compiling some of my kernels:

      Warning:W001:Dangerous Barrier Opt Detected!
      Warning:W000:Barrier caused limited groupsize

      This limits the group size to one wavefront which is quite inefficient. I attached the most simple kernel that exibits the problem. This particular kernel writes indices grouped to global memory when a flag is set. The offending barrier is the first CLK_GLOBAL_MEM_FENCE one. When the loop before the barrier is removed the warning goes away.

      So any help to work around this problem would be appreciated.

      #define REORDER_DIM 256 __kernel /*attribute__( ( reqd_work_group_size( REORDER_DIM, 1, 1 ) ) )*/ void motion_estimation_reorder_candidate_vectors( __global int *pi_indices, __global int *pi_flags, int i_size ) { int i_local_id, i_idx, i_local_idx, i_offset, i_slot_idx, i_flag; volatile int i_dummy; __local unsigned int rgui_indices_block[ REORDER_DIM * 2 ]; __local unsigned int i_indices_offset; i_local_id = get_local_id( 0 ); if( i_local_id == 0 ) { i_indices_offset = 0; } rgui_indices_block[ i_local_id ] = 0; i_slot_idx = i_local_id + REORDER_DIM; for( i_idx = 0; i_idx < i_size; i_idx += REORDER_DIM ) { i_local_idx = min( ( i_local_id + i_idx ), i_size ); pi_indices[ i_local_idx ] = i_size; } barrier(CLK_GLOBAL_MEM_FENCE); for( i_idx = 0; i_idx < i_size; i_idx += REORDER_DIM ) { i_local_idx = i_local_id + i_idx; if( i_local_idx < i_size ) { i_flag = pi_flags[ i_local_idx ]; } else { i_flag = 0; } rgui_indices_block[ i_slot_idx ] = i_flag; for( i_offset = 1; i_offset < REORDER_DIM; i_offset *= 2 ) { unsigned int i_sum; barrier(CLK_LOCAL_MEM_FENCE); i_sum = rgui_indices_block[ i_slot_idx ] + rgui_indices_block[ i_slot_idx - i_offset ]; barrier(CLK_LOCAL_MEM_FENCE); rgui_indices_block[ i_slot_idx ] = i_sum; } if( i_flag ) { unsigned int i_compacted_idx; i_compacted_idx = rgui_indices_block[ i_slot_idx ] + i_indices_offset; pi_indices[ i_compacted_idx ] = i_local_idx; } barrier(CLK_LOCAL_MEM_FENCE); if( i_local_id == REORDER_DIM - 1 ) { i_indices_offset = i_indices_offset + rgui_indices_block[ i_slot_idx ]; } } }