6 Replies Latest reply on Jun 4, 2012 10:12 AM by LeeHowes

    Possible bug with atom_cmpxchg

    sh

      The following code seems to be miscompiled:

       

      #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
      #pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
      
      int f(int a) {
          return as_int(as_float(a)+1.0f);
      }
      
      __kernel
      void test(__global int * ptr) {
          int value;
          int new_value;
          do {
              mem_fence(CLK_GLOBAL_MEM_FENCE);
              value = *ptr;
              new_value=f(value);
          } while(value != atom_cmpxchg(ptr, value, new_value));
      }
      

       

       

      shader main
        asic(SI_ASIC)
        type(CS)
      
        s_buffer_load_dword  s0, s[8:11], 0x00                    // 00000000: C2000900
      label_0001:
        s_waitcnt     expcnt(0)                                   // 00000004: BF8C1F0F
        s_waitcnt     lgkmcnt(0)                                  // 00000008: BF8C007F
        v_mov_b32     v0, s0                                      // 0000000C: 7E000200
        tbuffer_load_format_x  v5, v0, s[4:7], 0 offen format:[BUF_DATA_FORMAT_32,BUF_NUM_FORMAT_FLOAT] // 00000010: EBA01000 80010500
        s_waitcnt     vmcnt(0)                                    // 00000018: BF8C1F70
        v_add_f32     v4, 1.0, v5                                 // 0000001C: 06080AF2
        v_mov_b32     v2, v4                                      // 00000020: 7E040304
        v_mov_b32     v3, v5                                      // 00000024: 7E060305
        buffer_atomic_cmpswap  v[2:3], v0, s[4:7], 0 offen glc    // 00000028: E0C45000 80010200
        s_waitcnt     vmcnt(0)                                    // 00000030: BF8C1F70
        v_cmp_eq_i32  vcc, v5, v2                                 // 00000034: 7D040505
        s_cbranch_vccz  label_0001                                // 00000038: BF86FFF2
        s_endpgm                                                  // 0000003C: BF810000
        end
      

       

      The compiler doesn't update execution mask and  atom_cmpxchg  executing more times than it should.

        • Re: Possible bug with atom_cmpxchg
          LeeHowes

          Is it actually compiling wrongly or is it just caching the read of value and hence always giving you the wrong behaviour? Both of the reads are not globally coherent, so they may read from L1. I don't know what the behaviour of the atomic operation is with respect to L1, but at the very least if one lane updates it atomically that is no guarantee that other lanes will pick that value up until after their atomic read.

           

          Does it fix the behaviour to make value = *ptr an atomic read? Replace that line with "value = atomic_add(ptr, 0)".

           

          It does look to not be updating the masks but it may be that that's a side effect of assumptions it knows it can make about the read operation.

          1 of 1 people found this helpful
            • Re: Possible bug with atom_cmpxchg
              sh

              >Does it fix the behaviour to make value = *ptr an atomic read? Replace that line with "value = atomic_add(ptr, 0)".

              It doesn't change anything. So I assume it isn't coherence issue.

               

              #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable  
              #pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable  
              
              int f(int a) {  
                  return as_int(as_float(a)+1.0f);  
              }  
              
              __kernel  
              void test(__global int * ptr) {  
                 int value;  
                 int new_value;  
                 do {  
                     mem_fence(CLK_GLOBAL_MEM_FENCE);  //Not needed actually
                     value = atom_add(ptr, 0);              
                     new_value=f(value);  
                 } while(value != atom_cmpxchg(ptr, value, new_value));  
              }  
              

               

               

              shader main
                asic(SI_ASIC)
                type(CS)
              
                s_buffer_load_dword  s0, s[8:11], 0x00                    // 00000000: C2000900
              label_0001:
                s_waitcnt     expcnt(0)                                   // 00000004: BF8C1F0F
                s_waitcnt     lgkmcnt(0)                                  // 00000008: BF8C007F
                v_mov_b32     v0, s0                                      // 0000000C: 7E000200
                v_mov_b32     v1, 0                                       // 00000010: 7E020280
                buffer_atomic_add  v1, v0, s[4:7], 0 offen glc            // 00000014: E0C85000 80010100
                s_waitcnt     vmcnt(0)                                    // 0000001C: BF8C1F70
                v_add_f32     v2, 1.0, v1                                 // 00000020: 060402F2
                v_mov_b32     v3, v1                                      // 00000024: 7E060301
                buffer_atomic_cmpswap  v[2:3], v0, s[4:7], 0 offen glc    // 00000028: E0C45000 80010200
                s_waitcnt     vmcnt(0)                                    // 00000030: BF8C1F70
                v_cmp_eq_i32  vcc, v1, v2                                 // 00000034: 7D040501
                s_cbranch_vccz  label_0001                                // 00000038: BF86FFF2
                s_endpgm                                                  // 0000003C: BF810000
                end
              
              • Re: Possible bug with atom_cmpxchg
                sh

                Atomic_add doesn't change anything.

                Could you confirm(or reject) the bug?

                  • Re: Possible bug with atom_cmpxchg
                    drallan

                    sh,

                     

                    I agree it does look like a bug where the Tahiti compiler should be using  the exec mask but doesn't. This is easy to test because for Cayman the compiler does use the exec mask and the loop works correctly, looping 64 times (for a single wave), while on Tahiti code fails to loop. The code should execute the same on both platforms. For simplicity, I replaced the f(a) function and the code becomes

                     

                    __kernel void test(__global int * ptr) {

                        int value;  int new_value;


                       do{

                          value = atomic_add(ptr, 0);

                         new_value=value+1;

                        } while(value != atom_cmpxchg(ptr, value, new_value));

                    }

                     

                    During each loop, the first thread to execute atomic_cmpxchg() changes the value of *ptr causing subsequent threads to fail atomic_cmpxchg() and to loop. The Cayman code uses the exec mask to track which threads are active/looping, and loops until all 64 bits of exec are set to 0.  The Tahiti code can only branch if vcc == 0 , which doesn't happen so the code never loops.

                     

                    Another test. You can make the code work correctly on Tahiti by forcing the compiler to use the exec mask, by placing this bogus line just before the do loop,

                     

                    ptr=ptr+(get_local_id(0)&0xff000000);

                     

                    This prevents the compiler from knowing the value of  ptr so it will use the exec mask. In reality, ptr is not changed.

                     

                    drallan

                     

                    //Disassembly without ptr=ptr+(get_local_id(0)&0xff000000);
                     s_buffer_load_dword  s0, s[8:11], 0x00                   
                    label_0001:
                     s_waitcnt     0x0000                                      
                     v_mov_b32     v0, s0                                      
                      v_mov_b32     v1, 0                                       
                     buffer_atomic_add  v1, v0, s[4:7], 0 offen glc            
                     s_waitcnt     vmcnt(0)                                    
                     v_add_i32     v2, vcc, 1, v1                              
                     v_mov_b32     v3, v1                                      
                     buffer_atomic_cmpswap  v[2:3], v0, s[4:7], 0 offen glc    
                     s_waitcnt     vmcnt(0)                                    
                     v_cmp_eq_i32  vcc, v1, v2                                 
                     s_cbranch_vccz  label_0001                                
                     s_branch      label_0010                                  
                     s_branch      label_0001                                  
                    label_0010:
                     s_endpgm 
                    

                     

                    //Disassembly with ptr=ptr+(get_local_id(0)&0xff000000);
                    label_000E:
                     s_waitcnt     0x0000                                      
                     v_mov_b32     v1, 0                                       
                     buffer_atomic_add  v1, v0, s[4:7], 0 offen glc            
                     s_waitcnt     vmcnt(0)                                    
                     v_add_i32     v2, vcc, 1, v1                              
                     v_mov_b32     v3, v1                                      
                     buffer_atomic_cmpswap  v[2:3], v0, s[4:7], 0 offen glc    
                     s_waitcnt     vmcnt(0)                                    
                     v_cmp_eq_i32  s[8:9], v1, v2                              
                     s_mov_b64     s[10:11], exec                              
                     s_and_b64     exec, s[10:11], s[8:9]                      
                     s_andn2_b64   s[2:3], s[2:3], exec                        
                      s_cbranch_scc0 label_0021 
                     s_mov_b64     exec, s[10:11]                              
                     s_and_b64     exec, exec, s[2:3]                          
                     s_branch      label_000E                                  
                    label_0021:
                      s_mov_b64 exec, s[0:1]                                
                     s_endpgm
                    

                     

                    1 of 1 people found this helpful