3 Replies Latest reply on Oct 24, 2013 10:07 AM by max_bodycad

    barrier and global memory

    max_bodycad

      Hi,

       

      using the folloging kernel:

       

      typedef struct my_float3_

      {

        float x,y,z;

      } my_float3;

       

       

      my_float3 make_my_float3(float3 f)

      {

        my_float3 mf3;

        mf3.x = f.x;

        mf3.y = f.y;

        mf3.z = f.z;

        return mf3;

      }

       

       

      kernel void main(constant my_float3 * arg1, constant my_float3 * arg2, float3 normal, global my_float3 * out, global float3* mean)

      {

        size_t index = get_global_id(0);

        my_float3 f1 = arg1[index];

        my_float3 f2 = arg2[index];

       

       

        float3 result = (float3)(f1.x, f1.y, f1.z) + (float3)(f2.x, f2.y, f2.z) + normal;

        out[index] = make_my_float3(result);

       

       

        barrier(CLK_GLOBAL_MEM_FENCE);

       

       

        if(get_global_id(0) == get_global_size(0)-1)

        {

        float3 mean_ = (float3)0;

        for(int i = 0; i < get_global_size(0); i++)

        {

        mean_ += (float3)(out[i].x, out[i].y, out[i].z);

        }

        mean[0] =  mean_ / get_global_size(0);

       

       

        }

       

      }

       

      I get that mean, when read, returns only the correct value with any device if I use  if(get_global_id(0) == get_global_size(0)-1) rather than if(get_global_id(0) == 0). The later works with DEVICE_TYPE_GPU, but  I get garbage with CL_DEVICE_TYPE_CPU. Why is that so?

       

      Also, when the globalNDRage  to low (e.g. 2,1,1) the behavior also appears on GPU.

       

      I use a RadeonHD 7970 and a Core i7 CPU.

       

      Thnak you