AnsweredAssumed Answered

Using cl_float3 in parallel reduction example

Question asked by shunyo on May 17, 2013
Latest reply on May 20, 2013 by shunyo

I adapted the parallel reduction example for openCL for a bunch of floats. Now I wanted to expand the code to include cl_float3. So I want to find the minimum among a array of cl_float3. I thought it was a straight forward expansion from float to float3 in kernel. But I am receiving garbage values when i return from the kernel. Below is the kernel:

 

__kernel void pmin3(__global float3  *src,                                           

                    __global float3  *gmin,                                           

                    __local  float3  *lmin,                                           

                    __global float  *dbg,                                            

                    uint           nitems,                                          

                    uint           dev)                                             

{                                                                                   

   uint count  = nitems     / get_global_size(0);                                   

   uint idx    = (dev == 0) ? get_global_id(0) * count                              

                            : get_global_id(0);                                     

   uint stride = (dev == 0) ? 1 : get_global_size(0);                               

                                                                                    

   // Private min for the work-item                                                 

                                                                                    

   float3 pmin = (float3)(pow(2.0,32.0)-1,pow(2.0,32.0)-1,pow(2.0,32.0)-1);                                               

                                                                                    

   for (int n = 0; n < count; n++, idx += stride) {                                 

     pmin.x = min(pmin.x,src[idx].x);

           pmin.y = min(pmin.y,src[idx].y);

           pmin.z = min(pmin.z,src[idx].z);                                                

   }                                                                                

                                                                                    

   // Reduce values within the work-group into local memory                         

                                                                                    

   barrier(CLK_LOCAL_MEM_FENCE);                                                    

   if (get_local_id(0) == 0)

    lmin[0] = (float3)(pow(2.0,32.0)-1,pow(2.0,32.0)-1,pow(2.0,32.0)-1);                                                          

   for (int n = 0; n < get_local_size(0); n++) {                                    

     barrier(CLK_LOCAL_MEM_FENCE);                                                  

     if (get_local_id(0) == n) {

                    lmin[0].x = min(lmin[0].x,pmin.x);

                    lmin[0].y = min(lmin[0].y,pmin.y);

                    lmin[0].z = min(lmin[0].z,pmin.z);

           }                         

   }                                                                                                                                                             

   barrier(CLK_LOCAL_MEM_FENCE);                                                                                                                                    

   // Write to __global gmin which will contain the work-group minima                                                                                               

   if (get_local_id(0) == 0)

          gmin[get_group_id(0)] = lmin[0];                                                                                                       

   // Collect debug information                                                                                                                                       

if (get_global_id(0) == 0) {                                                    

   dbg[0] = get_num_groups(0);                                                   

   dbg[1] = get_global_size(0);                                                  

   dbg[2] = count;                                                               

   dbg[3] = stride;                                                              

}                                                                               

}                      

                                                                                    

__kernel void min_reduce3( __global float3  *gmin)                                         

{                                                                                   

   for (int n = 0; n < get_global_size(0); n++) {                                   

     barrier(CLK_GLOBAL_MEM_FENCE);                                                 

     if (get_global_id(0) == n) {

                    gmin[0].x = min(gmin[0].x,gmin[n].x);

                    gmin[0].y = min(gmin[0].y,gmin[n].y);                     

                    gmin[0].z = min(gmin[0].z,gmin[n].z);

          }

}

barrier(CLK_GLOBAL_MEM_FENCE);                                                                                                                              

}         

 

 

I think it is the problem with get_global_id(0) and get_global_size() which gives the entire size instead of the only the number of rows to be given. Any suggestions?

Outcomes