AnsweredAssumed Answered

DP kernel vs FP one - strange results

Question asked by Raistmer on Feb 4, 2012
Latest reply on Feb 9, 2012 by Raistmer

There are 2 kernels that do essentially the same thing (the DP one does it more accurately).

 

BUT on the HD6950, which is DP-capable, the DP kernel takes more time (and its ISA is much bigger) than the DP-emulation one.

Any hints as to why this is so, and what can be changed to speed up the DP kernel on compatible hardware?

 

And another question:

The ISA for the DP-emulation kernel is different depending on whether the double-precision extension is enabled or not (if DP is not enabled, the ISA is shorter and the kernel is faster!).

Why does a single-precision kernel "feel" the DP mode, and why does the compiler make it worse if DP is allowed?

 

DP emulation:

 

/*
 * Phase angle (in radians) for one sample, built from single-precision
 * pieces only.
 *
 * rate_a/rate_fa and t_b/t_fb are the two components of the chirp-rate and
 * time operands; presumably the high/low halves of split double-precision
 * values (the post calls this kernel "DP emulation") -- confirm with the
 * host code that fills the buffers.
 *
 * Each partial product is reduced with fract() before summation, the sum is
 * reduced again with floor(), and the result is scaled by M_2PI.
 * NOTE(review): M_2PI is not defined in this block; presumably 2*pi supplied
 * by the build -- confirm.
 */
float chirp_phase_emulated(float rate_a, float t_b, float rate_fa, float t_fb)
{
    float whole;  /* integer part written by fract(); never read */

    float phase = fract(2 * rate_a * fract(t_b * t_fb, &whole), &whole) + fract(rate_a * t_fb * t_fb, &whole) +
                  fract(t_b * fract(t_b * rate_fa, &whole), &whole) + fract(2 * t_b * rate_fa * t_fb, &whole) + rate_fa * t_fb * t_fb;

    phase -= floor(phase);
    phase *= M_2PI;
    return phase;
}

/*
 * For work-item i, reads cx_DataArray[i] and sample_rate[i], rotates the
 * (x,y) pair and the (z,w) pair of the input by two separately computed
 * phase angles, and stores the result in cx_ChirpDataArray[i].
 *
 * The first angle uses chirp_rate.xy with sample_rate[i].xy, the second uses
 * chirp_rate.zw with sample_rate[i].zw.  Sine and cosine are taken with
 * native_sin()/native_cos() rather than sincos().
 */
__kernel __attribute__((vec_type_hint(float4)))
void CalcChirpData_kernel2_cl(const float4 chirp_rate, __global float4* cx_DataArray,
                              __global float4* cx_ChirpDataArray, __global float4* sample_rate)
{
    const uint gid = get_global_id(0);

    const float4 in  = cx_DataArray[gid];
    const float4 t   = sample_rate[gid];
    float4 out;

    /* First pair: components .x/.y of the rate and time operands. */
    const float phase_lo = chirp_phase_emulated(chirp_rate.x, t.x, chirp_rate.y, t.y);
    const float s_lo = native_sin(phase_lo);
    const float c_lo = native_cos(phase_lo);

    out.x = in.x * c_lo - in.y * s_lo;
    out.y = in.x * s_lo + in.y * c_lo;

    /* Second pair: components .z/.w of the rate and time operands. */
    const float phase_hi = chirp_phase_emulated(chirp_rate.z, t.z, chirp_rate.w, t.w);
    const float s_hi = native_sin(phase_hi);
    const float c_hi = native_cos(phase_hi);

    out.z = in.z * c_hi - in.w * s_hi;
    out.w = in.z * s_hi + in.w * c_hi;

    cx_ChirpDataArray[gid] = out;
}

 

DP kernel:

/*
 * Double-precision variant of the chirp kernel.
 *
 * For work-item i, reads cx_DataArray[i], treats it as two (x,y)/(z,w)
 * pairs, and rotates each pair by a phase angle derived from
 *     chirp_rate * t^2,   t = (2*i) * recip_sample_rate   for the .xy pair
 *                         t = (2*i+1) * recip_sample_rate for the .zw pair
 * The result is written to cx_ChirpDataArray[i].
 *
 * The phase is computed in double, reduced to its fractional part, and only
 * then narrowed to float for the sine/cosine evaluation.  The kernel uses
 * double/double2, so the device and build must support double precision.
 *
 * NOTE(review): M_2PI is not defined in this block; presumably 2*pi supplied
 * by the build -- confirm.
 */
__kernel
void CalcChirpData_kernel_dp_cl(const double chirp_rate, __global float4* cx_DataArray, __global float4* cx_ChirpDataArray,
                                const double recip_sample_rate)
{   /* opening brace was missing in the posted listing */
    uint i = get_global_id(0);

    float4 cx = cx_DataArray[i];
    float4 chirp;
    float2 c, d;      /* c receives the cosines, d the sines (see sincos below) */
    float2 angf;
    double2 time;

    /* Two consecutive sample times per work-item. */
    time.x = (2*i)*recip_sample_rate;
    time.y = (2*i+1)*recip_sample_rate;

    double2 ang = chirp_rate*(time*time);

    /* Keep only the fractional part before dropping to single precision. */
    ang = ang-floor(ang);
    angf.x = (float)ang.x;
    angf.y = (float)ang.y;
    angf *= M_2PI;

    /* sincos() returns the sine and stores the cosine through its pointer. */
    d = sincos(angf,&c);

    // real = cx.x * c - cx.y * d;
    chirp.xz = cx.xz*c.xy-cx.yw*d.xy;
    // imag = cx.x * d + cx.y * c;
    chirp.yw = cx.xz*d.xy+cx.yw*c.xy;

    cx_ChirpDataArray[i] = chirp;
}

Outcomes