0 Replies Latest reply on Sep 8, 2015 9:10 AM by zoli0726

    OpenCL kernel bug

    zoli0726

      Hello Devgurus!

       

      I have a strange bug in my kernel, and i dont know why its happening.

       

      The kernel is quite complex.

      If i use the caching method in bold under then the results not getting written back to the global memory. However if i dont use cohesive_element current_beam and simply use Beamlist[beam_index] everywhere, the result are correct and getting written.

       

      Its happening on Hawaii with the latest driver 15.7.

       

      Please help me with this.

       

      Thanks.

       

      __kernel void beam_force_and_torque(__global Particle* Particles, __global cohesive_element* Beamlist, double gammab, double dt, __global double* platefZ, __global int* neigh_index) {

       

              __local cohesive_element current_beam[64];

              int i, c0, c1;

              double length, xx, yy, zz;

              double beamelo, vreln, magnit, fn, dissip;

              double ffx, ffy, ffz;

              double angle_torsion, omega_torsion;

              double omX0, omX1;

              double angleY0, angleY1, angleZ0, angleZ1, omY0, omZ0, omY1, omZ1;

              double Ty, Tz, Qy, Qz;

              double energy;

       

       

              vec3d e, dv, e0, e1;

              vec3d t0x, t0y, t0z;

              vec3d t0;

              vec3d shr0yS, shr0zS;

              vec3d om0, om1;

       

              int part_index = get_global_id(0);

              int lindex = get_local_id(0);

              Particle current_particle = Particles[part_index];

       

              platefZ[part_index] = 0;

       

              for(i = 0; i < current_particle.NumNb; i++) {

                      if (Beamlist[neigh_index[part_index] + i].exist) {

                      int beam_index = neigh_index[part_index] + i;

                     current_beam[lindex] = Beamlist[beam_index];

       

                              ffx = 0.0;

                              ffy = 0.0;

                              ffz = 0.0;

       

                              c0 = current_beam[lindex].end[0];

                              c1 = current_beam[lindex].end[1];

                              e.x = Particles[c1].rx - current_particle.rx;

                              e.y = Particles[c1].ry - current_particle.ry;

                              e.z = Particles[c1].rz - current_particle.rz;

                              length = vec3d_length(e);

                              beamelo = length - current_beam[lindex].length0;

                              current_beam[lindex].elong = beamelo;

                              e.x = e.x / length;

                              e.y = e.y / length;

                              e.z = e.z / length;

       

                              dv.x = Particles[c1].rx1 - current_particle.rx1;

                              dv.y = Particles[c1].ry1 - current_particle.ry1;

                              dv.z = Particles[c1].rz1 - current_particle.rz1;

       

                              vreln = scalar_prod(dv, e);

                              fn = beamelo * current_beam[lindex].longit;

                              dissip = gammab*vreln;

       

                              ffx = fn * e.x + dissip * e.x;

                              ffy = fn * e.y + dissip * e.y;

                              ffz = fn * e.z + dissip * e.z;

       

                              current_particle.fx += ffx;

                              current_particle.fy += ffy;

                              current_particle.fz += ffz;

       

       

                              om0.x = current_particle.ox;

                              om0.y = current_particle.oy;

                              om0.z = current_particle.oz;

       

                              om1.x = Particles[c1].ox;

                              om1.y = Particles[c1].oy;

                              om1.z = Particles[c1].oz;

       

                              omX0 = scalar_prod(om0, current_beam[lindex].b0x);

                              omX1 = scalar_prod(om1, current_beam[lindex].b0x);

                              omega_torsion = omX0 - omX1;

       

                              angle_torsion = current_beam[lindex].angle_Tor + omega_torsion*dt;

                              current_beam[lindex].angle_Tor = angle_torsion;

       

                              magnit = current_beam[lindex].torsion * (-angle_torsion - gammab * omega_torsion);

                              t0x = scalar_prod_vec3d(magnit, current_beam[lindex].b0x);

       

                              magnit = -magnit;

       

                              e0 = rotate_vector_3d(e, current_particle.ROTA);

       

                              xx = scalar_prod(e0, current_beam[lindex].b0x);

                              yy = scalar_prod(e0, current_beam[lindex].b0y);

                              zz = scalar_prod(e0, current_beam[lindex].b0z);

       

                              angleY0 =  -atan2(zz,xx);

                              angleZ0 =  -atan2(yy,xx);

       

                              current_beam[lindex].Theta0 = acos(xx);

       

                              omY0 = (angleY0 - current_beam[lindex].angle0_Y) / dt;

                              omZ0 = (angleZ0 - current_beam[lindex].angle0_Z) / dt;

                              current_beam[lindex].angle0_Y = angleY0;

                              current_beam[lindex].angle0_Z = angleZ0;

       

                              e1 = rotate_vector_3d(e, Particles[c1].ROTA);

                              xx = scalar_prod(e1, current_beam[lindex].b0x);

                              yy = scalar_prod(e1, current_beam[lindex].b0y);

                              zz = scalar_prod(e1, current_beam[lindex].b0z);

       

                              angleY1 =  -atan2(zz,xx);

                              angleZ1 =  -atan2(yy,xx);

       

                              current_beam[lindex].Theta1 = acos(xx);

       

                              omY1 = (angleY1 - current_beam[lindex].angle1_Y) / dt;

                              omZ1 = (angleZ1 - current_beam[lindex].angle1_Z) / dt;

       

                              current_beam[lindex].angle1_Y = angleY1;

                              current_beam[lindex].angle1_Z = angleZ1;

       

                              Qz = 3.0 * (current_beam[lindex].bending * (angleZ1 + angleZ0) + gammab * (omZ1 + omZ0)) / current_beam[lindex].length0;

                              Tz = current_beam[lindex].bending * (angleZ1 - angleZ0) + gammab * (omZ1 - omZ0);

                              

                              Qy = 3.0 * (current_beam[lindex].bending * (angleY1 + angleY0) + gammab * (omY1 + omY0)) / current_beam[lindex].length0;

                              Ty = current_beam[lindex].bending * (angleY1 - angleY0) + gammab * (omY1 - omY0);

       

                              magnit = Ty - Qy*length;

                              t0y = scalar_prod_vec3d(magnit, current_beam[lindex].b0y);

       

                              magnit = -(Ty + Qy * length);

       

                              magnit = Tz - Qz*length;

                              t0z = scalar_prod_vec3d(magnit, current_beam[lindex].b0z);

       

                              magnit = -(Tz + Qz * length);

       

                              shr0yS = scalar_prod_vec3d(-Qy, current_beam[lindex].b0z);

                              shr0zS = scalar_prod_vec3d(-Qz, current_beam[lindex].b0y);

       

                              shr0yS = invrot_vector_3d(shr0yS, current_particle.ROTA);

                              shr0zS = invrot_vector_3d(shr0zS, current_particle.ROTA);

       

                              t0.x = t0x.x - t0y.x + t0z.x;

                              t0.y = t0x.y - t0y.y + t0z.y;

                              t0.z = t0x.z - t0y.z + t0z.z;

       

                              current_particle.tx += t0.x;

                              current_particle.ty += t0.y;

                              current_particle.tz += t0.z;

       

                              current_particle.fx = current_particle.fx + shr0yS.x + shr0zS.x;

                              current_particle.fy = current_particle.fy + shr0yS.y + shr0zS.y;

                              current_particle.fz = current_particle.fz + shr0yS.z + shr0zS.z;

       

                              energy = 0.5 * fn * fn / current_beam[lindex].longit +

                                      0.5 * current_beam[lindex].torsion * angle_torsion * angle_torsion +

                                      current_beam[lindex].bending * (angleY0 * angleY0 + angleY1 * angleY1 + angleY0 * angleY1 +

                                      angleZ0 * angleZ0 + angleZ1 * angleZ1 + angleZ0 * angleZ1);

       

                              current_beam[lindex].Energy = energy;

                              double test1 = ffz + shr0yS.z + shr0zS.z;

       

                              if (current_particle.PState == 1 && Particles[c1].PState == 0) {

                                      platefZ[part_index] += test1;

                              }      

       

                             Beamlist[beam_index] = current_beam[lindex];

                      }

              }

              Particles[part_index] = current_particle;                      

      }

       

      Az üzenetet szerkesztette: Zoltan Janosi