3 Replies Latest reply on Sep 30, 2009 11:45 AM by dabrunhosa

    Calling Gather Kernel from Parent Kernel

    dabrunhosa

      I'm currently having a problem when I call a gather kernel from another kernel. The .br code follows below:

      // Gather kernel: computes one explicit finite-difference update for the
      // element selected by instance().x.
      //   input         - gather array holding the current values
      //   output        - value written for this element
      //   beta          - coefficient supplied by the caller
      //   inverted_beta - multiplied into the stencil result (caller passes 1 / beta)
      //   qtd_loop      - elements whose index is greater than this are written as 0
      kernel void calcular_eq_calor(double input[], out double output<>, double beta, double inverted_beta, double qtd_loop)
      {
          double beta_in;

          int i = instance().x;
          int j = i + 1;
          int l = i - 1;

          beta_in = 2.0 - beta;

          // An element that is exactly zero is left at zero; the neighbours
          // input[l] and input[j] are only read when this test fails.
          if(input[i] == 0.0)
              output = 0.0;
          else
          {
              if(qtd_loop < (double)i)
                  output = 0.0;
              else
                  output = (input[j] - (beta_in * input[i]) + input[l]) * inverted_beta;
          }
      }

      // Parent kernel: calls calcular_eq_calor twice, first with (input, output)
      // and then with the two streams swapped.
      // NOTE(review): both parameters are declared as out streams (<>), while
      // calcular_eq_calor declares its first parameter as a gather array
      // (double input[]) and reads instance(). Confirm against the Brook+ rules
      // for calling one kernel from another before relying on this.
      kernel void teste(out double input<>, out double output<>, double beta, double inverted_beta, double qtd_loop)
      {
          calcular_eq_calor(input,output,beta,inverted_beta,qtd_loop);

          calcular_eq_calor(output,input,beta,inverted_beta,qtd_loop);

      }

       

      I want to call the gather kernel to calculate all the elements in input, and then I want to take the output of this kernel and use it as the input of the next call.

      The host code follows below :

      #include "Eq_Calor.h"
      #include "brookgenfiles/eq_calor_gpu.h"

      // Timer shared by the CPU and GPU measurements.
      CPerfCounter* timer;

      // For the heat equation to be stable, the following condition must
      // be satisfied: dt <= (dx*dx)/(2*alpha)
      const double dt = 0.00005; // Time step
      const double dx = 0.1;  // Grid spacing in x
      const double t_final = 1;
      const double x_final = 1;
      const double alpha = 1;
      // Number of grid intervals plus one, derived from dx.
      int nx = (int)(1/dx) + 1;
      // Coefficient used by the update formula.
      double beta = (alpha*dx*dx)/dt;


      // Statically sized host arrays: current values and scratch space for the new values.
      double _input[100000000] , _input2[100000000];
      int _length;
      float _count;
      // Heap buffers allocated with malloc.
      double* _output_gpu;
      double* _input_gpu;
      float _result;


      // Constructs the sample: stores num_processos in _length and resets the
      // counters and buffer pointers to their empty values.
      Eq_Calor::Eq_Calor(int num_processos)
      {
          _length = num_processos;

          _count = 0.0f;
          _result = 0.0f;

          _output_gpu = NULL;
          _input_gpu = NULL;
      }

      ////////////////////////////////////////////////////////////////////////////////
      //
      //  \Equal CPU Code
      //
      ////////////////////////////////////////////////////////////////////////////////

      void eq_calor_cpu()
      {
          long int i , m;
          double x , t;

          t = 0;
          m = 0;

         while (t <= t_final)
         {
            // ======================== BEGIN ========================
            i = 1;
            x = (double) i*dx;
            while (x < x_final)
            {
               // ============== BEGIN =============
                _input2 = (_input[i+1] - (2 - beta)*_input + _input[i-1])/beta;         
                // =============== END ===============
                i++;
                x = (double) i*dx;
            }
            // Passar os novos valores para o vetor U
            for (i = 1 ; i < nx; i++)
            {
               _input = _input2;
            }
            // ========================= END =========================
            m++;
            t = (double) m*dt;
         }
      }



      void PreencherStream()
      {
          _input_gpu = (double*) malloc(sizeof(double));
          _output_gpu = (double*) malloc(sizeof(double));
          double x;
          for(int i = 0; i< nx; i++)
          {
              x = (double) i*dx;
              _input =  exp(-((x - 0.5)*(x - 0.5))/(0.01));
              _input_gpu
      = exp(-((x - 0.5)*(x - 0.5))/(0.01));
          }

          // Condicao de contorno de DIRICHLET
         _input[0] = 0;
         _input[nx] = 0;
         _input_gpu[0] = 0;
         _input_gpu[nx] = 0;
      }

      ////////////////////////////////////////////////////////////////////////////////
      //!
      //! \brief  backend implementation for the sample
      //!
      ////////////////////////////////////////////////////////////////////////////////

      bool Eq_Calor::run()
      {
          unsigned int retVal = 0;
          timer = new CPerfCounter();
          double t = 0;
          double m = 0;
          double inverted_beta = 1 / beta;
          double tam_aplicacao = nx + 1;
          double qtd_loop = (x_final / dx) - 1;
          /////////////////////////////////////////////////////////////////////////
          // Brook code block
          /////////////////////////////////////////////////////////////////////////
          {
              unsigned int dim[] = {tam_aplicacao};
              ::brook::Stream<double> inputStream(1, dim);
              ::brook::Stream<double> outputStream(1, dim);

              PreencherStream();
             
              inputStream.read(_input_gpu);

              printf ("\n\nCondicao Inicial\n\n");
              for (int i = 0 ; i< tam_aplicacao; i++)
              {
                  printf ("%.20f\n" , _input);
              }

              timer->Start();
              eq_calor_cpu();
              timer->Stop();
              cout<<"\nPassaram "<<timer->GetElapsedTime()<<" unidades de tempo no execucao na CPU";

              printf ("\n\nResultado FINAL\n\n");
              for (int i = 0 ; i < tam_aplicacao; i++)
              {
                  printf ("%.20f\n" , _input
      );
              }

              timer->Reset();

              double output_teste[12];

              timer->Start();
              while (t <= t_final)
              {
                  calcular_eq_calor(inputStream,outputStream,beta,inverted_beta,qtd_loop);
                  atualizar_input(outputStream,inputStream);

                  m++;
                  t = m * dt;
              }
              timer->Stop();
              cout<<"\nPassaram "<<timer->GetElapsedTime()<<" unidades de tempo no execucao na GPU\n";

              outputStream.write(output_teste);

              printf ("\n\nResultado FINAL\n\n");
              for (int i = 0 ; i < tam_aplicacao; i++)
              {
                  printf ("%.20f\n" , output_teste);
              }
             
              // Handle errors if any
              if(outputStream.error())
              {
                  std::cout << "Error occured" << std::endl;
                  std::cout << outputStream.errorLog() << std::endl;
              }
          }

          /////////////////////////////////////////////////////////////////////////
          // Print results
          /////////////////////////////////////////////////////////////////////////

         
          system("pause");
             
          return true;
      }

       

      The error that I get is:

      >ERROR: ASSERT(func.NumArgs() == (int)mChildElements.size()) failed
      1>While processing <buffer>:196
      1>In compiler at AST::ArgList::LoadFunctionArgs()[astcontrol.cpp:426]
      1>  func.NumArgs() = 6
      1>  mChildElements.size() = 5
      1>Message: Invalid function call
      1>Aborting...

        • Calling Gather Kernel from Parent Kernel
          gaurav.garg

          The complete stream cannot be passed to a sub-kernel. Similarly, use of the instance() intrinsic is not allowed inside a sub-kernel.

            • Calling Gather Kernel from Parent Kernel
              dabrunhosa

              Can I create a 2D stream and pass a single line to the sub-kernel? How can I pass a single line?

              Thanks for your help.

                • Calling Gather Kernel from Parent Kernel
                  dabrunhosa

                  The kernels below do work:

                  // Stream kernel: computes one update from a float3 element that
                  // carries the three values of the stencil.
                  //   input         - x and z are the two neighbour values (filled by atualizar_input), y is the element itself
                  //   output        - value written for this element
                  //   beta          - coefficient supplied by the caller
                  //   inverted_beta - multiplied into the stencil result
                  //   qtd_loop      - elements whose index is greater than this are written as 0
                  kernel void calcular_eq_calor(float3 input<>, out float output<>, float beta, float inverted_beta, float qtd_loop)
                  {
                      int i = instance().x;
                      float beta_in = 2.0f - beta;

                      // An element that is exactly zero is left at zero.
                      if(input.y == 0.0f)
                          output = 0.0f;
                      else if(qtd_loop < (float)i)
                          output = 0.0f;
                      else
                          output = (input.x - (beta_in * input.y) + input.z) * inverted_beta;
                  }

                  // Gather kernel: packs each element and its two neighbours into a
                  // float3 for the next call of calcular_eq_calor.
                  //   input    - gather array holding the values just computed
                  //   qtd_loop - elements whose index is greater than this are written as zeros
                  //   output   - (input[i - 1], input[i], input[i + 1]) in x, y, z
                  kernel void atualizar_input(float input[], float qtd_loop, out float3 output<>)
                  {
                      int i = instance().x;
                      int j = i - 1;
                      int l = i + 1;

                      // An element that is exactly zero produces an all-zero triple; the
                      // neighbours are only read when this test fails.
                      if(input[i] == 0.0f)
                          output = float3(0.0f,0.0f,0.0f);
                      else
                      {
                          if(qtd_loop < (float)i)
                              output = float3(0.0f,0.0f,0.0f);
                          else
                          {
                              output.x = input[j];
                              output.y = input[i];
                              output.z = input[l];
                          }
                      }
                  }

                   

                  The host Code :

                  while (t <= t_final)
                          {
                              calcular_eq_calor(inputStream,outputStream,beta,inverted_beta,qtd_loop);
                              atualizar_input(outputStream,qtd_loop,inputStream);

                              m++;
                              t = m * dt;
                          }

                  I am trying to move the host code to the GPU because, when I compare the performance of the GPU with the CPU, the GPU is 1000x slower. I think this is because I calculate one line and return to the CPU, so I waste time on the data transfer from the GPU to the CPU.