Archives Discussions

dabrunhosa · ‎09-29-2009

I'm currently having a problem when I call a gather kernel from another kernel. The .br code follows below:

// Gather kernel: computes one explicit finite-difference step for the element
// selected by instance().x.
//
//   input         - gather array holding the current values
//   output        - stream receiving the updated value of element i
//   beta          - scheme coefficient (the host passes (alpha*dx*dx)/dt)
//   inverted_beta - 1 / beta, precomputed on the host
//   qtd_loop      - highest index that is still updated; larger indices get 0
//
// The forum markup had swallowed the "[i]"-style subscripts; they are restored
// here to match the CPU reference formula
//   (_input[i+1] - (2 - beta)*_input[i] + _input[i-1]) / beta
kernel void calcular_eq_calor(double input[], out double output<>, double beta, double inverted_beta, double qtd_loop)
{
    double beta_in;

    int i = instance().x; // element handled by this kernel instance
    int j = i + 1;        // right neighbour
    int l = i - 1;        // left neighbour

    beta_in = 2.0 - beta;

    // Elements that are exactly zero are written back as zero without reading
    // any neighbour; with the host's boundary value input[0] == 0 this also
    // keeps input[-1] from being gathered.
    if(input[i] == 0.0)
        output = 0.0;
    else
    {
        // Elements past the last updated index are forced to zero.
        if(qtd_loop < (double)i)
            output = 0.0;
        else
            output = (input[j] - (beta_in * input[i]) + input[l]) * inverted_beta;
    }
}

// Parent kernel that attempts two consecutive steps by calling
// calcular_eq_calor twice: input -> output, then output -> input.
// Both `input` and `output` are declared here as `out` streams (`<>`).
// NOTE(review): calcular_eq_calor declares its first parameter as a gather
// array (`input[]`) and uses instance(); the reply in this thread says a
// complete stream cannot be passed to a sub-kernel and instance() is not
// allowed inside one. This is the code that produces the compiler ASSERT
// quoted further down.
kernel void teste(out double input<>, out double output<>, double beta, double inverted_beta, double qtd_loop)
{
    // First step: read `input`, write `output`.
    calcular_eq_calor(input,output,beta,inverted_beta,qtd_loop);

// Second step: use the previous result as the source and write back into `input`.
calcular_eq_calor(output,input,beta,inverted_beta,qtd_loop);

}

I want to call the gather kernel to calculate all the elements in input, and then I want to take the output of that call and use it as the input of the next call.

The host code follows below :

#include "Eq_Calor.h"
#include "brookgenfiles/eq_calor_gpu.h"

// File-level performance counter; Eq_Calor::run() allocates it and uses it to
// time the CPU and the GPU runs.
CPerfCounter* timer;

// For the explicit heat-equation scheme to be stable the following
// condition must hold: dt <= (dx*dx)/(2*alpha)
const double dt = 0.00005; // time step
const double dx = 0.1; // grid spacing
const double t_final = 1; // the time loops run while t <= t_final
const double x_final = 1; // right end of the spatial domain
const double alpha = 1; // diffusion coefficient
// Number of grid points on [0, x_final]. The quotient is rounded before the
// cast so that a value such as 9.999... is not truncated to 9.
int nx = (int)(x_final/dx + 0.5) + 1;
// beta = 1/r with r = alpha*dt/(dx*dx); the update used by the solvers is
//   u_new[i] = (u[i+1] - (2 - beta)*u[i] + u[i-1]) / beta
// alpha therefore belongs in the denominator (identical result for alpha == 1).
double beta = (dx*dx)/(alpha*dt);

// Host-side solution arrays: the current values and the scratch copy that
// receives the next time step.
double _input[100000000];
double _input2[100000000];

// Heap buffers filled by PreencherStream() and handed to the Brook streams.
double* _input_gpu;
double* _output_gpu;

// Miscellaneous sample state, reset by the Eq_Calor constructor.
int _length;
float _count;
float _result;

// Builds the sample object: stores the requested size and resets the
// file-level state to its "nothing computed yet" values.
Eq_Calor::Eq_Calor(int num_processos)
{
    // Size requested by the caller.
    _length = num_processos;

    // No results so far.
    _count = 0.0f;
    _result = 0.0f;

    // The stream buffers are allocated later, in PreencherStream().
    _output_gpu = NULL;
    _input_gpu = NULL;
}

////////////////////////////////////////////////////////////////////////////////
//
// \Equal CPU Code
//
////////////////////////////////////////////////////////////////////////////////

void eq_calor_cpu()
{
    long int i , m;
    double x , t;

    t = 0;
    m = 0;

   while (t <= t_final)
   {
      // ======================== BEGIN ========================
      i = 1;
      x = (double) i*dx;
      while (x < x_final)
      {
         // ============== BEGIN =============
          _input2 = (_input[i+1] - (2 - beta)*_input + _input[i-1])/beta;
          // =============== END ===============
          i++;
          x = (double) i*dx;
      }
      // Passar os novos valores para o vetor U
      for (i = 1 ; i < nx; i++)
      {
         _input = _input2;
      }
      // ========================= END =========================
      m++;
      t = (double) m*dt;
   }
}

void PreencherStream()
{
    _input_gpu = (double*) malloc(sizeof(double));
    _output_gpu = (double*) malloc(sizeof(double));
    double x;
    for(int i = 0; i< nx; i++)
    {
        x = (double) i*dx;
        _input = exp(-((x - 0.5)*(x - 0.5))/(0.01));
        _input_gpu = exp(-((x - 0.5)*(x - 0.5))/(0.01));
    }

    // Condicao de contorno de DIRICHLET
   _input[0] = 0;
   _input[nx] = 0;
   _input_gpu[0] = 0;
   _input_gpu[nx] = 0;
}

////////////////////////////////////////////////////////////////////////////////
//!
//! \brief backend implementation for the sample
//!
////////////////////////////////////////////////////////////////////////////////

// Runs the sample: fills the initial condition, solves on the CPU (timed),
// then runs the same number of steps through the Brook kernels (timed) and
// prints both results. Always returns true.
//
// Fixes relative to the posted version:
//  * the element count is an unsigned int instead of a double, so it can
//    initialise `dim` without a narrowing conversion and serve as loop bound;
//  * the print loops pass one element (`[i]`) to "%.20f" instead of the whole
//    array;
//  * the read-back buffer is sized from the element count instead of a
//    hard-coded 12;
//  * the unused local `retVal` is gone.
bool Eq_Calor::run()
{
    // NOTE(review): this counter is not deleted anywhere in the code shown here.
    timer = new CPerfCounter();
    double t = 0;
    double m = 0;
    double inverted_beta = 1 / beta;
    const unsigned int tam_aplicacao = (unsigned int)(nx + 1);
    double qtd_loop = (x_final / dx) - 1;
    /////////////////////////////////////////////////////////////////////////
    // Brook code block
    /////////////////////////////////////////////////////////////////////////
    {
        unsigned int dim[] = {tam_aplicacao};
        ::brook::Stream<double> inputStream(1, dim);
        ::brook::Stream<double> outputStream(1, dim);

        PreencherStream();

        inputStream.read(_input_gpu);

        printf ("\n\nCondicao Inicial\n\n");
        for (unsigned int i = 0 ; i < tam_aplicacao; i++)
        {
            printf ("%.20f\n" , _input[i]);
        }

        // CPU reference run.
        timer->Start();
        eq_calor_cpu();
        timer->Stop();
        std::cout<<"\nPassaram "<<timer->GetElapsedTime()<<" unidades de tempo no execucao na CPU";

        printf ("\n\nResultado FINAL\n\n");
        for (unsigned int i = 0 ; i < tam_aplicacao; i++)
        {
            printf ("%.20f\n" , _input[i]);
        }

        timer->Reset();

        // Host buffer that receives the contents of outputStream.
        double* output_teste = new double[tam_aplicacao];

        // GPU run: one kernel pair per time step; t is derived from the step
        // counter m, as in the CPU solver.
        timer->Start();
        while (t <= t_final)
        {
            calcular_eq_calor(inputStream,outputStream,beta,inverted_beta,qtd_loop);
            atualizar_input(outputStream,inputStream);

            m++;
            t = m * dt;
        }
        timer->Stop();
        std::cout<<"\nPassaram "<<timer->GetElapsedTime()<<" unidades de tempo no execucao na GPU\n";

        outputStream.write(output_teste);

        printf ("\n\nResultado FINAL\n\n");
        for (unsigned int i = 0 ; i < tam_aplicacao; i++)
        {
            printf ("%.20f\n" , output_teste[i]);
        }

        delete[] output_teste;

        // Handle errors if any
        if(outputStream.error())
        {
            std::cout << "Error occured" << std::endl;
            std::cout << outputStream.errorLog() << std::endl;
        }
    }

    /////////////////////////////////////////////////////////////////////////
    // Print results
    /////////////////////////////////////////////////////////////////////////


    system("pause");

    return true;
}

The error that I get is:

>ERROR: ASSERT(func.NumArgs() == (int)mChildElements.size()) failed
1>While processing <buffer>:196
1>In compiler at AST::ArgList::LoadFunctionArgs()[astcontrol.cpp:426]
1> func.NumArgs() = 6
1> mChildElements.size() = 5
1>Message: Invalid function call
1>Aborting...

gaurav_garg · ‎09-30-2009

A complete stream cannot be passed to a sub-kernel. Similarly, use of the instance() intrinsic is not allowed inside a sub-kernel.

dabrunhosa · ‎09-30-2009

Can I create a 2D stream and pass a single line of it to the sub-kernel? How can I pass a single line?

Thanks for your help.

dabrunhosa · ‎09-30-2009

The kernels below do work:

// Stream (non-gather) version of the update step. Each input element is a
// float3 whose .y is tested against zero and scaled by (2 - beta), and whose
// .x and .z are the two values added around it; atualizar_input builds these
// triples from the previous output.
//
//   input         - stream of float3 triples
//   output        - stream receiving the updated value
//   beta          - scheme coefficient
//   inverted_beta - multiplied into the result in place of a division by beta
//   qtd_loop      - highest index that is still updated; larger indices get 0
kernel void calcular_eq_calor(float3 input<>, out float output<>, float beta, float inverted_beta, float qtd_loop)
{
    // m and t are declared but never used in this kernel.
    float m ,beta_in;
    float t;

    // Position of the element being processed.
    int i = instance().x;

    beta_in = 2.0f - beta;

    // A centre value of exactly zero is written back as zero.
    if(input.y == 0.0f)
        output = 0.0f;
    else
    {
        // Elements past the last updated index are forced to zero.
        if(qtd_loop < (float)i)
            output = 0.0f;
        else
            output = (input.x - (beta_in * input.y) + input.z) * inverted_beta;
    }
}

// Gather kernel that rebuilds the float3 input stream for the next step from
// the scalar results of the previous one: each output element packs the left
// neighbour, the element itself and the right neighbour.
//
//   input    - gather array with the values produced by calcular_eq_calor
//   qtd_loop - highest index that is still updated; larger indices get zeros
//   output   - stream of (left, centre, right) triples
//
// Fixes relative to the posted version: the parameter list was missing its
// closing ')', and the "[...]" subscripts had been swallowed by the forum
// markup.
// NOTE(review): the x/z assignment order is reconstructed from the order in
// which j and l are declared; calcular_eq_calor adds .x and .z symmetrically,
// so either order describes the same stencil - confirm against the original.
kernel void atualizar_input(float input[], float qtd_loop,out float3 output<>)
{
    int i = instance().x; // element handled by this kernel instance
    int j = i - 1;        // left neighbour
    int l = i + 1;        // right neighbour

    // Zero-valued elements produce an all-zero triple without gathering any
    // neighbour.
    if(input[i] == 0.0f)
        output = float3(0.0f,0.0f,0.0f);
    else
    {
        // Elements past the last updated index are forced to zero.
        if(qtd_loop < (float)i)
            output = float3(0.0f,0.0f,0.0f);
        else
        {
            output.x = input[j];
            output.y = input[i];
            output.z = input[l];
        }
    }
}

The host Code :

// Host-side time loop: each iteration launches the two kernels in sequence and
// then advances the step counter.
while (t <= t_final)
        {
            // Compute the new values from the current float3 input stream.
            calcular_eq_calor(inputStream,outputStream,beta,inverted_beta,qtd_loop);
            // Repack those values into inputStream for the next iteration.
            atualizar_input(outputStream,qtd_loop,inputStream);

            // t is derived from the step counter m.
            m++;
            t = m * dt;
        }

I am trying to move the host loop onto the GPU because, when I compare the performance of the GPU against the CPU, the GPU is 1000X slower. I think this is because I calculate one line and then return to the CPU, so I waste time on data transfers between the GPU and the CPU.

Archives Discussions

Calling Gather Kernel from Parent Kernel