I'm currently having a problem when I call a gather kernel from another kernel. The .br code follows below:
// Gather kernel: one explicit finite-difference update of the 1D heat equation
// for the element at instance().x.
//   input         - gather array with the current values; read at i - 1, i and i + 1
//   output        - updated value of element i
//   beta          - scheme coefficient; the centre weight used is (2 - beta)
//   inverted_beta - multiplied into the stencil sum (the host passes 1 / beta)
//   qtd_loop      - highest index that is updated; elements past it are written as 0
// NOTE(review): the array subscripts were lost when this code was posted; they
// are restored here to mirror the CPU loop
// (_input[i+1] - (2 - beta)*_input[i] + _input[i-1]) / beta -- confirm against
// the original .br file.
kernel void calcular_eq_calor(double input[], out double output<>, double beta, double inverted_beta, double qtd_loop)
{
    double beta_in;
    int i = instance().x;
    int j = i + 1;
    int l = i - 1;
    beta_in = 2.0 - beta;
    // An element that is exactly zero is kept at zero.
    if(input[i] == 0.0)
        output = 0.0;
    else
    {
        // Elements beyond qtd_loop are not part of the update and are zeroed.
        if(qtd_loop < (double)i)
            output = 0.0;
        else
            output = (input[j] - (beta_in * input[i]) + input[l]) * inverted_beta;
    }
}
// Wrapper kernel that tries to chain two update steps on the GPU by calling
// calcular_eq_calor as a sub-kernel: first input -> output, then output -> input.
// NOTE(review): calcular_eq_calor declares its first parameter as a gather
// array (double input[]) and reads instance(), while this kernel hands it
// plain streams. A complete stream cannot be passed to a sub-kernel and
// instance() is not allowed inside one, which is why this does not compile.
// NOTE(review): both stream parameters are declared `out`, although `input`
// is read by the first call.
kernel void teste(out double input<>, out double output<>, double beta, double inverted_beta, double qtd_loop)
{
calcular_eq_calor(input,output,beta,inverted_beta,qtd_loop);
calcular_eq_calor(output,input,beta,inverted_beta,qtd_loop);
}
I want to call the gather kernel to calculate all the elements of the input, and then take the output of that call and use it as the input of the next call.
The host code follows below:
#include "Eq_Calor.h"
#include "brookgenfiles/eq_calor_gpu.h"
// Timer used by Eq_Calor::run to measure the CPU and GPU passes.
CPerfCounter* timer;
// For the heat equation to be stable the following condition must
// be satisfied: dt <= (dx*dx)/(2*alpha)
const double dt = 0.00005; // Time step
const double dx = 0.1; // Grid spacing
const double t_final = 1; // Time at which the time-stepping loops stop
const double x_final = 1; // Upper end of the x range walked by eq_calor_cpu
const double alpha = 1;
// Number of grid points filled by PreencherStream.
int nx = (int)(1/dx) + 1;
// Coefficient of the update; eq_calor_cpu and the kernels divide the stencil sum by it.
// NOTE(review): the usual explicit scheme needs dx*dx/(alpha*dt) here; this
// expression only matches it because alpha == 1 -- confirm before changing alpha.
double beta = (alpha*dx*dx)/dt;
// Host-side working arrays for the CPU solver (current values and next values).
// NOTE(review): 100000000 doubles each (about 800 MB apiece with 8-byte doubles),
// while the visible code only touches indices 0..nx.
double _input[100000000] , _input2[100000000];
// Set by the Eq_Calor constructor.
int _length;
float _count;
// Host buffers allocated in PreencherStream; _input_gpu is uploaded to the input stream.
double* _output_gpu;
double* _input_gpu;
float _result;
// Resets the solver state: both host buffers start out unallocated, the
// counters start at zero and the requested size is stored in _length.
Eq_Calor::Eq_Calor(int num_processos)
{
    _length = num_processos;
    _result = _count = 0.0f;
    _input_gpu = _output_gpu = NULL;
}
////////////////////////////////////////////////////////////////////////////////
//
// \brief CPU reference solver.
//
// Advances _input in time with the explicit update
//   u_new[i] = (u[i+1] - (2 - beta)*u[i] + u[i-1]) / beta
// while t <= t_final. Each step writes the new values into _input2 and then
// copies them back into _input.
//
////////////////////////////////////////////////////////////////////////////////
void eq_calor_cpu()
{
    long int i , m;
    double x , t;
    t = 0;
    m = 0;
    while (t <= t_final)
    {
        // Compute the new value of every point whose x lies before x_final.
        i = 1;
        x = static_cast<double>(i)*dx;
        while (x < x_final)
        {
            // The [i] subscripts are required: without them the statement
            // would assign to the whole array and not compile.
            _input2[i] = (_input[i+1] - (2 - beta)*_input[i] + _input[i-1])/beta;
            i++;
            x = static_cast<double>(i)*dx;
        }
        // Copy the new values back into the working vector.
        for (i = 1 ; i < nx; i++)
        {
            _input[i] = _input2[i];
        }
        // Derive t from the step counter instead of accumulating dt.
        m++;
        t = static_cast<double>(m)*dt;
    }
}
void PreencherStream()
{
_input_gpu = (double*) malloc(sizeof(double));
_output_gpu = (double*) malloc(sizeof(double));
double x;
for(int i = 0; i< nx; i++)
{
x = (double) i*dx;
_input = exp(-((x - 0.5)*(x - 0.5))/(0.01));
_input_gpu = exp(-((x - 0.5)*(x - 0.5))/(0.01));
}
// Condicao de contorno de DIRICHLET
_input[0] = 0;
_input[nx] = 0;
_input_gpu[0] = 0;
_input_gpu[nx] = 0;
}
////////////////////////////////////////////////////////////////////////////////
//!
//! \brief backend implementation for the sample
//!
//! Fills the initial condition, runs and times the CPU solver, then runs and
//! times the GPU time-stepping loop and prints the values before and after.
//! \return always true
//!
////////////////////////////////////////////////////////////////////////////////
bool Eq_Calor::run()
{
    timer = new CPerfCounter();
    double t = 0;
    double m = 0;
    double inverted_beta = 1 / beta;
    double tam_aplicacao = nx + 1;
    double qtd_loop = (x_final / dx) - 1;
    /////////////////////////////////////////////////////////////////////////
    // Brook code block
    /////////////////////////////////////////////////////////////////////////
    {
        // Explicit conversion: a double inside a braced initializer is a
        // narrowing conversion.
        unsigned int dim[] = {static_cast<unsigned int>(tam_aplicacao)};
        ::brook::Stream<double> inputStream(1, dim);
        ::brook::Stream<double> outputStream(1, dim);
        PreencherStream();
        inputStream.read(_input_gpu);
        printf ("\n\nCondicao Inicial\n\n");
        for (int i = 0 ; i< tam_aplicacao; i++)
        {
            // Print one element; passing the whole array to %f is undefined behaviour.
            printf ("%.20f\n" , _input[i]);
        }
        timer->Start();
        eq_calor_cpu();
        timer->Stop();
        cout<<"\nPassaram "<<timer->GetElapsedTime()<<" unidades de tempo no execucao na CPU";
        printf ("\n\nResultado FINAL\n\n");
        for (int i = 0 ; i < tam_aplicacao; i++)
        {
            printf ("%.20f\n" , _input[i]);
        }
        timer->Reset();
        // NOTE(review): fixed at 12 elements, which equals tam_aplicacao
        // (nx + 1) only for the current value of dx.
        double output_teste[12];
        timer->Start();
        // One kernel launch per time step; the output stream is fed back into
        // the input stream before the next step.
        while (t <= t_final)
        {
            calcular_eq_calor(inputStream,outputStream,beta,inverted_beta,qtd_loop);
            atualizar_input(outputStream,inputStream);
            m++;
            t = m * dt;
        }
        timer->Stop();
        cout<<"\nPassaram "<<timer->GetElapsedTime()<<" unidades de tempo no execucao na GPU\n";
        outputStream.write(output_teste);
        printf ("\n\nResultado FINAL\n\n");
        for (int i = 0 ; i < tam_aplicacao; i++)
        {
            printf ("%.20f\n" , output_teste[i]);
        }
        // Handle errors if any
        if(outputStream.error())
        {
            std::cout << "Error occured" << std::endl;
            std::cout << outputStream.errorLog() << std::endl;
        }
    }
    /////////////////////////////////////////////////////////////////////////
    // Print results
    /////////////////////////////////////////////////////////////////////////
    system("pause");
    return true;
}
The error that I get is:
>ERROR: ASSERT(func.NumArgs() == (int)mChildElements.size()) failed
1>While processing <buffer>:196
1>In compiler at AST::ArgList::LoadFunctionArgs()[astcontrol.cpp:426]
1> func.NumArgs() = 6
1> mChildElements.size() = 5
1>Message: Invalid function call
1>Aborting...
A complete stream cannot be passed to a sub-kernel. Similarly, use of the instance() intrinsic is not allowed inside a sub-kernel.
Can I create a 2D stream and pass a single line to the sub-kernel? How can I pass a single line?
Thanks for your help.
The kernels below do work:
// Stream (non-gather) variant of the update step. Each input element is a
// float3 that carries the three stencil values; input.y is the centre value
// and input.x / input.z are the two neighbours.
//   output        - (input.x - (2 - beta) * input.y + input.z) * inverted_beta
//   qtd_loop      - elements whose index is greater than this are written as 0
// NOTE(review): m and t are declared but never used.
kernel void calcular_eq_calor(float3 input<>, out float output<>, float beta, float inverted_beta, float qtd_loop)
{
float m ,beta_in;
float t;
int i = instance().x;
beta_in = 2.0f - beta;
// A centre value that is exactly zero is kept at zero.
if(input.y == 0.0f)
output = 0.0f;
else
{
// Elements beyond qtd_loop are zeroed instead of updated.
if(qtd_loop < (float)i)
output = 0.0f;
else
output = (input.x - (beta_in * input.y) + input.z) * inverted_beta;
}
}
// Gather kernel that rebuilds the float3 input stream for the next step: for
// element i it packs the values at i - 1, i and i + 1 of `input` into
// output.x, output.y and output.z.
//   input    - gather array with the values produced by calcular_eq_calor
//   qtd_loop - elements whose index is greater than this are written as zeros
//   output   - float3 stream consumed by calcular_eq_calor
// NOTE(review): the closing parenthesis of the signature and the array
// subscripts were lost when this code was posted and are restored here. Which
// neighbour goes to .x and which to .z is a guess; calcular_eq_calor adds
// input.x and input.z, so the result does not depend on it.
kernel void atualizar_input(float input[], float qtd_loop, out float3 output<>)
{
    int i = instance().x;
    int j = i - 1;
    int l = i + 1;
    // An element that is exactly zero produces an all-zero triple.
    if(input[i] == 0.0f)
        output = float3(0.0f,0.0f,0.0f);
    else
    {
        if(qtd_loop < (float)i)
            output = float3(0.0f,0.0f,0.0f);
        else
        {
            output.x = input[j];
            output.y = input[i];
            output.z = input[l];
        }
    }
}
The host code:
// Host-side time stepping: every iteration launches the update kernel and then
// the kernel that repacks its output into the input stream for the next step.
// The time t is derived from the step counter m.
while (t <= t_final)
{
calcular_eq_calor(inputStream,outputStream,beta,inverted_beta,qtd_loop);
atualizar_input(outputStream,qtd_loop,inputStream);
m++;
t = m * dt;
}
I am trying to move the host loop onto the GPU because, when I compare the performance of the GPU with the CPU, the GPU is 1000x slower. I think this is because I calculate one line and then return to the CPU, so I waste time transferring data between the GPU and the CPU.