# Memory Leak or what? Help

**ryta1203**Jan 13, 2009 2:10 PM

1. I have my code below. Is there a known memory leak in Brook+? I am seeing memory usage increase with every iteration of the code below. The program gets significantly slower with every 100 iterations that pass (I notice because I print out the results every 100 iterations) and really starts to bog down around 1500-2000 iterations. My program needs to run ~84000 iterations. Is there a memory leak I am missing? Does Brook+ create a new instance of a stream every time a kernel that uses that stream is called? If such a stream is never written back, this might cause a memory leak. Do any of you Brook+ guys have any ideas?

2. Also, the performance is VERY slow compared to a VERY similar GPU solution using CUDA. My question here is this: Will AMD/ATI be releasing a profiler for their SDK? The "Shader Analyzer" is very limited and simply not that user friendly/readable.

3. The "copy..." kernels are needed because bidirectional (read-and-write) streams are not allowed.

Here is my code:

/*
 * mcollid -- host-side driver: uploads the input arrays into streams, then
 * repeatedly runs the kernel sequence until the relative L2 change of the
 * velocity field between two consecutive checks is <= ERR.
 *
 * Parameters
 *   F1to4, F5to8, F9,
 *   f1to4, f5to8, f9 : host float4 arrays uploaded once into gx*gy-element
 *                      streams (never written back by this routine).
 *   s, GEO, e, r     : host arrays uploaded once into streams of 9, gx*gy,
 *                      18 and 9 elements respectively.
 *   G, bk            : scalars forwarded unchanged to the kernels.
 *   gx, gy           : stream extent is gx*gy; gx is also the row stride
 *                      used for host indexing (index = x + gx*y).
 *   mx, my           : host-side norms cover x in [1,mx], y in [1,my].
 *   p, u, v          : in: initial fields; out: refreshed from the device on
 *                      every convergence check (so they hold the latest
 *                      result when the routine returns).
 *   p1, u1, v1       : in/out: fields from the previous check; compared
 *                      against p/u/v and then overwritten with them.
 *   ERR              : convergence threshold on the velocity error.
 *
 * Convergence is checked on every step below 100 and afterwards on every
 * 100th step.
 */
void mcollid(float4 F1to4[], float4 F5to8[], float4 F9[], float4 f1to4[], float4 f5to8[], float4 f9[], float s[], float GEO[], float G, int gx, int gy,
             int e[], int r[], int mx, int my, int bk, float p[], float u[], float v[], float p1[], float u1[], float v1[], float ERR)
{
    int size = gx*gy, step, x=0, y=0;
    int idx;
    float Norm1, Norm2, error1, error2;
    float velRef, rhoRef;   /* reference sums: denominators of the relative norms */
    float du, dv, dp;

    /* Two sets of streams per field: a kernel cannot read and write the same stream. */
    float4 Fs1to4_1 < size >;
    float4 Fs1to4_2 < size >;
    float4 Fs5to8_1 < size >;
    float4 Fs5to8_2 < size >;
    float4 Fs9_1 < size >;
    float4 Fs9_2 < size >;
    float4 fs1to4_1 < size >;
    float4 fs5to8_1 < size >;
    float4 fs9_1 < size >;
    float4 fs1to4_2 < size >;
    float4 fs5to8_2 < size >;
    float4 fs9_2 < size >;
    float GEOs < size >;
    float ss < 9 >;
    float rs < 9 >;
    int es < 18 >;
    float ps_1 < size >;
    float us_1 < size >;
    float vs_1 < size >;
    float ps_2 < size >;
    float us_2 < size >;
    float vs_2 < size >;

    /* One-time host -> device upload of all inputs. */
    streamRead(ps_1, p);
    streamRead(us_1, u);
    streamRead(vs_1, v);
    streamRead(fs1to4_1, f1to4);
    streamRead(fs5to8_1, f5to8);
    streamRead(fs9_1, f9);
    streamRead(Fs1to4_1, F1to4);
    streamRead(Fs5to8_1, F5to8);
    streamRead(Fs9_1, F9);
    streamRead(ss, s);
    streamRead(GEOs, GEO);
    /* NOTE(review): r is declared int[] but rs is a float stream -- confirm
       which element type advection1_s expects. */
    streamRead(rs, r);
    streamRead(es, e);

    step = 1;
    error1 = 1.0f;  /* seed for the loop condition; replaced at the first check */

    while (error1 > ERR)
    {
        /* Kernel pipeline; the trailing stream arguments of each call appear
           to be its outputs (listed in 9, 5to8, 1to4 order) -- verify against
           the kernel signatures. */
        mcollid_s(Fs1to4_1, Fs5to8_1, Fs9_1, fs1to4_1, fs5to8_1, fs9_1, GEOs, ss, G, Fs9_2, Fs5to8_2, Fs1to4_2);
        advection1_s(Fs1to4_2, Fs5to8_2, Fs9_2, GEOs, es, gx, gy, mx, my, rs, Fs9_1, Fs5to8_1, Fs1to4_1);
        advection2_s(Fs1to4_1, Fs5to8_1, Fs9_1, gx, gy, mx, my, bk, Fs9_2, Fs5to8_2, Fs1to4_2);
        advection3_s(Fs1to4_2, Fs5to8_2, Fs9_2, gx, gy, mx, my, bk, Fs9_1, Fs5to8_1, Fs1to4_1);
        stream_s(Fs1to4_1, Fs5to8_1, Fs9_1, fs1to4_1, fs5to8_1, fs9_1, es, gx, gy, mx, my, fs9_2, fs5to8_2, fs1to4_2);

        /* Write-back of the distribution streams is disabled:
           streamWrite(Fs1to4_1, F1to4);
           streamWrite(Fs5to8_1, F5to8);
           streamWrite(Fs9_1, F9);
           streamWrite(fs9_2, f9);
           streamWrite(fs5to8_2, f5to8);
           streamWrite(fs1to4_2, f1to4);
        */

        macro_s(fs1to4_2, fs5to8_2, fs9_2, ps_1, us_1, vs_1, es, GEOs, ps_2, us_2, vs_2);

        /* Carry this step's results over into the "_1" streams for the next step. */
        copypuv_s(ps_2, us_2, vs_2, ps_1, us_1, vs_1);
        copyf_s(fs1to4_2, fs5to8_2, fs9_2, fs9_1, fs5to8_1, fs1to4_1);

        if (step%100 == 0 || step < 100)
        {
            /* Device -> host copies happen only here: the host reads p/u/v
               solely for this check, and the loop can only terminate right
               after a check, so the arrays are still current on return. */
            streamWrite(ps_2, p);
            streamWrite(vs_2, v);
            streamWrite(us_2, u);

            /* Every accumulator starts from zero at each check; the reference
               sums must not carry over the previous error value. */
            Norm1 = 0.0f;
            Norm2 = 0.0f;
            velRef = 0.0f;
            rhoRef = 0.0f;

            for (x = 1; x <= mx; x++)
            {
                for (y = 1; y <= my; y++)
                {
                    idx = x + gx*y;
                    du = u[idx] - u1[idx];
                    dv = v[idx] - v1[idx];
                    dp = p1[idx] - p[idx];

                    Norm1  += du*du + dv*dv;
                    velRef += u[idx]*u[idx] + v[idx]*v[idx];
                    Norm2  += dp*dp;
                    rhoRef += p[idx]*p[idx];
                }
            }

            /* Relative L2 norms. If the reference field is identically zero,
               fall back to the absolute norm instead of dividing by zero
               (a NaN would silently end the loop, since NaN > ERR is false). */
            error1 = (velRef > 0.0f) ? sqrt(Norm1/velRef) : sqrt(Norm1);
            error2 = (rhoRef > 0.0f) ? sqrt(Norm2/rhoRef) : sqrt(Norm2);

            printf("error1=%e error2=%e m=%d\n\n",error1, error2,step);

            /* Remember the current fields as the reference for the next check. */
            for (x = 1; x <= mx; x++)
            {
                for (y = 1; y <= my; y++)
                {
                    idx = x + gx*y;
                    u1[idx] = u[idx];
                    v1[idx] = v[idx];
                    p1[idx] = p[idx];
                }
            }
        }

        step++;
    } //end of while (error1>ERR)
}

2. Also, the performance is VERY slow compared to a VERY similar GPU solution using CUDA. My question here is this: Will AMD/ATI be releasing a profiler for their SDK? The "Shader Analyzer" is very limited and simply not that user friendly/readable.

3. The "copy..." kernels are needed because bidirectional (read-and-write) streams are not allowed.

Here is my code:

/*
 * mcollid -- host-side driver: uploads the input arrays into streams, then
 * repeatedly runs the kernel sequence until the relative L2 change of the
 * velocity field between two consecutive checks is <= ERR.
 *
 * Parameters
 *   F1to4, F5to8, F9,
 *   f1to4, f5to8, f9 : host float4 arrays uploaded once into gx*gy-element
 *                      streams (never written back by this routine).
 *   s, GEO, e, r     : host arrays uploaded once into streams of 9, gx*gy,
 *                      18 and 9 elements respectively.
 *   G, bk            : scalars forwarded unchanged to the kernels.
 *   gx, gy           : stream extent is gx*gy; gx is also the row stride
 *                      used for host indexing (index = x + gx*y).
 *   mx, my           : host-side norms cover x in [1,mx], y in [1,my].
 *   p, u, v          : in: initial fields; out: refreshed from the device on
 *                      every convergence check (so they hold the latest
 *                      result when the routine returns).
 *   p1, u1, v1       : in/out: fields from the previous check; compared
 *                      against p/u/v and then overwritten with them.
 *   ERR              : convergence threshold on the velocity error.
 *
 * Convergence is checked on every step below 100 and afterwards on every
 * 100th step.
 */
void mcollid(float4 F1to4[], float4 F5to8[], float4 F9[], float4 f1to4[], float4 f5to8[], float4 f9[], float s[], float GEO[], float G, int gx, int gy,
             int e[], int r[], int mx, int my, int bk, float p[], float u[], float v[], float p1[], float u1[], float v1[], float ERR)
{
    int size = gx*gy, step, x=0, y=0;
    int idx;
    float Norm1, Norm2, error1, error2;
    float velRef, rhoRef;   /* reference sums: denominators of the relative norms */
    float du, dv, dp;

    /* Two sets of streams per field: a kernel cannot read and write the same stream. */
    float4 Fs1to4_1 < size >;
    float4 Fs1to4_2 < size >;
    float4 Fs5to8_1 < size >;
    float4 Fs5to8_2 < size >;
    float4 Fs9_1 < size >;
    float4 Fs9_2 < size >;
    float4 fs1to4_1 < size >;
    float4 fs5to8_1 < size >;
    float4 fs9_1 < size >;
    float4 fs1to4_2 < size >;
    float4 fs5to8_2 < size >;
    float4 fs9_2 < size >;
    float GEOs < size >;
    float ss < 9 >;
    float rs < 9 >;
    int es < 18 >;
    float ps_1 < size >;
    float us_1 < size >;
    float vs_1 < size >;
    float ps_2 < size >;
    float us_2 < size >;
    float vs_2 < size >;

    /* One-time host -> device upload of all inputs. */
    streamRead(ps_1, p);
    streamRead(us_1, u);
    streamRead(vs_1, v);
    streamRead(fs1to4_1, f1to4);
    streamRead(fs5to8_1, f5to8);
    streamRead(fs9_1, f9);
    streamRead(Fs1to4_1, F1to4);
    streamRead(Fs5to8_1, F5to8);
    streamRead(Fs9_1, F9);
    streamRead(ss, s);
    streamRead(GEOs, GEO);
    /* NOTE(review): r is declared int[] but rs is a float stream -- confirm
       which element type advection1_s expects. */
    streamRead(rs, r);
    streamRead(es, e);

    step = 1;
    error1 = 1.0f;  /* seed for the loop condition; replaced at the first check */

    while (error1 > ERR)
    {
        /* Kernel pipeline; the trailing stream arguments of each call appear
           to be its outputs (listed in 9, 5to8, 1to4 order) -- verify against
           the kernel signatures. */
        mcollid_s(Fs1to4_1, Fs5to8_1, Fs9_1, fs1to4_1, fs5to8_1, fs9_1, GEOs, ss, G, Fs9_2, Fs5to8_2, Fs1to4_2);
        advection1_s(Fs1to4_2, Fs5to8_2, Fs9_2, GEOs, es, gx, gy, mx, my, rs, Fs9_1, Fs5to8_1, Fs1to4_1);
        advection2_s(Fs1to4_1, Fs5to8_1, Fs9_1, gx, gy, mx, my, bk, Fs9_2, Fs5to8_2, Fs1to4_2);
        advection3_s(Fs1to4_2, Fs5to8_2, Fs9_2, gx, gy, mx, my, bk, Fs9_1, Fs5to8_1, Fs1to4_1);
        stream_s(Fs1to4_1, Fs5to8_1, Fs9_1, fs1to4_1, fs5to8_1, fs9_1, es, gx, gy, mx, my, fs9_2, fs5to8_2, fs1to4_2);

        /* Write-back of the distribution streams is disabled:
           streamWrite(Fs1to4_1, F1to4);
           streamWrite(Fs5to8_1, F5to8);
           streamWrite(Fs9_1, F9);
           streamWrite(fs9_2, f9);
           streamWrite(fs5to8_2, f5to8);
           streamWrite(fs1to4_2, f1to4);
        */

        macro_s(fs1to4_2, fs5to8_2, fs9_2, ps_1, us_1, vs_1, es, GEOs, ps_2, us_2, vs_2);

        /* Carry this step's results over into the "_1" streams for the next step. */
        copypuv_s(ps_2, us_2, vs_2, ps_1, us_1, vs_1);
        copyf_s(fs1to4_2, fs5to8_2, fs9_2, fs9_1, fs5to8_1, fs1to4_1);

        if (step%100 == 0 || step < 100)
        {
            /* Device -> host copies happen only here: the host reads p/u/v
               solely for this check, and the loop can only terminate right
               after a check, so the arrays are still current on return. */
            streamWrite(ps_2, p);
            streamWrite(vs_2, v);
            streamWrite(us_2, u);

            /* Every accumulator starts from zero at each check; the reference
               sums must not carry over the previous error value. */
            Norm1 = 0.0f;
            Norm2 = 0.0f;
            velRef = 0.0f;
            rhoRef = 0.0f;

            for (x = 1; x <= mx; x++)
            {
                for (y = 1; y <= my; y++)
                {
                    idx = x + gx*y;
                    du = u[idx] - u1[idx];
                    dv = v[idx] - v1[idx];
                    dp = p1[idx] - p[idx];

                    Norm1  += du*du + dv*dv;
                    velRef += u[idx]*u[idx] + v[idx]*v[idx];
                    Norm2  += dp*dp;
                    rhoRef += p[idx]*p[idx];
                }
            }

            /* Relative L2 norms. If the reference field is identically zero,
               fall back to the absolute norm instead of dividing by zero
               (a NaN would silently end the loop, since NaN > ERR is false). */
            error1 = (velRef > 0.0f) ? sqrt(Norm1/velRef) : sqrt(Norm1);
            error2 = (rhoRef > 0.0f) ? sqrt(Norm2/rhoRef) : sqrt(Norm2);

            printf("error1=%e error2=%e m=%d\n\n",error1, error2,step);

            /* Remember the current fields as the reference for the next check. */
            for (x = 1; x <= mx; x++)
            {
                for (y = 1; y <= my; y++)
                {
                    idx = x + gx*y;
                    u1[idx] = u[idx];
                    v1[idx] = v[idx];
                    p1[idx] = p[idx];
                }
            }
        }

        step++;
    } //end of while (error1>ERR)
}