1. I have my code below. Is there a known memory leak in Brook+? I am getting a memory leak (increased memory usage with every iteration) from the code below. The program gets significantly slower with every passing 100 iterations (I notice because I am printing out the results every 100 iterations) and really starts to bog down around 1500-2000 iterations. My program needs to run ~84000 iterations. Is there some memory leak I am missing or what? Does Brook+ create a new instance of a stream every time a kernel that uses that stream is called? If the stream is not written back, that might cause a memory leak. Any of you Brook+ guys have any ideas?

2. Also, the performance is VERY slow compared to a VERY similar GPU solution using CUDA. My question here is this: Will AMD/ATI be releasing a profiler for their SDK? The "Shader Analyzer" is very limited and simply not that user friendly/readable.

3. The "copy..." streams are needed since there is no bidirectional streams allowed.

Here is my code:

// Host-side driver for a 2-D D2Q9-style lattice-Boltzmann solver on Brook+.
// Uploads distribution functions and macroscopic fields into GPU streams,
// then iterates the collide/advect/stream/macro kernel pipeline until the
// relative L2 velocity norm drops below ERR.
//
// Host arrays are gx*gy unless noted:
//   F1to4/F5to8/F9, f1to4/f5to8/f9 : packed distribution functions
//   s[9]     : relaxation parameters       GEO  : geometry/obstacle flags
//   G        : body-force constant         gx,gy: padded grid dimensions
//   e[18]    : lattice velocity vectors    r[9] : lattice data (int -- see NOTE)
//   mx,my    : interior grid extents       bk   : boundary flag
//   p/u/v    : density and velocity (outputs; refreshed at every check)
//   p1/u1/v1 : previous-check snapshots used for the L2 norms
//   ERR      : convergence tolerance on the velocity norm
void mcollid(float4 F1to4[], float4 F5to8[], float4 F9[], float4 f1to4[], float4 f5to8[], float4 f9[], float s[], float GEO[], float G, int gx, int gy,
int e[], int r[], int mx, int my, int bk, float p[], float u[], float v[], float p1[], float u1[], float v1[], float ERR)
{
    int size = gx*gy, step, x = 0, y = 0;
    float Norm1, Norm2, error1, error2;

    // Device streams. The "_1"/"_2" pairs exist only to ping-pong kernel
    // input/output, since Brook+ forbids using a stream as both in and out.
    float4 Fs1to4_1<size>;
    float4 Fs1to4_2<size>;
    float4 Fs5to8_1<size>;
    float4 Fs5to8_2<size>;
    float4 Fs9_1<size>;
    float4 Fs9_2<size>;
    float4 fs1to4_1<size>;
    float4 fs5to8_1<size>;
    float4 fs9_1<size>;
    float4 fs1to4_2<size>;
    float4 fs5to8_2<size>;
    float4 fs9_2<size>;
    float GEOs<size>;
    float ss<9>;
    float rs<9>;   // NOTE(review): r is int[] on the host but rs is a float
                   // stream -- confirm this int->float upload is intended.
    int es<18>;
    float ps_1<size>;
    float us_1<size>;
    float vs_1<size>;
    float ps_2<size>;
    float us_2<size>;
    float vs_2<size>;

    // One-time upload of all host data to the GPU.
    streamRead(ps_1, p);
    streamRead(us_1, u);
    streamRead(vs_1, v);
    streamRead(fs1to4_1, f1to4);
    streamRead(fs5to8_1, f5to8);
    streamRead(fs9_1, f9);
    streamRead(Fs1to4_1, F1to4);
    streamRead(Fs5to8_1, F5to8);
    streamRead(Fs9_1, F9);
    streamRead(ss, s);
    streamRead(GEOs, GEO);
    streamRead(rs, r);
    streamRead(es, e);

    step = 1;
    Norm1 = 1.0f;
    Norm2 = 1.0f;
    error1 = 1.0f; // L2-norm error for velocity (drives the loop condition)
    error2 = 1.0f; // L2-norm error for density (reported only)

    while (error1 > ERR)
    {
        // Kernel pipeline; each call ping-pongs the "_1"/"_2" stream pairs.
        mcollid_s(Fs1to4_1, Fs5to8_1, Fs9_1, fs1to4_1, fs5to8_1, fs9_1, GEOs, ss, G, Fs9_2, Fs5to8_2, Fs1to4_2);
        advection1_s(Fs1to4_2, Fs5to8_2, Fs9_2, GEOs, es, gx, gy, mx, my, rs, Fs9_1, Fs5to8_1, Fs1to4_1);
        advection2_s(Fs1to4_1, Fs5to8_1, Fs9_1, gx, gy, mx, my, bk, Fs9_2, Fs5to8_2, Fs1to4_2);
        advection3_s(Fs1to4_2, Fs5to8_2, Fs9_2, gx, gy, mx, my, bk, Fs9_1, Fs5to8_1, Fs1to4_1);
        stream_s(Fs1to4_1, Fs5to8_1, Fs9_1, fs1to4_1, fs5to8_1, fs9_1, es, gx, gy, mx, my, fs9_2, fs5to8_2, fs1to4_2);
        macro_s(fs1to4_2, fs5to8_2, fs9_2, ps_1, us_1, vs_1, es, GEOs, ps_2, us_2, vs_2);
        // The copy kernels feed this iteration's outputs back as next
        // iteration's inputs -- needed because streams are unidirectional.
        copypuv_s(ps_2, us_2, vs_2, ps_1, us_1, vs_1);
        copyf_s(fs1to4_2, fs5to8_2, fs9_2, fs9_1, fs5to8_1, fs1to4_1);

        // Workaround for the Brook+ execution-event caching bug: forcing a
        // completion check on one output stream per iteration stops the
        // runtime from accumulating unretired events -- the source of the
        // growing memory use and progressive slowdown reported above.
        fs1to4_1.error();

        if (step%100 == 0 || step < 100)
        {
            // BUGFIX(perf): read p/u/v back only when we actually check
            // convergence. Previously these streamWrites ran every iteration,
            // stalling the pipeline with a synchronous GPU->host copy. The
            // loop can only exit immediately after a check, so p/u/v are
            // still current when the function returns.
            streamWrite(ps_2, p);
            streamWrite(vs_2, v);
            streamWrite(us_2, u);

            // Relative L2 norms against the previous check's snapshot.
            // BUGFIX: the denominators used to be accumulated into
            // error1/error2 without a reset, so each check's denominator
            // included the prior check's sqrt result. den1/den2 now restart
            // from zero like Norm1/Norm2.
            float den1 = 0.0f, den2 = 0.0f;
            Norm1 = 0.0f;
            Norm2 = 0.0f;
            for (x = 1; x <= mx; x++)
            {
                for (y = 1; y <= my; y++)
                {
                    int idx = x + gx*y;           // hoist the common index
                    float du = u[idx] - u1[idx];
                    float dv = v[idx] - v1[idx];
                    float dp = p1[idx] - p[idx];
                    Norm1 += du*du + dv*dv;
                    den1  += u[idx]*u[idx] + v[idx]*v[idx];
                    Norm2 += dp*dp;
                    den2  += p[idx]*p[idx];
                }
            }
            // NOTE(review): a field that is identically zero gives a zero
            // denominator here; the original code had the same hazard.
            error1 = sqrt(Norm1/den1);
            error2 = sqrt(Norm2/den2);
            printf("error1=%e error2=%e m=%d\n\n",error1, error2,step);

            // Snapshot current fields for the next convergence check.
            for (x = 1; x <= mx; x++)
            {
                for (y = 1; y <= my; y++)
                {
                    u1[x+gx*y] = u[x+gx*y];
                    v1[x+gx*y] = v[x+gx*y];
                    p1[x+gx*y] = p[x+gx*y];
                }
            }
        }
        step++;
    } // end of while (error1 > ERR)
}

2. Also, the performance is VERY slow compared to a VERY similar GPU solution using CUDA. My question here is this: Will AMD/ATI be releasing a profiler for their SDK? The "Shader Analyzer" is very limited and simply not that user friendly/readable.

3. The "copy..." streams are needed since there is no bidirectional streams allowed.

Here is my code:

// Host driver for the Brook+ lattice-Boltzmann solver (this is the quoted
// duplicate of the listing earlier in the thread). Uploads all host arrays
// into GPU streams, then iterates the collide/advect/stream/macro kernel
// pipeline until the relative L2 velocity norm falls below ERR.
//
// NOTE(review): three issues visible in this listing --
//   1. streamWrite(ps_2/vs_2/us_2) runs every iteration, but p/u/v are only
//      consumed inside the step%100 check; the per-iteration GPU->host
//      readback is a large, avoidable stall.
//   2. error1/error2 are never reset before the accumulation loops, so each
//      convergence check folds the previous check's sqrt result into the
//      denominator of the norm.
//   3. r is declared int[] on the host but is read into the float stream rs
//      -- confirm the int->float conversion is intended.
void mcollid(float4 F1to4[], float4 F5to8[], float4 F9[], float4 f1to4[], float4 f5to8[], float4 f9[], float s[], float GEO[], float G, int gx, int gy,
int e[], int r[], int mx, int my, int bk, float p[], float u[], float v[], float p1[], float u1[], float v1[], float ERR)
{
int size = gx*gy, step, x=0, y=0; float Norm1, Norm2, error1, error2;
// Device streams; "_1"/"_2" pairs ping-pong kernel input/output because
// Brook+ does not allow a stream to be both input and output of one kernel.
float4 Fs1to4_1 < size >;
float4 Fs1to4_2 < size >;
float4 Fs5to8_1 < size >;
float4 Fs5to8_2< size >;
float4 Fs9_1< size >;
float4 Fs9_2< size > ;
float4 fs1to4_1< size >;
float4 fs5to8_1< size >;
float4 fs9_1< size >;
float4 fs1to4_2< size >;
float4 fs5to8_2< size >;
float4 fs9_2< size >;
float GEOs< size >;
float ss<9>;
float rs<9>; // NOTE(review): filled from int r[] below -- see header note 3
int es< 18 >;
float ps_1< size >;
float us_1< size >;
float vs_1< size >;
float ps_2< size >;
float us_2< size >;
float vs_2< size >;
// One-time upload of all host data to the GPU.
streamRead(ps_1, p);
streamRead(us_1, u);
streamRead(vs_1, v);
streamRead(fs1to4_1, f1to4);
streamRead(fs5to8_1, f5to8);
streamRead(fs9_1, f9);
streamRead(Fs1to4_1, F1to4);
streamRead(Fs5to8_1, F5to8);
streamRead(Fs9_1, F9);
streamRead(ss, s);
streamRead(GEOs, GEO);
streamRead(rs, r);
streamRead(es, e);
step = 1;
Norm1=1.0;
Norm2=1.0;
error1=1.0; // Init L2-Norm error for velocity
error2=1.0; // Init L2-Norm error for density
while (error1> ERR)
{
// Kernel pipeline; each call ping-pongs the "_1"/"_2" stream pairs.
mcollid_s(Fs1to4_1, Fs5to8_1, Fs9_1, fs1to4_1, fs5to8_1, fs9_1, GEOs, ss, G, Fs9_2, Fs5to8_2, Fs1to4_2);
advection1_s(Fs1to4_2, Fs5to8_2, Fs9_2, GEOs, es, gx, gy, mx, my, rs, Fs9_1, Fs5to8_1, Fs1to4_1);
advection2_s(Fs1to4_1, Fs5to8_1, Fs9_1, gx, gy, mx, my, bk, Fs9_2, Fs5to8_2, Fs1to4_2);
advection3_s(Fs1to4_2, Fs5to8_2, Fs9_2, gx, gy, mx, my, bk, Fs9_1, Fs5to8_1, Fs1to4_1);
stream_s(Fs1to4_1, Fs5to8_1, Fs9_1, fs1to4_1, fs5to8_1, fs9_1, es, gx, gy, mx, my, fs9_2, fs5to8_2, fs1to4_2);
//streamWrite(Fs1to4_1, F1to4);
//streamWrite(Fs5to8_1, F5to8);
//streamWrite(Fs9_1, F9);
//streamWrite(fs9_2, f9);
//streamWrite(fs5to8_2, f5to8);
//streamWrite(fs1to4_2, f1to4);
macro_s(fs1to4_2, fs5to8_2, fs9_2, ps_1, us_1, vs_1, es, GEOs, ps_2, us_2, vs_2);
// Copy kernels feed this iteration's outputs back as next iteration's inputs.
copypuv_s(ps_2, us_2, vs_2, ps_1, us_1, vs_1);
// NOTE(review): these three readbacks run every iteration but are only
// needed inside the step%100 check below (header note 1).
streamWrite(ps_2, p);
streamWrite(vs_2, v);
streamWrite(us_2, u);
copyf_s(fs1to4_2, fs5to8_2, fs9_2, fs9_1, fs5to8_1, fs1to4_1);
if (step%100 == 0 || step < 100)
{
// Norm1/Norm2 are reset here, but error1/error2 are not (header note 2),
// so the denominators below include the previous check's result.
Norm1=0.0f;
Norm2=0.0f;
for (x=1;x<=mx;x++)
{
for (y=1;y<=my;y++)
{
Norm1+=(u[x+gx*y]-u1[x+gx*y])*(u[x+gx*y]-u1[x+gx*y])+(v[x+gx*y]-v1[x+gx*y])*(v[x+gx*y]-v1[x+gx*y]);
error1+=u[x+gx*y]*u[x+gx*y]+v[x+gx*y]*v[x+gx*y];
Norm2+=(p1[x+gx*y]-p[x+gx*y])*(p1[x+gx*y]-p[x+gx*y]);
error2+=p[x+gx*y]*p[x+gx*y];
}
}
// Relative L2 norms: change since last check over current field magnitude.
error1=sqrt(Norm1/error1);
error2=sqrt(Norm2/error2);
printf("error1=%e error2=%e m=%d\n\n",error1, error2,step);
// Snapshot current fields for the next convergence check.
for(x=1;x<=mx;x++)
{
for(y=1;y<=my;y++)
{
u1[x+gx*y]=u[x+gx*y];
v1[x+gx*y]=v[x+gx*y];
p1[x+gx*y]=p[x+gx*y];
}
}
}
step++;
} //end of while (error1>ERR)
}

1) If you have a standalone test case that shows this leak, can you email it to streamdeveloper@amd.com with a little explanation. That way we can quickly track down the issue and get back to you on a possible solution before the next release.

2) You might want to email the shader analyzer team about this as they are the ones that would be working on those tools. Also, if you can supply them with what could be improved, I am sure they would be very grateful.

The only performance analyzing stuff that we have public right now is the shader analyzer and some equations that were given by Justin Hensley and Jason Yang during their UCF course. I know you probably aren't going to like the solution, but it involves understanding the ISA and modifying the brook+ code to produce better ISA.

For example this chunk of code, if it was kernel code:

{

Norm1+=(u[x+gx*y]-u1[x+gx*y])*(u[x+gx*y]-u1[x+gx*y])+(v[x+gx*y]-v1[x+gx*y])*(v[x+gx*y]-v1[x+gx*y]);

error1+=u[x+gx*y]*u[x+gx*y]+v[x+gx*y]*v[x+gx*y];

Norm2+=(p1[x+gx*y]-p[x+gx*y])*(p1[x+gx*y]-p[x+gx*y]);

error2+=p[x+gx*y]*p[x+gx*y];

}

Would probably perform better if written as:

{

float4 ud, u1d, vd, v1d, pd, p1d;

int idx = x + gx * y;

ud = u[idx];

u1d = u1[idx];

vd = v[idx];

v1d = v1[idx];

pd = p[idx];

p1d = p1[idx];

float4 t1, t2;

t1 = ud - u1d;

t2 = vd - v1d;

Norm1 += ((t1 * t1) + (t2 * t2));

error1 += ((ud * ud) + (vd * vd));

Norm2 += ((p1d - pd) * (p1d - pd));

error2 += (pd * pd);

}

If this was kernel code, you would have 3x fewer memory accesses.

The main problem why you need to program in this manner on the GPU is the compilers for our GPUs are relatively new compared to their CPU equivalents. The CPU compilers know how to optimize the code to reduce the number of memory reads from the p/p1/u/u1/v/v1 arrays, but our GPU compilers might currently miss this optimization. Which produces less efficient code.

Hope this helps,

1. I don't have a standalone case and I think it would take quite some time to replicate this since I would have to find the problem in order to replicate it. Isn't that the job of the testers? Are the testers not interested in something that is broken unless I can simplify and find the reason for them?

2. I will let them know, thanks.

3. I understand that the Brook+ compiler is very new and that's fine. Your solution is exactly what I plan on doing. I imagine a good deal of this slowdown can be attributed to the memory leak. I just thought that the unoptimized GPU solution would still get some improvement over the CPU solution when compared to the unoptimized CUDA solution, which is essentially the same code here (the kernels are also very similar, although CUDA isn't limited by the stream model). The unoptimized CUDA solution produced speedups of 100x, while this WORKING Brook+ version is at least twice as slow as the CPU version. These are very interesting and quite different results, considering how similar the two implementations are.

To resolve the slow-down issue, call Stream::error method on any one output stream after each kernel invocation. Something like-

kernelCall(input, output);

output.error();

You can write something in ping-pong fashion. Or you can set environment variable BRT_PERMIT_READ_WRITE_ALIASING=1. It allows you to use the same stream as input and output, but it can produce undefined results in case your input/output streams are gather/scatter.

Thanks.

1. The "error" and "errorLog" functions produce nothing. There is no "error" in the stream, since the "if (output.error())" never takes (ie. is never true).

2. I did write it in ping-pong fashion no? The problem is that the ping-pong only works without a copy across iterations where the input for the next loop is the output from the last loop if you are doing it twice:

func1(a, b, c, out1, out2, out3)

func2(out1, out2, out3, a, b, c)

In this scenario when you iterate over these two functions there is no problem, BUT if you don't:

func1(a, b, c, out1, out2, out3)

Then the output from the last loop is not going to be the input to the next loop unless you have an additional kernel (more unneeded code):

func1(a, b, c, out1, out2, out3)

copy(out1, out2, out3, a, b, c)

Correct? Please let me know if this is not true and I am doing something wrong because I see no other way around it.

The BRT_PERMIT_READ_WRITE_ALIASING flag provides incorrect results for my solution and it was suggested by many that this not be used so I didn't use it.

Using stream.error() is just to overcome slowdown issue. Please see this-

http://forums.amd.com/forum/messageview.cfm?catid=328&threadid=105386&highlight_key=y

There is a bug in caching implementation for execution events. There is no overhead of this function. It just makes sure the kernel execution is complete. This problem will be resolved in next version.

Gaurav,

Thank you. I assume we can look forward to the next version ~ 2 months?

This is keeping with the "every 3 months" time frame mentioned before, yes?

This is just speculation since I don't have the ISA to see exactly what is causing the performance degradation, but if the code is written like the original part of my example, then the shaders are bandwidth bound and I would not expect them to outperform the CPU.

Micah

Micah,

Thanks again, maybe I can use the GSA to find this out. I will give this a try, thanks.