So let me better illuminate the situation, oddly the kernel seems to do NOTHING (produces the same results if I eliminate the equivalent CPU code and get results), even if I get rid of all the "if" statements.
here is some CPU code:
for(y=bk;y<=my-bk+1;y++)
{
F1to4[0+gx*y].w = F1to4[(mx-1)+gx*y].w;
F1to4[0+gx*y].x = F1to4[(mx-1)+gx*y].x;
F1to4[0+gx*y].y = F1to4[(mx-1)+gx*y].y;
F1to4[0+gx*y].z = F1to4[(mx-1)+gx*y].z;
F5to8[0+gx*y].w = F5to8[(mx-1)+gx*y].w;
F5to8[0+gx*y].x = F5to8[(mx-1)+gx*y].x;
F5to8[0+gx*y].y = F5to8[(mx-1)+gx*y].y;
F5to8[0+gx*y].z = F5to8[(mx-1)+gx*y].z;
F9[0+gx*y].w = F9[(mx-1)+gx*y].w;
F1to4[(mx+1)+gx*y].w = F1to4[2+gx*y].w;
F1to4[(mx+1)+gx*y].x = F1to4[2+gx*y].x;
F1to4[(mx+1)+gx*y].y = F1to4[2+gx*y].y;
F1to4[(mx+1)+gx*y].z = F1to4[2+gx*y].z;
F5to8[(mx+1)+gx*y].w = F5to8[2+gx*y].w;
F5to8[(mx+1)+gx*y].x = F5to8[2+gx*y].x;
F5to8[(mx+1)+gx*y].y = F5to8[2+gx*y].y;
F5to8[(mx+1)+gx*y].z = F5to8[2+gx*y].z;
F9[(mx+1)+gx*y].w = F9[2+gx*y].w;
}
NOW, I am trying to do the same thing in Brook+ kernel:
kernel void advection2_s(float4 Fin1to4[], float4 Fin5to8[], float4 Fin9[], int gx, int gy,
int mx, int my, int bk, out float4 Fs9<>, out float4 Fs5to8<>, out float4 Fs1to4<>)
{
int x, y, idx;
idx = indexof(Fs1to4);
x = (int)fmod((float)idx, (float)gx);
y = (int)floor((float)idx/(float)gx);
if ((y > (bk-1)) && (y <= (my-bk+1)))
{
if (idx == gx*y)
{
Fs1to4 = Fin1to4[(mx-1)+gx*y];
//Fs1to4.x = Fin1to4[(mx-1)+gx*y].x;
//Fs1to4.y = Fin1to4[(mx-1)+gx*y].y;
//Fs1to4.z = Fin1to4[(mx-1)+gx*y].z;
Fs5to8 = Fin5to8[(mx-1)+gx*y];
//Fs5to8.x = Fin5to8[(mx-1)+gx*y].x;
//Fs5to8.y = Fin5to8[(mx-1)+gx*y].y;
//Fs5to8.z = Fin5to8[(mx-1)+gx*y].z;
Fs9.w = Fin9[(mx-1)+gx*y].w;
}
if (idx == (mx+1)+gx*y)
{
Fs1to4 = Fin1to4[2+gx*y];
//Fs1to4.x = Fin1to4[2+gx*y].x;
//Fs1to4.y = Fin1to4[2+gx*y].y;
//Fs1to4.z = Fin1to4[2+gx*y].z;
Fs5to8 = Fin5to8[2+gx*y];
//Fs5to8.x = Fin5to8[2+gx*y].x;
//Fs5to8.y = Fin5to8[2+gx*y].y;
//Fs5to8.z = Fin5to8[2+gx*y].z;
Fs9.w = Fin9[2+gx*y].w;
}
}
}
and the kernel call:
advection2_s(Fs1to4, Fs5to8, Fs9, gx, gy, mx, my, bk, Fs9, Fs5to8, Fs1to4);
All streams are the same size.