licoah

How to optimize the kernel with Brook+

Discussion created by licoah on Nov 5, 2009
Latest reply on Nov 8, 2009 by eduardoschardong

I have optimized this kernel, but the performance is still not very good.

Are there any special tricks in Brook+ that I have not used for this kernel?

// kernel_brook1: each kernel instance accumulates one complex scalar product
// between a run of nChapSize weights from WsI and a run of nChapSize samples
// from dataIn, and writes the float2 sum to dataOut.
//
// float2 values are combined as complex numbers: .x is used as the real part
// and .y as the imaginary part (see the res.x / res.y updates in the loop).
//
// Parameters actually read by the body:
//   nCha, gSize, nCol, halbpSize, firstToSkip, oWidth, nChapSize, SkipLines,
//   dataIn, WsI, dataOut.
// Parameters accepted but never read here (kept so callers stay unchanged):
//   pSize, AF, nPatLin, nLines, iWidth, wWidth.
//
// NOTE(review): the loop below always consumes four samples per iteration, so
// it reads past nChapSize elements when nChapSize is not a multiple of 4 --
// presumably callers guarantee a multiple of 4 (or padded data); confirm.
kernel void
kernel_brook1(int nCha, int pSize, int gSize, int AF, int nPatLin, int nCol, int nLines, int halbpSize,  int firstToSkip, int oWidth, int iWidth, int wWidth, int nChapSize, int SkipLines, float2 dataIn[][], float2 WsI[][], out float2 dataOut<>){

    float2 res = float2(0.0f,0.0f);
    int2 pos = instance().xy;
    float2 w1,w2,w3,w4,x1,x2,x3,x4;

    // Fold every 4 consecutive instance rows into one logical row Y whose
    // logical column X runs over 4 * oWidth entries.
    int Y = pos.y / 4;
    int X = pos.y%4*oWidth + pos.x;//(pos.y - Y * 4)*oWidth + pos.x;

    // Split Y into (cntG, cntAF) with cntAF = Y mod gSize, and X into
    // (cntCha, cntP) with cntP = X mod nCol.
    int cntG = Y / gSize;
    int cntAF = Y - gSize * cntG;
    int cntCha = X / nCol;
    int cntP = X%nCol; //X - cntCha*nCol;
    int dataN = nChapSize; // number of source samples
    int Widx, Inputidx;
    int k = 0;

    // Start index into the weights row: one run of nChapSize entries per
    // (cntCha, cntAF) pair, laid out as nChapSize * (gSize * cntCha + cntAF).
    Widx = nChapSize *gSize * cntCha + nChapSize * cntAF;

    // Start index into the input row: indices at or beyond firstToSkip are
    // shifted forward by SkipLines before the offset is computed.
    if(cntG >= firstToSkip)cntG = cntG + SkipLines;
    Inputidx = nCha * (cntG - halbpSize + 1);


    // Scalar product, manually unrolled by four: fetch four weights and four
    // samples from row cntP, then accumulate their complex products.
    while(k < dataN){
        w1 = WsI[cntP][Widx];
        Widx += 1;
        w2 = WsI[cntP][Widx];
        Widx += 1;
        w3 = WsI[cntP][Widx];
        Widx += 1;
        w4 = WsI[cntP][Widx];
        Widx += 1;
        x1 = dataIn[cntP][Inputidx];
        Inputidx += 1;
        x2 = dataIn[cntP][Inputidx];
        Inputidx += 1;
        x3 = dataIn[cntP][Inputidx];
        Inputidx += 1;
        x4 = dataIn[cntP][Inputidx];
        Inputidx += 1;
        // Imaginary part: sum of w.y * x.x + w.x * x.y over the four pairs.
        res.y += w1.y * x1.x + w1.x * x1.y + w2.y * x2.x + w2.x * x2.y + w3.y * x3.x + w3.x * x3.y + w4.y * x4.x + w4.x * x4.y;
        // Real part: sum of w.x * x.x - w.y * x.y over the four pairs.
        res.x += w1.x * x1.x - w1.y * x1.y + w2.x * x2.x - w2.y * x2.y + w3.x * x3.x - w3.y * x3.y + w4.x * x4.x - w4.y * x4.y;
        k += 4;
    }

    dataOut =  res;

}

Outcomes