cancel
Showing results for 
Search instead for 
Did you mean: 

Archives Discussions

riza_guntur
Journeyman III

Wrong scatter tutorial in samples

I see

// Output stream for scatter is always 1D and float4
    unsigned int streamSizeScatter[] = {height, width};
    brook::Stream<float> outputStream(2, streamSizeScatter);

Isn't that 2D? Why does the comment say it has to be 1D and float4?

0 Likes
4 Replies
gaurav_garg
Adept I

It looks like this comment is wrong. These limitations existed before the Brook+ 1.3 release.

0 Likes

By the way, I found that the sample produces incorrect results when width != height.

I tried to correct it, but had no luck.

Here is my code:

// Scatter-transpose sample driver.
//
// Fills a width x height float buffer, copies it into a rank-2 Brook+ stream,
// runs the scatterTransposeGPU kernel into a second rank-2 stream whose
// dimensions are swapped, reads the result back, and compares it against a
// transpose computed on the CPU. Prints both results and "Passed." or
// "Failed." to stdout.
//
// argc/argv are not used. Always returns 0, regardless of the verification
// outcome or of any stream error.
int
main(int argc, char* argv[])
{
    // Specifying the width and height of the 2D buffer
    // (deliberately non-square here: width != height)
    const unsigned int width = 10;
    const unsigned int height = 5;

    //--------------------------------------------------------------------------
    // Creating and initializing the input buffer
    //--------------------------------------------------------------------------

    // Creating an input buffer holding width * height floats
    float* inputBuffer = new float[width * height];

    // Initializing the input buffer such that
    // input(i,j) = i*width + j
    // (fillBuffer is defined elsewhere; the formula is the sample's stated intent)
    fillBuffer(inputBuffer, width, height);

    // Printing input buffer
    fprintf(stdout, "Input buffer:\n");
    printBuffer(inputBuffer, width, 0, 0, width, height);

    //--------------------------------------------------------------------------
    // Creating the input stream and copying data from input buffer
    //--------------------------------------------------------------------------

    // Specifying the size of the 2D stream: {width, height}
    unsigned int streamSize[] = {width, height};

    // Specifying the rank of the stream
    unsigned int rank = 2;

    // Create a 2D stream of floating-point values with the dimensions given
    // in streamSize (width x height)
    brook::Stream<float> inputStream(rank, streamSize);

    // Copying data from input buffer to input stream
    inputStream.read(inputBuffer);

    //--------------------------------------------------------------------------
    // Creating the output stream
    //--------------------------------------------------------------------------

    // Output stream for the scatter: a rank-2 float stream whose dimensions
    // are swapped relative to the input ({height, width}).
    // NOTE(review): this comment used to say the scatter output "is always 1D
    // and float4", which contradicts the code below; that restriction
    // reportedly applied only before the Brook+ 1.3 release.
    unsigned int streamSizeScatter[] = {height, width};
    brook::Stream<float> outputStream(2, streamSizeScatter);

    //--------------------------------------------------------------------------
    // Executing kernel and copying back data
    //--------------------------------------------------------------------------

    // Calling the kernel on the input and output streams.
    // The domain offset/size are set on the kernel object before the call;
    // presumably they restrict execution to a height x width region starting
    // at the origin -- TODO confirm the expected component order against the
    // Brook+ documentation.
    scatterTransposeGPU.domainOffset(uint4(0,0,0,0));
    scatterTransposeGPU.domainSize(uint4(height,width,1,1));
    scatterTransposeGPU(inputStream, outputStream);

    // Creating an output buffer for the GPU result and a zero-initialized
    // buffer for the CPU reference result
    float* outputBuffer = new float[width * height];
    float* cpuOutputBuffer = new float[width * height];
    memset(cpuOutputBuffer, 0, width * height * sizeof(float));

    // Copying data from output stream to output buffer
    outputStream.write(outputBuffer);

    // Check error on stream.
    // An error is only logged: execution continues, and the error affects
    // neither the Passed/Failed report nor the return value.
    if(outputStream.error())
    {
        // Print error Log associated to stream
        fprintf(stdout, "%s\n", outputStream.errorLog());
    }

    fprintf(stdout, "Transpose:GPU Result\n");
    // NOTE(review): the two calls below are identical, so the GPU result is
    // printed twice -- likely unintentional.
    printBuffer(outputBuffer, height, 0, 0, height, width);
    printBuffer(outputBuffer, height, 0, 0, height, width);

    // creating CPU Result:
    // input element (row i, column j), stored row-major with row length
    // `width`, is written to (row j, column i) of a buffer with row length
    // `height`, i.e. the transpose.
    for(unsigned int i = 0; i < height; i++)
    {
        for(unsigned int j = 0; j < width; j++)
        {
            cpuOutputBuffer[j*height + i] = inputBuffer[i*width + j];
        }
    }

    // Printing CPU Result
    fprintf(stdout, "Transpose:CPU Result\n");
    printBuffer(cpuOutputBuffer, height, 0, 0, height, width);

    //--------------------------------------------------------------------------
    // Checking whether the result is correct or not
    //--------------------------------------------------------------------------

    // verify is defined elsewhere; a false return is reported as "Failed."
    if(!verify(cpuOutputBuffer, outputBuffer, height, width))
    {  
        fprintf(stdout, "Failed.\n");
    }
    else
    {       
        fprintf(stdout, "Passed.\n");
    }

    //--------------------------------------------------------------------------
    // Cleaning up
    //--------------------------------------------------------------------------

    // Release the three heap buffers allocated with new[] above
    delete[] inputBuffer;
    delete[] outputBuffer;
    delete[] cpuOutputBuffer;
    return 0;
}

The output I get is:

C:\Program Files\Brook+_1.4.1_beta\samples\bin\CPP\xp_x86_32>scatter_stream_kernel.exe
Input buffer:
  0       1       2       3       4       5       6       7       8       9
 10      11      12      13      14      15      16      17      18      19
 20      21      22      23      24      25      26      27      28      29
 30      31      32      33      34      35      36      37      38      39
 40      41      42      43      44      45      46      47      48      49

Transpose:GPU Result
  0      10      20      30      40
  1      11      21      31      41
  2      12      22      32      42
  3      13      23      33      43
  4      14      24      34      44
  4      24      44      64      84
  5      25      45      65      85
  6      26      46      66      86
  7      27      47      67      87
  8      28      48      68      88

  0      10      20      30      40
  1      11      21      31      41
  2      12      22      32      42
  3      13      23      33      43
  4      14      24      34      44
  4      24      44      64      84
  5      25      45      65      85
  6      26      46      66      86
  7      27      47      67      87
  8      28      48      68      88

Transpose:CPU Result
  0      10      20      30      40
  1      11      21      31      41
  2      12      22      32      42
  3      13      23      33      43
  4      14      24      34      44
  5      15      25      35      45
  6      16      26      36      46
  7      17      27      37      47
  8      18      28      38      48
  9      19      29      39      49

Failed.

What is wrong here?

0 Likes

I am able to reproduce this issue. I am sure this is a recent issue and it used to work with Catalyst 9.2. I will file the bug in SF.

0 Likes

It used to work with Cat 9.2?

I reverted back to 9.2 and the result is still the same.

Input buffer:
  0       1       2       3       4       5       6       7       8       9      10      11      12      13      14      15
 16      17      18      19      20      21      22      23      24      25      26      27      28      29      30      31
 32      33      34      35      36      37      38      39      40      41      42      43      44      45      46      47
 48      49      50      51      52      53      54      55      56      57      58      59      60      61      62      63
 64      65      66      67      68      69      70      71      72      73      74      75      76      77      78      79
 80      81      82      83      84      85      86      87      88      89      90      91      92      93      94      95
 96      97      98      99     100     101     102     103     104     105     106     107     108     109     110     111
112     113     114     115     116     117     118     119     120     121     122     123     124     125     126     127

Transpose:GPU Result
  0      16      32      48      64      80      96     112
  1      17      33      49      65      81      97     113
  2      18      34      50      66      82      98     114
  3      19      35      51      67      83      99     115
  4      20      36      52      68      84     100     116
  5      21      37      53      69      85     101     117
  6      22      38      54      70      86     102     118
  7      23      39      55      71      87     103     119
  0      -2       0       0       0       0       0       0
  0       0       0       0       0       0       0       0
  0      64       4      68     128     192     132     196
2048    2112    2052    2116    2176    2240    2180    2244
2056    2120    2060    2124    2184    2248    2188    2252
  8      72      12      76     136     200     140     204
1040    1104    1044    1108    1168    1232    1172    1236
3088    3152    3092    3156    3216    3280    3220    3284

Transpose:CPU Result
  0      16      32      48      64      80      96     112
  1      17      33      49      65      81      97     113
  2      18      34      50      66      82      98     114
  3      19      35      51      67      83      99     115
  4      20      36      52      68      84     100     116
  5      21      37      53      69      85     101     117
  6      22      38      54      70      86     102     118
  7      23      39      55      71      87     103     119
  8      24      40      56      72      88     104     120
  9      25      41      57      73      89     105     121
 10      26      42      58      74      90     106     122
 11      27      43      59      75      91     107     123
 12      28      44      60      76      92     108     124
 13      29      45      61      77      93     109     125
 14      30      46      62      78      94     110     126
 15      31      47      63      79      95     111     127

0 Likes