I see
// Output stream for scatter is always 1D and float4
unsigned int streamSizeScatter[] = {height, width};
brook::Stream<float> outputStream(2, streamSizeScatter);
Isn't that 2D? Why does the comment say the output stream has to be 1D and float4?
It looks like this comment is wrong. These limitations existed before the Brook+ 1.3 release.
By the way, I found that the sample produces incorrect results when width != height.
I tried to correct it, but had no luck.
Here is my code:
/**
 * Scatter-transpose sample driver.
 *
 * Fills a width x height float buffer, runs the scatterTransposeGPU kernel on
 * it, recomputes the transpose on the CPU, prints both results and reports
 * "Passed." or "Failed." according to verify().
 *
 * @param argc  unused
 * @param argv  unused
 * @return 0 in all cases (the pass/fail verdict is only printed).
 */
int
main(int argc, char* argv[])
{
    // Specifying the width and height of the 2D buffer
    const unsigned int width = 10;
    const unsigned int height = 5;
    const unsigned int elementCount = width * height;

    //--------------------------------------------------------------------------
    // Creating and initializing the input buffer
    //--------------------------------------------------------------------------

    // Creating an input buffer
    float* inputBuffer = new float[elementCount];

    // Initializing the input buffer such that
    // input(i,j) = i*width + j
    fillBuffer(inputBuffer, width, height);

    // Printing input buffer
    fprintf(stdout, "Input buffer:\n");
    printBuffer(inputBuffer, width, 0, 0, width, height);

    //--------------------------------------------------------------------------
    // Creating the input stream and copying data from input buffer
    //--------------------------------------------------------------------------

    // Specifying the size of the 2D stream
    unsigned int streamSize[] = {width, height};

    // Specifying the rank of the stream
    unsigned int rank = 2;

    // Create a 2D stream of floating-point values with the size given above
    brook::Stream<float> inputStream(rank, streamSize);

    // Copying data from input buffer to input stream
    inputStream.read(inputBuffer);

    //--------------------------------------------------------------------------
    // Creating the output stream
    //--------------------------------------------------------------------------

    // The output stream is a 2D float stream whose dimensions are the input
    // stream's dimensions swapped.
    unsigned int streamSizeScatter[] = {height, width};
    brook::Stream<float> outputStream(2, streamSizeScatter);

    //--------------------------------------------------------------------------
    // Executing kernel and copying back data
    //--------------------------------------------------------------------------

    // NOTE(review): the domain below is (height, width) while the input stream
    // was created as {width, height}. Runs of this program with
    // width != height have been reported to transpose only a height x height
    // block correctly - confirm against the Brook+ documentation whether the
    // domain should instead be (width, height), or whether this is a runtime
    // issue.
    scatterTransposeGPU.domainOffset(uint4(0,0,0,0));
    scatterTransposeGPU.domainSize(uint4(height,width,1,1));

    // Calling the kernel on the input and output streams
    scatterTransposeGPU(inputStream, outputStream);

    // Creating the output buffers. Both are zero-initialized so that a failed
    // stream write never leads to printing indeterminate memory.
    float* outputBuffer = new float[elementCount]();
    float* cpuOutputBuffer = new float[elementCount]();

    // Copying data from output stream to output buffer
    outputStream.write(outputBuffer);

    // Check error on stream
    if(outputStream.error())
    {
        // Print error Log associated to stream
        fprintf(stdout, "%s\n", outputStream.errorLog());
    }

    // Printing GPU Result (once; it was previously printed twice)
    fprintf(stdout, "Transpose:GPU Result\n");
    printBuffer(outputBuffer, height, 0, 0, height, width);

    // Creating CPU Result: element (i,j) of the input becomes element (j,i)
    // of a buffer whose row length is `height`.
    for(unsigned int i = 0; i < height; i++)
    {
        for(unsigned int j = 0; j < width; j++)
        {
            cpuOutputBuffer[j*height + i] = inputBuffer[i*width + j];
        }
    }

    // Printing CPU Result
    fprintf(stdout, "Transpose:CPU Result\n");
    printBuffer(cpuOutputBuffer, height, 0, 0, height, width);

    //--------------------------------------------------------------------------
    // Checking whether the result is correct or not
    //--------------------------------------------------------------------------
    if(!verify(cpuOutputBuffer, outputBuffer, height, width))
    {
        fprintf(stdout, "Failed.\n");
    }
    else
    {
        fprintf(stdout, "Passed.\n");
    }

    //--------------------------------------------------------------------------
    // Cleaning up
    //--------------------------------------------------------------------------
    delete[] inputBuffer;
    delete[] outputBuffer;
    delete[] cpuOutputBuffer;

    return 0;
}
The output I get is:
C:\Program Files\Brook+_1.4.1_beta\samples\bin\CPP\xp_x86_32>scatter_stream_kernel.exe
Input buffer:
0 1 2 3 4 5 6 7 8 9
10 11 12 13 14 15 16 17 18 19
20 21 22 23 24 25 26 27 28 29
30 31 32 33 34 35 36 37 38 39
40 41 42 43 44 45 46 47 48 49
Transpose:GPU Result
0 10 20 30 40
1 11 21 31 41
2 12 22 32 42
3 13 23 33 43
4 14 24 34 44
4 24 44 64 84
5 25 45 65 85
6 26 46 66 86
7 27 47 67 87
8 28 48 68 88
0 10 20 30 40
1 11 21 31 41
2 12 22 32 42
3 13 23 33 43
4 14 24 34 44
4 24 44 64 84
5 25 45 65 85
6 26 46 66 86
7 27 47 67 87
8 28 48 68 88
Transpose:CPU Result
0 10 20 30 40
1 11 21 31 41
2 12 22 32 42
3 13 23 33 43
4 14 24 34 44
5 15 25 35 45
6 16 26 36 46
7 17 27 37 47
8 18 28 38 48
9 19 29 39 49
Failed.
What is wrong here?
I am able to reproduce this issue. I am sure this is a recent issue and it used to work with Catalyst 9.2. I will file the bug in SF.
It used to work with Catalyst 9.2?
I reverted to 9.2 and the result is still the same:
Input buffer:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
Transpose:GPU Result
0 16 32 48 64 80 96 112
1 17 33 49 65 81 97 113
2 18 34 50 66 82 98 114
3 19 35 51 67 83 99 115
4 20 36 52 68 84 100 116
5 21 37 53 69 85 101 117
6 22 38 54 70 86 102 118
7 23 39 55 71 87 103 119
0 -2 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 64 4 68 128 192 132 196
2048 2112 2052 2116 2176 2240 2180 2244
2056 2120 2060 2124 2184 2248 2188 2252
8 72 12 76 136 200 140 204
1040 1104 1044 1108 1168 1232 1172 1236
3088 3152 3092 3156 3216 3280 3220 3284
Transpose:CPU Result
0 16 32 48 64 80 96 112
1 17 33 49 65 81 97 113
2 18 34 50 66 82 98 114
3 19 35 51 67 83 99 115
4 20 36 52 68 84 100 116
5 21 37 53 69 85 101 117
6 22 38 54 70 86 102 118
7 23 39 55 71 87 103 119
8 24 40 56 72 88 104 120
9 25 41 57 73 89 105 121
10 26 42 58 74 90 106 122
11 27 43 59 75 91 107 123
12 28 44 60 76 92 108 124
13 29 45 61 77 93 109 125
14 30 46 62 78 94 110 126
15 31 47 63 79 95 111 127