Hi,
i've compared 2 versions of a simple 2D convolution. First the "unoptimized" without const and __global isntead of __constant and the second one with const.
The "optimized" has nearly 2x ALUInsts, but with small filter sizes 3-31 its faster and with a filtersize of 101 its slower. I don't understand this behaviour
The output data size is 1024x1024.
The device is a HD 5850.
Optimized (Size 29)
Method | ExecutionOrder | GlobalWorkSize | GroupWorkSize | Time | LDSSize | DataTransferSize | GPRs | ScratchRegs | FCStacks | Wavefronts | ALUInsts | FetchInsts | WriteInsts | LDSFetchInsts | LDSWriteInsts | ALUBusy | ALUFetchRatio | ALUPacking | FetchSize | CacheHit | FetchUnitBusy | FetchUnitStalled | WriteUnitStalled | FastPath | CompletePath | PathUtilization | ALUStalledByLDS | LDSBankConflict |
WriteBuffer | 105 | 2,8396 | 4323,06 | |||||||||||||||||||||||||
WriteBuffer | 106 | 0,19504 | 3,29 | |||||||||||||||||||||||||
convolve2D_0558A308 | 107 | { 1024 1024 1} | { 256 1 1} | 101,06767 | 0 | 14 | 0 | 3 | 16384 | 6918 | 1682 | 1 | 0 | 0 | 56,08 | 4,11 | 37,21 | 2123265,56 | 0,31 | 30,36 | 0 | 0 | 4095 | 0 | 100 | 0 | 0 | |
convolve2D_0558A308 | 108 | { 1024 1024 1} | { 256 1 1} | 101,04606 | 0 | 14 | 0 | 3 | 16384 | 6918 | 1682 | 1 | 0 | 0 | 56,06 | 4,11 | 37,21 | 2123265,56 | 0,3 | 30,35 | 0 | 0 | 4095 | 0 | 100 | 0 | 0 | |
convolve2D_0558A308 | 109 | { 1024 1024 1} | { 256 1 1} | 101,09112 | 0 | 14 | 0 | 3 | 16384 | 6918 | 1682 | 1 | 0 | 0 | 56,06 | 4,11 | 37,21 | 2123265,56 | 0,31 | 30,35 | 0 | 0 | 4095 | 0 | 100 | 0 | 0 | |
convolve2D_0558A308 | 110 | { 1024 1024 1} | { 256 1 1} | 101,09242 | 0 | 14 | 0 | 3 | 16384 | 6918 | 1682 | 1 | 0 | 0 | 56,07 | 4,11 | 37,21 | 2123265,56 | 0,31 | 30,35 | 0 | 0 | 4095 | 0 | 100 | 0 | 0 | |
convolve2D_0558A308 | 111 | { 1024 1024 1} | { 256 1 1} | 101,06491 | 0 | 14 | 0 | 3 | 16384 | 6918 | 1682 | 1 | 0 | 0 | 56,06 | 4,11 | 37,21 | 2123265,56 | 0,3 | 30,35 | 0 | 0 | 4095 | 0 | 100 | 0 | 0 | |
ReadBuffer | 112 | 2,50243 | 4096 |
Unoptimized (Size 29)
Method | ExecutionOrder | GlobalWorkSize | GroupWorkSize | Time | LDSSize | DataTransferSize | GPRs | ScratchRegs | FCStacks | Wavefronts | ALUInsts | FetchInsts | WriteInsts | LDSFetchInsts | LDSWriteInsts | ALUBusy | ALUFetchRatio | ALUPacking | FetchSize | CacheHit | FetchUnitBusy | FetchUnitStalled | WriteUnitStalled | FastPath | CompletePath | PathUtilization | ALUStalledByLDS | LDSBankConflict |
WriteBuffer | 105 | 2,82946 | 4323,06 | |||||||||||||||||||||||||
WriteBuffer | 106 | 0,17178 | 3,29 | |||||||||||||||||||||||||
convolve2D_0549A690 | 107 | { 1024 1024 1} | { 256 1 1} | 120,4313 | 0 | 13 | 0 | 3 | 16384 | 3555 | 1682 | 1 | 0 | 0 | 23,05 | 2,11 | 34,57 | 2525696 | 0 | 32,23 | 7,96 | 0 | 4095 | 0 | 100 | 0 | 0 | |
convolve2D_0549A690 | 108 | { 1024 1024 1} | { 256 1 1} | 127,28908 | 0 | 13 | 0 | 3 | 16384 | 3555 | 1682 | 1 | 0 | 0 | 23,35 | 2,11 | 34,57 | 2525696 | 0 | 32,22 | 7,62 | 0 | 4095 | 0 | 100 | 0 | 0 | |
convolve2D_0549A690 | 109 | { 1024 1024 1} | { 256 1 1} | 127,61811 | 0 | 13 | 0 | 3 | 16384 | 3555 | 1682 | 1 | 0 | 0 | 23,45 | 2,11 | 34,57 | 2525696 | 0 | 32,14 | 7,44 | 0 | 4095 | 0 | 100 | 0 | 0 | |
convolve2D_0549A690 | 110 | { 1024 1024 1} | { 256 1 1} | 125,39515 | 0 | 13 | 0 | 3 | 16384 | 3555 | 1682 | 1 | 0 | 0 | 22,68 | 2,11 | 34,57 | 2525696 | 0 | 32,05 | 8,15 | 0 | 4095 | 0 | 100 | 0 | 0 | |
convolve2D_0549A690 | 111 | { 1024 1024 1} | { 256 1 1} | 125,41467 | 0 | 13 | 0 | 3 | 16384 | 3555 | 1682 | 1 | 0 | 0 | 23,29 | 2,11 | 34,57 | 2525696 | 0 | 32,17 | 7,63 | 0 | 4095 | 0 | 100 | 0 | 0 | |
ReadBuffer | 112 | 3,31465 | 4096 |
Optimized (Size 101)
Method | ExecutionOrder | GlobalWorkSize | GroupWorkSize | Time | LDSSize | DataTransferSize | GPRs | ScratchRegs | FCStacks | Wavefronts | ALUInsts | FetchInsts | WriteInsts | LDSFetchInsts | LDSWriteInsts | ALUBusy | ALUFetchRatio | ALUPacking | FetchSize | CacheHit | FetchUnitBusy | FetchUnitStalled | WriteUnitStalled | FastPath | CompletePath | PathUtilization | ALUStalledByLDS | LDSBankConflict |
WriteBuffer | 1 | 4,46239 | 4935,06 | |||||||||||||||||||||||||
WriteBuffer | 2 | 0,7072 | 39,85 | |||||||||||||||||||||||||
convolve2D_00F1A808 | 3 | { 1024 1024 1} | { 256 1 1} | 1221,3376 | 0 | 14 | 0 | 3 | 16384 | 82230 | 20402 | 1 | 0 | 0 | 55,13 | 4,03 | 37,42 | 25778451,9 | 0,79 | 30,45 | 0 | 0 | 4095 | 0 | 100 | 0 | 0 | |
convolve2D_00F1A808 | 4 | { 1024 1024 1} | { 256 1 1} | 1221,22536 | 0 | 14 | 0 | 3 | 16384 | 82230 | 20402 | 1 | 0 | 0 | 55,19 | 4,03 | 37,42 | 25778451,9 | 0,84 | 30,49 | 0 | 0 | 4095 | 0 | 100 | 0 | 0 | |
convolve2D_00F1A808 | 5 | { 1024 1024 1} | { 256 1 1} | 1221,74639 | 0 | 14 | 0 | 3 | 16384 | 82230 | 20402 | 1 | 0 | 0 | 55,17 | 4,03 | 37,42 | 25778451,9 | 0,83 | 30,48 | 0 | 0 | 4095 | 0 | 100 | 0 | 0 | |
convolve2D_00F1A808 | 6 | { 1024 1024 1} | { 256 1 1} | 1220,52834 | 0 | 14 | 0 | 3 | 16384 | 82230 | 20402 | 1 | 0 | 0 | 55,18 | 4,03 | 37,42 | 25778451,9 | 0,85 | 30,48 | 0 | 0 | 4095 | 0 | 100 | 0 | 0 | |
convolve2D_00F1A808 | 7 | { 1024 1024 1} | { 256 1 1} | 1220,82929 | 0 | 14 | 0 | 3 | 16384 | 82230 | 20402 | 1 | 0 | 0 | 55,17 | 4,03 | 37,42 | 25778451,9 | 0,85 | 30,48 | 0 | 0 | 4095 | 0 | 100 | 0 | 0 | |
ReadBuffer | 8 | 1,75211 | 4096 |
Unoptimized (Size 101)
Method | ExecutionOrder | GlobalWorkSize | GroupWorkSize | Time | LDSSize | DataTransferSize | GPRs | ScratchRegs | FCStacks | Wavefronts | ALUInsts | FetchInsts | WriteInsts | LDSFetchInsts | LDSWriteInsts | ALUBusy | ALUFetchRatio | ALUPacking | FetchSize | CacheHit | FetchUnitBusy | FetchUnitStalled | WriteUnitStalled | FastPath | CompletePath | PathUtilization | ALUStalledByLDS | LDSBankConflict |
WriteBuffer | 1 | 4,28947 | 4935,06 | |||||||||||||||||||||||||
WriteBuffer | 2 | 2,48555 | 39,85 | |||||||||||||||||||||||||
convolve2D_00F4A810 | 3 | { 1024 1024 1} | { 256 1 1} | 1020,59145 | 0 | 13 | 0 | 3 | 16384 | 41427 | 20402 | 1 | 0 | 0 | 32,17 | 2,03 | 34,88 | 30981376 | 0 | 37,66 | 2,39 | 0 | 4095 | 0 | 100 | 0 | 0 | |
convolve2D_00F4A810 | 4 | { 1024 1024 1} | { 256 1 1} | 993,80199 | 0 | 13 | 0 | 3 | 16384 | 41427 | 20402 | 1 | 0 | 0 | 33,41 | 2,03 | 34,88 | 30981376 | 0 | 38,47 | 1,83 | 0 | 4095 | 0 | 100 | 0 | 0 | |
convolve2D_00F4A810 | 5 | { 1024 1024 1} | { 256 1 1} | 976,17333 | 0 | 13 | 0 | 3 | 16384 | 41427 | 20402 | 1 | 0 | 0 | 34,27 | 2,03 | 34,88 | 30981376 | 0 | 39,08 | 1,51 | 0 | 4095 | 0 | 100 | 0 | 0 | |
convolve2D_00F4A810 | 6 | { 1024 1024 1} | { 256 1 1} | 998,36419 | 0 | 13 | 0 | 3 | 16384 | 41427 | 20402 | 1 | 0 | 0 | 32,24 | 2,03 | 34,88 | 30981376 | 0 | 37,83 | 2,48 | 0 | 4095 | 0 | 100 | 0 | 0 | |
convolve2D_00F4A810 | 7 | { 1024 1024 1} | { 256 1 1} | 1038,83055 | 0 | 13 | 0 | 3 | 16384 | 41427 | 20402 | 1 | 0 | 0 | 33,57 | 2,03 | 34,88 | 30981376 | 0 | 38,45 | 1,64 | 0 | 4095 | 0 | 100 | 0 | 0 | |
ReadBuffer | 8 | 1,99185 | 4096 |
thanks,
rgds
__kernel void convolve2D( __global float *src, __global float *filter, int filterWidth, int imageWidth, __global float *dest){ int destWidth = get_global_size(0), xOut = get_global_id(0), yOut = get_global_id(1); float sum = 0; for (int y = 0; y < filterWidth; ++y){ int idxFY = y*filterWidth; int idxInY = (yOut + y) * imageWidth + xOut; for (int x = 0; x < filterWidth; ++x){ int idxF = idxFY + x; int idxIn = idxInY + x; sum += filter[idxF]*src[idxIn]; } } int idxOut = yOut * destWidth + xOut; dest[idxOut] = sum; } // "optimized" __kernel void convolve2D(const __global float *src, __constant float *filter, const int filterWidth, const int imageWidth, __global float *dest){ const int destWidth = get_global_size(0), xOut = get_global_id(0), yOut = get_global_id(1); float sum = 0; for (int y = 0; y < filterWidth; ++y){ const int idxFY = y*filterWidth; const int idxInY = (yOut + y) * imageWidth + xOut; for (int x = 0; x < filterWidth; ++x){ const int idxF = idxFY + x; const int idxIn = idxInY + x; sum += filter[idxF]*src[idxIn]; } } const int idxOut = yOut * destWidth + xOut; dest[idxOut] = sum; }
ahh ok, but why are there more alu-instructions, if i make the index variables const?