kcin
Adept I

AMD vs NVIDIA performance

First, I must say this isn't meant as a holy-war post.

I am a newbie with AMD GPUs (I previously worked with NVIDIA), so I need help understanding an unexpected lack of performance.

I need to implement on AMD GPUs a simple kernel which searches for template images in a base image. I was able to test the kernel on an NVIDIA GT 240, an AMD HD 6670 and an AMD HD 5850. The results were discouraging:

GT 240 - 100 ms,

HD 6670 - 200 ms,

HD 5850 - 69 ms.

It seems strange to me, as both AMD GPUs are more powerful than the GT 240. It looks like I am not getting the best out of the AMD cards.

Below is the kernel I'm talking about. It looks for 6 equally sized template images in the base one. Every pixel of the base image is considered a possible origin of one of the templates and is processed by a separate thread. The kernel compares the R, G, B values of each pixel of the base image and of the template image, and if the difference for at least one color exceeds the diff parameter, the corresponding pixel is counted as nonmatched. If the number of nonmatched pixels is less than maxNonmatchQt, the corresponding template is considered a hit.

__constant int tOffset = 8196; // one template size in memory (in bytes)
__kernel void matchImage6( __global unsigned char* image, // pointer to the base image
                           int imgWidth, // base image width
                           int imgHeight, // base image height
                           int imgPitch, // base image pitch (in bytes)
                           int imgBpp, // base image bytes per pixel
                           __constant unsigned char* templates, // pointer to the array of templates
                           int tWidth, // templates width (the same for all)
                           int tHeight, // templates height (the same for all)
                           int tPitch, // templates pitch (in bytes, the same for all)
                           int tBpp, // templates bytes per pixel (the same for all)
                           int diff, // max allowed difference of intensity
                           int maxNonmatchQt, // max number of nonmatched pixels
                           __global int* result // results
                           ) {
     int x0 = (int)get_global_id(0);
     int y0 = (int)get_global_id(1);
     if( x0 + tWidth > imgWidth || y0 + tHeight > imgHeight)
          return;
     int nonmatchQt[] = {0, 0, 0, 0, 0, 0};
     for( int y = 0; y < tHeight; y++) {
          int ind = y * tPitch;
          int baseImgInd = (y0 + y) * imgPitch + x0 * imgBpp;
          for( int x = 0; x < tWidth; x++) {
               unsigned char c0 = image[baseImgInd];
               unsigned char c1 = image[baseImgInd + 1];
               unsigned char c2 = image[baseImgInd + 2];
               for( int i = 0; i < 6; i++)
                    if( abs( c0 - templates[i * tOffset + ind]) > diff ||
                        abs( c1 - templates[i * tOffset + ind + 1]) > diff ||
                        abs( c2 - templates[i * tOffset + ind + 2]) > diff)
                         nonmatchQt[i]++;
               ind += tBpp;
               baseImgInd += imgBpp;
          }
          if( nonmatchQt[0] > maxNonmatchQt && nonmatchQt[1] > maxNonmatchQt &&
              nonmatchQt[2] > maxNonmatchQt && nonmatchQt[3] > maxNonmatchQt &&
              nonmatchQt[4] > maxNonmatchQt && nonmatchQt[5] > maxNonmatchQt)
               return;
     }
     for( int i = 0; i < 6; i++)
          if( nonmatchQt[i] < maxNonmatchQt) {
               unsigned int pos = atom_inc( &result[0]) * 3;
               result[pos + 1] = i;
               result[pos + 2] = x0;
               result[pos + 3] = y0;
          }
}

Kernel run configuration: Global work size = (1900, 1200), Local work size = (32, 8) for AMD and (32, 16) for NVIDIA.

Any remarks about my code are also highly appreciated.

15 Replies
MicahVillmow
Staff

Re: AMD vs NVIDIA performance

A couple of quick things:

1) Don't use 8-bit types; you are wasting memory bandwidth. Load at least an integer at a time. This is especially true with constant memory, which loads in 16-byte chunks on dynamic indexing.

2) Merge your sequential loads into a single load of a larger type.

3) Don't use global atomics on the same pointer that takes most of your memory writes; this forces all writes to that pointer down the slow path.

4) If you only increment atomics, use the atomic counter extension on AMD platforms; it is faster than global atomics.

5) Use vectors over scalars in calculations where possible.

6) Check your ISA to see if scratch memory is used because of nonMatchQt; if so, don't use private arrays (nonMatchQt), use a local array instead.

7) Check the ISA to see if loops are unrolled; if not, use #pragma unroll 6 to force unrolling of the inner loops.

kcin
Adept I

Re: AMD vs NVIDIA performance

Hello, I changed my kernel to:

__kernel void matchImage6( __global unsigned char* image,
                           int imgWidth,
                           int imgHeight,
                           int imgPitch,
                           int imgBpp,
                           __constant unsigned char* templates,
                           int tWidth,
                           int tHeight,
                           int tPitch,
                           int tBpp,
                           int diff,
                           int maxNonmatchQt,
                           __global int* result
                           ) {
     int x0 = (int)get_global_id(0);
     int y0 = (int)get_global_id(1);
     if( x0 + tWidth > imgWidth || y0 + tHeight > imgHeight)
          return;
     int nonmatchQt[] = {0, 0, 0, 0, 0, 0};
     for( int y = 0; y < tHeight; y++) {
          int ind = y * tPitch;
          int baseImgInd = ((y0 + y) * imgPitch + x0 * imgBpp) >> 2;
          for( int x = 0; x < tWidth; x++) {
               uchar4 imgPix = vload4( baseImgInd, image);
               #pragma unroll 6
               for( int i = 0; i < 6; i++) {
                    uchar4 templPix = vload4( (i * tOffset + ind) >> 2, templates);
                    if( abs( imgPix.x - templPix.x) > diff ||
                        abs( imgPix.y - templPix.y) > diff ||
                        abs( imgPix.z - templPix.z) > diff)
                         nonmatchQt[i]++;
               }
               ind += tBpp;
               baseImgInd++;
          }
          if( nonmatchQt[0] > maxNonmatchQt && nonmatchQt[1] > maxNonmatchQt &&
              nonmatchQt[2] > maxNonmatchQt && nonmatchQt[3] > maxNonmatchQt &&
              nonmatchQt[4] > maxNonmatchQt && nonmatchQt[5] > maxNonmatchQt)
               return;
     }
     for( int i = 0; i < 6; i++)
          if( nonmatchQt[i] < maxNonmatchQt)
               if( result[0] < MAX_RESULT_BYTES - 2) {
                    int pos = atom_inc( &result[0]) * 3;
                    result[pos + 1] = i;
                    result[pos + 2] = x0;
                    result[pos + 3] = y0;
               }
}

However, it only became slower (73 ms instead of 69), as it required one more VGPR to store the vectors. Scratch memory is not used by the kernel, and atomic operations are not the issue since only a few threads reach them. The code change that gives a really big improvement is using vector operations in the innermost loop:

uchar4 res = imgPix - templPix;
if( res.x > diff || res.y > diff || res.z > diff)
     nonmatchQt[i]++;

The execution time was almost halved (to 45 ms). But I don't know how to efficiently implement the abs(uchar - uchar) operation for vectors, which is necessary here, so this change seems impossible. It looks like my problem can't be vectorized effectively.

tzachi_cohen
Staff

Re: AMD vs NVIDIA performance

If you'd like to vectorize abs(uchar - uchar), you can write:

uchar4 c = max(a, b);
uchar4 d = min(a, b);
uchar4 e = c - d;

Tzachi Cohen, AMD

--------------------------------

The information presented in this document is for informational purposes only and may contain technical inaccuracies, omissions and typographical errors. Links to third party sites are for convenience only, and no endorsement is implied.

kcin
Adept I

Re: AMD vs NVIDIA performance

It works! Now 50 ms instead of 73.

MicahVillmow
Staff

Re: AMD vs NVIDIA performance

kcin,

Try abs_diff(a, b), which reproduces abs(a - b); see section 6.12.3 of the OpenCL spec.

For atomics, it doesn't matter how many threads reach them: atomic operations on a pointer cause all memory accesses through that pointer to go down the slow path. It is a compile-time decision, not a runtime decision. If you use atomic counters, or perform the atomic operation on a different pointer, your memory writes should be faster. However, if your program is not memory bound this might not make any difference in performance, but it is something to keep in mind.

This code:

if( abs( imgPix.x - templPix.x) > diff ||
    abs( imgPix.y - templPix.y) > diff ||
    abs( imgPix.z - templPix.z) > diff)
     nonmatchQt[i]++;

Might be written faster this way:

nonmatchQt[i] += (uint)(any(abs_diff(imgPix, templPix).xyz > (uchar3)(diff)));

kcin
Adept I

Re: AMD vs NVIDIA performance

Thank you, Micah, for the useful information. I tried abs_diff(a, b). Surprisingly, it was slower than max(a, b) - min(a, b), at least in my case. I will try your other suggestion later.

Another thing I can't understand is why the offset passed to the vloadn() function is multiplied by n inside the function. That seems strange for the char datatype, which should not need to be aligned.

MicahVillmow
Staff

Re: AMD vs NVIDIA performance

kcin,

It is weird that it is slower... abs_diff(a, b) is implemented as max(a, b) - min(a, b), and in a small test comparing the generated code for the two, abs_diff(a, b) on a uchar4 data type used 4 fewer instructions. What this might be doing is causing other parts of your code to become a bottleneck (i.e. your memory writes using the slow path), slowing down the entire program.

Skysnake
Adept II

Re: AMD vs NVIDIA performance

Ok,

I hope I can help you optimize your code.

As far as I can see, you do not use shared (local) memory. This very often has a strong performance impact. You also do a lot of atomic accesses, which is another good reason to use shared memory: each workgroup can then do more work without an atomic access to global memory.

To see whether you are memory bound, you should sum up the loads/writes to global memory and compare them with the runtime. I think you should be memory-bandwidth limited, because as far as I can see you more or less do not reuse the data you load from global memory.

So you can do the following.

Load a stripe of data into shared memory and then access it from there, saving a lot of global memory bandwidth.

Next, please do the atomic accesses in shared memory first as well, and then write only one value back to global memory. This should also give you a big speedup.

I think there are a lot of possibilities to speed up your program by a factor of 10 or something like that.

kcin
Adept I

Re: AMD vs NVIDIA performance

Thank you for your answer.

The APP SDK Profiler shows that I'm strongly limited by ALU operations. That seems right, because removing the innermost if statement drastically improves performance. Also, according to the Profiler, memory bandwidth utilization is high enough, I believe because the many workgroups hide latency well. So I'm not sure shared memory would help.

I tried deleting the last for loop. The time change was really small. Additionally, only a few threads reach that point, so it isn't a bottleneck, as far as I can see.
