Fr4nz

Performance inconsistencies when testing various bit-counting methods

Discussion created by Fr4nz on Mar 18, 2010
Latest reply on Mar 20, 2010 by Fr4nz

Yesterday I spent part of my day testing various bit-counting methods on my ATI 5770, and I'm running into strange performance inconsistencies: bit-counting methods that should be faster are actually slower. I'm posting here to understand whether these inconsistencies are due to hardware limits, poor kernel design, or inefficiencies/bugs in the current ATI OpenCL implementation.


All the kernels reported below as reference do the same simple things: they maintain a uint4 variable, call it tmpIntersect, that is continuously updated by a series of AND operations. Then, every time this variable is updated, they:

1) apply a bit-counting method to count the "1" bits contained in tmpIntersect;
2) update the partial sum, tmpCOUNTS, with the result obtained in 1).


I've tested three bit-counting methods (leaving out the MIT HAKMEM method for the moment), all taken from http://tinyurl.com/6p5rm9 and adapted for the GPU and for vectorization; a scalar C sketch of each method follows this list:

1) Naive-vectorized method (in theory the slowest one): a FOR loop that executes 32 iterations of three operations on every tmpIntersect component: 1) an AND with a mask, to check whether the least significant bit is "1"; 2) a SUM that adds the result of the AND to the partial sum; and, finally, 3) a RIGHT SHIFT.
All these operations can be performed on vectorized variables, so we get parallelism at the operation level;

2) Usage of a precomputed 8-bit lookup table (in theory faster than the naive-vectorized method): on the host side we precompute a table with 256 entries, each entry containing the number of "1" bits of its index, interpreted as the numerical value of an 8-bit unsigned word. This table is loaded into constant memory as an uchar array.
In the kernel, when we want to count the "1" bits in tmpIntersect, for every component we mask off the upper 24 bits (keeping the least significant byte), look that byte up in the table, and update the partial sum; we then repeat this procedure three more times, each time shifting the word 8 more bits to the right. In total we have 10 operations per component. This method too can be completely vectorized, as you can see in the second kernel below;

3) Usage of a precomputed 16-bit lookup table (in theory the fastest method): the idea is the same as above, but we can process larger chunks (16 bits instead of 8) of every word, reducing the total number of instructions used (5 operations per component). This method too is completely vectorizable.
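
A scalar C sketch of method 1, for reference; the first kernel below applies these same three operations to all four uint4 lanes at once:

    /* Naive per-bit count for one 32-bit word. */
    unsigned int popcount_naive(unsigned int v) {
        unsigned int count = 0;
        for (int i = 0; i < 32; i++) { /* one iteration per bit */
            count += v & 1u;           /* AND with mask: is the LSB "1"? */
            v >>= 1;                   /* RIGHT SHIFT exposes the next bit */
        }
        return count;
    }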

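A scalar sketch of method 2. The host-side table fill shown here is an assumption (my host code isn't posted); it uses the standard recurrence popcount(i) = (i & 1) + popcount(i >> 1):

    /* 256-entry table: entry i holds the number of "1" bits of i. */
    unsigned char lookupTable8[256];

    void buildTable8(void) {
        lookupTable8[0] = 0;
        for (int i = 1; i < 256; i++)
            lookupTable8[i] = (i & 1) + lookupTable8[i >> 1];
    }

    /* Count the bits of one 32-bit word, one byte at a time. */
    unsigned int popcount_table8(unsigned int v) {
        return lookupTable8[v & 0xFFu]
             + lookupTable8[(v >> 8) & 0xFFu]
             + lookupTable8[(v >> 16) & 0xFFu]
             + lookupTable8[v >> 24]; /* top byte needs no mask */
    }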

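Method 3 is the same idea with 16-bit chunks, so each 32-bit word needs only two lookups; again, the 64 KB host-side fill is a hypothetical sketch:

    /* 65536-entry table: entry i holds the number of "1" bits of i. */
    unsigned char lookupTable16[65536];

    void buildTable16(void) {
        lookupTable16[0] = 0;
        for (int i = 1; i < 65536; i++)
            lookupTable16[i] = (i & 1) + lookupTable16[i >> 1];
    }

    /* Count the bits of one 32-bit word, one halfword at a time. */
    unsigned int popcount_table16(unsigned int v) {
        return lookupTable16[v & 0xFFFFu]  /* low 16 bits */
             + lookupTable16[v >> 16];     /* high 16 bits, no mask needed */
    }
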
Okay, the problem is that method 2) is only slightly faster than 1), and method 3) is slower not only than 2) but even than 1), which is quite nonsensical!
I'm starting to think there are some inefficiencies in the current OpenCL implementation, but I'd like to know from the AMD staff why method 2) performs worse than I expected and, above all, why method 3), which should be the fastest one, is actually the slowest.

Thanks in advance for any answers!

PS: My specs => Ubuntu 9.04 64-bit, ATI 5770, Catalyst 10.2 and the ATI Stream SDK 2.01 OpenCL implementation.

/**
 * FIRST KERNEL: using naive-vectorized bit counting method
 **/
__kernel void intersect(__global uint4 *inputVD,
                        __global uint *inputPrefixCk,
                        __local uint4 *tmpBuffer,
                        __local uint *tmpCandItems,
                        __global uint *outputCounts,
                        const int k,
                        const int numItems,
                        const int dimRowVec4,
                        const int numIntersections)
{
    // Work-group properties;
    const int gid = get_global_id(0);
    const int lid = get_local_id(0);
    const int lsize = get_local_size(0);

    // Mask used during bit-counting;
    const uint4 MASK = 1;

    // Counting variable;
    uint4 tmpCOUNTS = 0;

    // Temporary variable used for computing the AND-intersections;
    uint4 tmpIntersect;

    // Load of the items of the itemset given to this work-item;
    int init = (gid < numIntersections) ? 0 : k;
    for( ; init < k; init++)
        tmpCandItems[lid + init*lsize] = inputPrefixCk[gid*k + init];

    // Beginning of current slice processing
    for(int i = 0; i < dimRowVec4; i++) {

        // Load of the current slice in local memory
        init = (lid < numItems) ? lid : numItems;
        for( ; init < numItems; init += lsize)
            tmpBuffer[init] = inputVD[init*dimRowVec4 + i];

        barrier(CLK_LOCAL_MEM_FENCE);

        if(gid < numIntersections) {
            // First AND-intersection;
            tmpIntersect = tmpBuffer[tmpCandItems[lid + (k-1)*lsize]] &
                           tmpBuffer[tmpCandItems[lid + (k-2)*lsize]];

            // Successive AND-intersections;
            for(init = (k-3); init >= 0; init--)
                tmpIntersect &= tmpBuffer[tmpCandItems[lid + init*lsize]];

            // Naive-vectorized for-loop bit-count method
            for(init = 0; init < 32; init++) {
                tmpCOUNTS += (tmpIntersect & MASK);
                tmpIntersect = tmpIntersect >> MASK;
            }
        }
    } // End of current slice processing

    // Write final result
    if(gid < numIntersections)
        outputCounts[gid] = tmpCOUNTS.x + tmpCOUNTS.y + tmpCOUNTS.z + tmpCOUNTS.w;
}

/**
 * SECOND KERNEL: using 8-bit precomputed lookup table
 **/
__kernel void intersect(__global uint4 *inputVD,
                        __global uint *inputPrefixCk,
                        __local uint4 *tmpBuffer,
                        __local uint *tmpCandItems,
                        __global uint *outputCounts,
                        __constant uchar lookupTable[256],
                        const int k,
                        const int numItems,
                        const int dimRowVec4,
                        const int numIntersections)
{
    // Work-group properties;
    const int gid = get_global_id(0);
    const int lid = get_local_id(0);
    const int lsize = get_local_size(0);

    // Mask used during bit-counting;
    const uint4 MASK = 255;

    // Variable containing the partial sum;
    uint4 tmpCOUNTS = 0;

    // Temporary variable used for computing the AND-intersections;
    uint4 tmpIntersect;

    // Dummy variable holding the bytes currently being looked up;
    uint4 dummy;

    // Variable used when retrieving values from the lookup table;
    uint4 lookupValues;

    // Load of the items of the itemset given to this work-item;
    int init = (gid < numIntersections) ? 0 : k;
    for( ; init < k; init++)
        tmpCandItems[lid + init*lsize] = inputPrefixCk[gid*k + init];

    // Beginning of current slice processing
    for(int i = 0; i < dimRowVec4; i++) {

        // Load of the current slice in local memory
        init = (lid < numItems) ? lid : numItems;
        for( ; init < numItems; init += lsize)
            tmpBuffer[init] = inputVD[init*dimRowVec4 + i];

        barrier(CLK_LOCAL_MEM_FENCE);

        if(gid < numIntersections) {
            // First AND-intersection;
            tmpIntersect = tmpBuffer[tmpCandItems[lid + (k-1)*lsize]] &
                           tmpBuffer[tmpCandItems[lid + (k-2)*lsize]];

            // Successive AND-intersections;
            for(init = (k-3); init >= 0; init--)
                tmpIntersect &= tmpBuffer[tmpCandItems[lid + init*lsize]];

            // Beginning of bit-counting: using precomputed 8-bit lookup table;
            dummy = tmpIntersect & MASK;
            lookupValues = (uint4)(lookupTable[dummy.x], lookupTable[dummy.y],
                                   lookupTable[dummy.z], lookupTable[dummy.w]);
            tmpCOUNTS += lookupValues;

            dummy = (tmpIntersect >> (uint4)8);
            dummy &= MASK;
            lookupValues = (uint4)(lookupTable[dummy.x], lookupTable[dummy.y],
                                   lookupTable[dummy.z], lookupTable[dummy.w]);
            tmpCOUNTS += lookupValues;

            dummy = (tmpIntersect >> (uint4)16);
            dummy &= MASK;
            lookupValues = (uint4)(lookupTable[dummy.x], lookupTable[dummy.y],
                                   lookupTable[dummy.z], lookupTable[dummy.w]);
            tmpCOUNTS += lookupValues;

            dummy = (tmpIntersect >> (uint4)24);
            dummy &= MASK;
            lookupValues = (uint4)(lookupTable[dummy.x], lookupTable[dummy.y],
                                   lookupTable[dummy.z], lookupTable[dummy.w]);
            tmpCOUNTS += lookupValues;
            // END of bit-counting
        }
    } // End of current slice processing

    // Write final result
    if(gid < numIntersections)
        outputCounts[gid] = tmpCOUNTS.x + tmpCOUNTS.y + tmpCOUNTS.z + tmpCOUNTS.w;
}

/**
 * THIRD KERNEL: using 16-bit precomputed lookup table
 **/
__kernel void intersect(__global uint4 *inputVD,
                        __global uint *inputPrefixCk,
                        __local uint4 *tmpBuffer,
                        __local uint *tmpCandItems,
                        __global uint *outputCounts,
                        __constant uchar lookupTable[65536],
                        const int k,
                        const int numItems,
                        const int dimRowVec4,
                        const int numIntersections)
{
    // Work-group properties;
    const int gid = get_global_id(0);
    const int lid = get_local_id(0);
    const int lsize = get_local_size(0);

    // Mask used during bit-counting;
    const uint4 MASK = 65535;

    // Variable containing the partial sum;
    uint4 tmpCOUNTS = 0;

    // Temporary variable used for computing the AND-intersections;
    uint4 tmpIntersect;

    // Dummy variable holding the halfwords currently being looked up;
    uint4 dummy;

    // Variable used when retrieving values from the lookup table;
    uint4 lookupValues;

    // Load of the items of the itemset given to this work-item;
    int init = (gid < numIntersections) ? 0 : k;
    for( ; init < k; init++)
        tmpCandItems[lid + init*lsize] = inputPrefixCk[gid*k + init];

    // Beginning of current slice processing
    for(int i = 0; i < dimRowVec4; i++) {

        // Load of the current slice in local memory
        init = (lid < numItems) ? lid : numItems;
        for( ; init < numItems; init += lsize)
            tmpBuffer[init] = inputVD[init*dimRowVec4 + i];

        barrier(CLK_LOCAL_MEM_FENCE);

        if(gid < numIntersections) {
            // First AND-intersection;
            tmpIntersect = tmpBuffer[tmpCandItems[lid + (k-1)*lsize]] &
                           tmpBuffer[tmpCandItems[lid + (k-2)*lsize]];

            // Successive AND-intersections;
            for(init = (k-3); init >= 0; init--)
                tmpIntersect &= tmpBuffer[tmpCandItems[lid + init*lsize]];

            // Beginning of bit-counting: using precomputed 16-bit lookup table;
            dummy = tmpIntersect & MASK;
            lookupValues = (uint4)(lookupTable[dummy.x], lookupTable[dummy.y],
                                   lookupTable[dummy.z], lookupTable[dummy.w]);
            tmpCOUNTS += lookupValues;

            dummy = (tmpIntersect >> (uint4)16);
            dummy &= MASK;
            lookupValues = (uint4)(lookupTable[dummy.x], lookupTable[dummy.y],
                                   lookupTable[dummy.z], lookupTable[dummy.w]);
            tmpCOUNTS += lookupValues;
            // END of bit-counting
        }
    } // End of current slice processing

    // Write final result
    if(gid < numIntersections)
        outputCounts[gid] = tmpCOUNTS.x + tmpCOUNTS.y + tmpCOUNTS.z + tmpCOUNTS.w;
}
