25 Replies Latest reply on Apr 2, 2010 11:30 AM by noxnet

    cheapo GT120M outperforms HD5850 in prefix sum

    BarnacleJunior

      I've been wrestling with this for a long time, and my cheapo 4 SM/32 core GT 120 Mobile is outperforming my HD 5850 in a basic prefix sum in OpenCL.  I'm using the NV beta 3 GPU SDK with the 195.62 driver, and the 2.0 Stream SDK with the 9.12 hotfix driver.  I'm running Win7 x64.

      On this benchmark, the GT120M sums 275M uints/sec and the HD5850 sums 253M uints/sec.  Please don't point out bank conflicts or weird serialization things.  The 5850 should be at least 10x faster than my GT120M on the provided code.  Also, I've benchmarked parallel code in DX11 cs_5_0, and get a throughput of 3200M uints/sec on my Radeon (and I know even that is coming in too low).  There must be something messed up with the driver or shader compiler.  I'll contribute any more code that would be useful in diagnosing these issues, as I have an application that I really need to get going.

      Thanks,

      .sean

       

      --- scan.cl

      #define NUM_THREADS (1<< NUM_LEVELS)

      // 32 channels means shift five and add
      #define BANK_ADDRESS(i) (i + (i>> 5))
      #define LOCAL_SIZE (BANK_ADDRESS(NUM_THREADS))

      void ThreadSum(uint tid, __local uint sharedSum[LOCAL_SIZE]) {
          uint tid2 = BANK_ADDRESS(tid);
          for(uint d = 0; d < NUM_LEVELS - 1; ++d) {
              barrier(CLK_LOCAL_MEM_FENCE);
              uint mask = (2<< d) - 1;
              uint offset = 1<< d;
              if(mask == (mask & tid))
                  sharedSum[tid2] += sharedSum[BANK_ADDRESS(tid - offset)];
          }
          barrier(CLK_LOCAL_MEM_FENCE);
          if(0 == tid) {
              uint ai = BANK_ADDRESS(NUM_THREADS / 2 - 1);
              uint bi = BANK_ADDRESS(NUM_THREADS - 1);
              uint at = sharedSum[ai];
              sharedSum[ai] += sharedSum[bi];
              sharedSum[bi] += at + at;
          }
          for(uint d = NUM_LEVELS - 1; d; --d) {
              barrier(CLK_LOCAL_MEM_FENCE);
              uint mask = (1<< d) - 1;
              uint offset = 1<< (d - 1);
              if(mask == (mask & tid)) {
                  uint t = sharedSum[tid2];
                  uint r = BANK_ADDRESS(tid - offset);
                  sharedSum[tid2] += sharedSum[r];
                  sharedSum[r] = t;
              }
          }
          barrier(CLK_LOCAL_MEM_FENCE);
      }

      uint4 Inclusive4Sum(uint4 vec) {
          vec.y += vec.x;
          vec.z += vec.y;
          vec.w += vec.z;
          return vec;
      }

      uint8 Inclusive8Sum(uint8 vec) {
          uint8 result;
          result.lo = Inclusive4Sum(vec.lo);
          result.hi = Inclusive4Sum(vec.hi) + result.lo.w;
          return result;
      }

      __kernel __attribute__((reqd_work_group_size(NUM_THREADS, 1, 1)))
      void PrefixSumBlock_Pass1(
          __global uint* pass1_values,
          __global uint* pass1_partialSums) {

          __local uint sharedSum[LOCAL_SIZE];
          __global uint8* pass1_values_vec8 = (__global uint8*)pass1_values;

          uint tid = get_local_id(0);
          uint gid = get_group_id(0);
          uint index = NUM_THREADS * gid + tid;

          uint8 a = pass1_values_vec8[index];
          uint8 aInc = Inclusive8Sum(a);

          uint tid2 = BANK_ADDRESS(tid);
          sharedSum[tid2] = aInc.s7;
          ThreadSum(tid, sharedSum);

          uint total = sharedSum[BANK_ADDRESS(0)];
          uint aExc = sharedSum[tid2] - total;
          uint8 sum = aInc - a + aExc;

          pass1_values_vec8[index] = sum;
          if(0 == tid) pass1_partialSums[gid] = total;
      }

      ///////////////////////////////////////////////////////////////////////////////////////////////////
      // Finalize

      __kernel __attribute__((reqd_work_group_size(NUM_THREADS, 1, 1)))
      void PrefixSumBlock_Finalize(__global uint* finalize_values) {
          __local uint sharedSum[LOCAL_SIZE];
          __global uint8* finalize_values_vec8 = (__global uint8*)finalize_values;

          uint tid = get_local_id(0);
          uint gid = get_group_id(0);
          uint index = NUM_THREADS * gid + tid;

          uint8 a = finalize_values_vec8[index];
          uint8 aInc = Inclusive8Sum(a);

          uint tid2 = BANK_ADDRESS(tid);
          sharedSum[tid2] = aInc.s7;
          ThreadSum(tid, sharedSum);

          uint total = sharedSum[BANK_ADDRESS(0)];
          uint exc = sharedSum[tid2] - total;

          finalize_values_vec8[index] = aInc - a + (uint8)exc;
      }

      ///////////////////////////////////////////////////////////////////////////////////////////////////
      // Pass 2

      __kernel __attribute__((reqd_work_group_size(NUM_THREADS, 1, 1)))
      void PrefixSumBlock_Pass2(
          __global uint* pass2_values,
          __global const uint* pass2_offsets) {

          __global uint8* pass2_values_vec8 = (__global uint8*)pass2_values;

          uint tid = get_local_id(0);
          uint gid = get_group_id(0);
          uint index = NUM_THREADS * gid + tid;

          uint8 a = pass2_values_vec8[index];
          uint partialSum = pass2_offsets[gid];
          pass2_values_vec8[index] = a + (uint8)partialSum;
      }

      -- main.cpp

      #pragma comment(lib, "opencl")
      #include <windows.h>
      #include <vector>
      #include <string>
      #include <fstream>
      #include <iostream>
      #include <cmath>
      #include <cassert>
      #define __CL_ENABLE_EXCEPTIONS
      #include <cl/cl.hpp>

      typedef unsigned int uint;
      const size_t NumLoops = 5000;
      const size_t NumElements = 1<< 20;

      template<typename T>
      bool DumpBuffer(cl::CommandQueue& queue, cl::Buffer& buffer, std::vector<T>& data) {
          size_t size;
          buffer.getInfo(CL_MEM_SIZE, &size);
          if(size % sizeof(T)) return false;
          data.resize(size / sizeof(T));
          queue.enqueueReadBuffer(buffer, true, 0, size, &data[0]);
          return true;
      }

      class PrefixSumCache {
      public:
          size_t Capacity() const { return _capacity; }
          cl::Buffer& BaseLevel() { return _levels[0]; }
      private:
          friend class CLPrefixSum;
          std::vector<cl::Buffer> _levels;
          size_t _capacity, _valuesPerGroup;
      };

      class CLPrefixSum {
      public:
          void CreatePrefixSumCache(size_t capacity, std::auto_ptr<PrefixSumCache>* ppCache);
          void Execute(cl::CommandQueue& queue, size_t numElements, PrefixSumCache* cache);
      private:
          friend void CreateCLPrefixSum(cl::Context& context, size_t valuesPerGroup,
              size_t threadsPerGroup, cl::Kernel& pass1, cl::Kernel& finalize,
              cl::Kernel& pass2, std::auto_ptr<CLPrefixSum>* ppPrefixSum);

          static void LevelCounts(size_t numElements, size_t valuesPerGroup,
              std::vector<size_t>& levelCounts);

          cl::Context _context;
          cl::Kernel _pass1, _finalize, _pass2;
          size_t _valuesPerGroup, _threadsPerGroup;
      };

      void CreateCLPrefixSum(cl::Context& context, size_t valuesPerGroup, size_t threadsPerGroup,
          cl::Kernel& pass1, cl::Kernel& finalize, cl::Kernel& pass2,
          std::auto_ptr<CLPrefixSum>* ppPrefixSum) {

          std::auto_ptr<CLPrefixSum> prefixSum(new CLPrefixSum);
          prefixSum->_context = context;
          prefixSum->_pass1 = pass1;
          prefixSum->_finalize = finalize;
          prefixSum->_pass2 = pass2;
          prefixSum->_valuesPerGroup = valuesPerGroup;
          prefixSum->_threadsPerGroup = threadsPerGroup;
          *ppPrefixSum = prefixSum;
      }

      void CLPrefixSum::LevelCounts(size_t numElements, size_t valuesPerGroup,
          std::vector<size_t>& levelCounts) {
          // write the dispatch count for each pass1/pass2 level.
          while(numElements > valuesPerGroup) {
              numElements = (numElements + valuesPerGroup - 1) / valuesPerGroup;
              levelCounts.push_back(numElements);
          }
      }

      void CLPrefixSum::CreatePrefixSumCache(size_t capacity, std::auto_ptr<PrefixSumCache>* ppCache) {
          std::auto_ptr<PrefixSumCache> cache(new PrefixSumCache);
          cache->_capacity = capacity;
          cache->_valuesPerGroup = _valuesPerGroup;

          std::vector<size_t> levelCounts;
          LevelCounts(capacity, _valuesPerGroup, levelCounts);

          capacity = ~(_valuesPerGroup - 1) & (capacity + _valuesPerGroup - 1);
          cache->_levels.push_back(cl::Buffer(_context, CL_MEM_READ_WRITE, 4 * capacity));

          for(size_t i(0); i < levelCounts.size(); ++i) {
              size_t count = levelCounts[i];
              capacity = ~(_valuesPerGroup - 1) & (count + _valuesPerGroup - 1);
              cache->_levels.push_back(cl::Buffer(_context, CL_MEM_READ_WRITE, 4 * capacity));
          }
          *ppCache = cache;
      }

      void CLPrefixSum::Execute(cl::CommandQueue& queue, size_t numElements, PrefixSumCache* cache) {
          assert(cache->_valuesPerGroup == _valuesPerGroup);

          std::vector<size_t> levelCounts;
          LevelCounts(numElements, _valuesPerGroup, levelCounts);

          for(size_t level(0); level < levelCounts.size(); ++level) {
              cl::KernelFunctor f(_pass1, queue, cl::NDRange(),
                  levelCounts[level] * _threadsPerGroup, _threadsPerGroup);
              f(cache->_levels[level], cache->_levels[level + 1]);
          }

          cl::KernelFunctor f(_finalize, queue, cl::NDRange(), _threadsPerGroup, _threadsPerGroup);
          f(cache->_levels.back());

          for(int level(levelCounts.size() - 1); level >= 0; --level) {
              cl::KernelFunctor f(_pass2, queue, cl::NDRange(),
                  levelCounts[level] * _threadsPerGroup, _threadsPerGroup);
              f(cache->_levels[level], cache->_levels[level + 1]);
          }
      }

      int main(int argc, char** argv) {
          // read in the file
          std::ifstream f("scan.cl");
          std::string s(std::istreambuf_iterator<char>(f), std::istreambuf_iterator<char>(0));

          std::vector<cl::Platform> platforms;
          cl::Platform::get(&platforms);

          std::vector<cl::Device> devices;
          platforms.front().getDevices(CL_DEVICE_TYPE_GPU, &devices);

          cl_context_properties props[3] = {
              CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[0](), 0 };
          cl::Context context(CL_DEVICE_TYPE_GPU, props);
          cl::CommandQueue commandQueue(context, devices[0]);

          cl::Program program(context,
              cl::Program::Sources(1, std::make_pair(s.c_str(), s.size())));

          int err = program.build(devices, "-D NUM_LEVELS=7");
          if(err) {
              std::string buildError;
              program.getBuildInfo(devices[0], CL_PROGRAM_BUILD_LOG, &buildError);
              std::cout<< buildError;
              return 0;
          }

          cl::Kernel pass1(program, "PrefixSumBlock_Pass1", &err);
          cl::Kernel finalize(program, "PrefixSumBlock_Finalize", &err);
          cl::Kernel pass2(program, "PrefixSumBlock_Pass2", &err);

          const int NumLevels = 7;
          const int NumThreads = 1<< NumLevels;
          const int ValuesPerThread = 8;
          const int ValuesPerGroup = 8 * NumThreads;

          std::auto_ptr<CLPrefixSum> prefixSum;
          CreateCLPrefixSum(context, ValuesPerGroup, NumThreads, pass1, finalize, pass2, &prefixSum);

          std::auto_ptr<PrefixSumCache> prefixSumCache;
          prefixSum->CreatePrefixSumCache(NumElements, &prefixSumCache);

          std::vector<uint> values(NumElements, 1);
          commandQueue.enqueueWriteBuffer(prefixSumCache->BaseLevel(), true, 0,
              NumElements * 4, &values[0]);
          commandQueue.finish();

          LARGE_INTEGER freq;
          LARGE_INTEGER begin, end;
          QueryPerformanceFrequency(&freq);
          double period = 1.0 / freq.QuadPart;

          for(int j(0); j < 10; ++j) {
              QueryPerformanceCounter(&begin);
              for(int i(0); i < NumLoops; ++i)
                  prefixSum->Execute(commandQueue, NumElements, prefixSumCache.get());

              std::vector<uint> values2;
              DumpBuffer(commandQueue, prefixSumCache->BaseLevel(), values2);

              QueryPerformanceCounter(&end);
              double elapsed = period * (end.QuadPart - begin.QuadPart);
              double velocity = NumElements * (NumLoops / elapsed);
              printf("GPU velocity: %1.3fM\n", velocity / 1.0e6);
          }
      }
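      (Editor's aside, not part of the original post: with the input initialized to all 1s as above, the result is easy to verify on the CPU after a single Execute() on freshly written data.  The benchmark loop rescans its own output, so the check only applies to the first pass.  DumpBuffer, commandQueue, and prefixSumCache are the names from the listing above.)

      [code]
      // Hypothetical sanity check: for an exclusive scan of all 1s,
      // result[i] == i (an inclusive scan would give i + 1 instead).
      std::vector<uint> result;
      DumpBuffer(commandQueue, prefixSumCache->BaseLevel(), result);
      for(size_t i = 0; i < NumElements; ++i)
          if(result[i] != (uint)i) {
              printf("Mismatch at %u: got %u\n", (uint)i, result[i]);
              break;
          }
      [/code]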

        • cheapo GT120M outperforms HD5850 in prefix sum
          genaganna

           

          Originally posted by: BarnacleJunior I've been wrestling with this for a long time, and my cheapo 4 SM/32 core GT 120 Mobile is outperforming my HD 5850 in a basic prefix sum in OpenCL.  I'm using the NV beta 3 GPU SDK with the 195.62 driver, and the 2.0 Stream SDK with the 9.12 hotfix driver.  I'm running Win7 x64.

           

          On this benchmark, the GT120M sums 275M uints/sec and the HD5850 sums 253M uints/sec.  Please don't point out bank conflicts or weird serialization things.  The 5850 should be at least 10x faster than my GT120M on the provided code.  Also, I've benchmarked parallel code in DX11 cs_5_0, and get a throughput of 3200M uints/sec on my Radeon (and I know even that is coming in too low).  There must be something messed up with the driver or shader compiler.  I'll contribute any more code that would be useful in diagnosing these issues, as I have an application that I really need to get going.

           

          Thanks,

           

          .sean

          BarnacleJunior,

                      I am getting 280M uints/sec with Juniper on XP64.  I am sure you should get far more than this.

            • cheapo GT120M outperforms HD5850 in prefix sum
              BarnacleJunior

              Thanks for testing that, genaganna.  It almost seems like only one of my 18 SIMD engines is running with that set of kernels.  Your Juniper runs at a slightly higher clock rate, which could explain why your throughput is a bit better.

              To make sure it wasn't memory bandwidth, I tested an extremely simple kernel: just read from one array, increment, and write to another.  I'm getting a consistent 116 GB/s throughput, just shy of the 128 GB/s theoretical bandwidth.

              I'm including that code in case anyone else wants to test theirs.

               

              -- copy.cl

              __kernel __attribute__((reqd_work_group_size(64, 1, 1)))
              void IncAndCopy(__global const uint* source, __global uint* target) {
                  __global const uint8* source_vec8 = (__global const uint8*)source;
                  __global uint8* target_vec8 = (__global uint8*)target;

                  uint gid = get_group_id(0);
                  uint tid = get_local_id(0);
                  uint offset = 64 * gid + tid;

                  uint8 x = source_vec8[offset];
                  target_vec8[offset] = x + (uint8)1;
              }

              -- copy.cpp

              #pragma comment(lib, "opencl")
              #include <windows.h>
              #include <vector>
              #include <string>
              #include <fstream>
              #include <iostream>
              #include <cmath>
              #include <cassert>
              #define __CL_ENABLE_EXCEPTIONS
              #include <cl/cl.hpp>

              typedef unsigned int uint;
              const size_t NumLoops = 10000;
              const size_t NumElements = 1<< 24;

              template<typename T>
              bool DumpBuffer(cl::CommandQueue& queue, cl::Buffer& buffer, std::vector<T>& data) {
                  size_t size;
                  buffer.getInfo(CL_MEM_SIZE, &size);
                  if(size % sizeof(T)) return false;
                  data.resize(size / sizeof(T));
                  queue.enqueueReadBuffer(buffer, true, 0, size, &data[0]);
                  return true;
              }

              int main(int argc, char** argv) {
                  // read in the file
                  std::ifstream f("copy.cl");
                  std::string s(std::istreambuf_iterator<char>(f), std::istreambuf_iterator<char>(0));

                  std::vector<cl::Platform> platforms;
                  cl::Platform::get(&platforms);

                  std::vector<cl::Device> devices;
                  platforms.front().getDevices(CL_DEVICE_TYPE_GPU, &devices);

                  cl_context_properties props[3] = {
                      CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[0](), 0 };
                  cl::Context context(CL_DEVICE_TYPE_GPU, props);
                  cl::CommandQueue commandQueue(context, devices[0]);

                  cl::Program program(context,
                      cl::Program::Sources(1, std::make_pair(s.c_str(), s.size())));

                  int err = program.build(devices);
                  if(err) {
                      std::string buildError;
                      program.getBuildInfo(devices[0], CL_PROGRAM_BUILD_LOG, &buildError);
                      std::cout<< buildError;
                      return 0;
                  }

                  cl::Kernel k(program, "IncAndCopy", &err);

                  const size_t ValuesPerThread = 8;
                  const size_t ThreadsPerGroup = 64;
                  const size_t ValuesPerGroup = ValuesPerThread * ThreadsPerGroup;

                  std::vector<uint> values1(NumElements, 0), values2(NumElements);
                  cl::Buffer buffer1(context, CL_MEM_READ_WRITE, 4 * NumElements);
                  cl::Buffer buffer2(context, CL_MEM_READ_WRITE, 4 * NumElements);
                  commandQueue.enqueueWriteBuffer(buffer1, true, 0, NumElements * 4, &values1[0]);
                  commandQueue.finish();

                  LARGE_INTEGER freq, begin, end;
                  QueryPerformanceFrequency(&freq);
                  double period = 1.0 / freq.QuadPart;

                  for(int j(0); j < 10; ++j) {
                      QueryPerformanceCounter(&begin);
                      for(int i(0); i < NumLoops; i += 2) {
                          cl::KernelFunctor f(k, commandQueue, cl::NDRange(),
                              NumElements / ValuesPerGroup, ThreadsPerGroup);
                          f(buffer1, buffer2);
                          f(buffer2, buffer1);
                      }

                      std::vector<uint> result;
                      DumpBuffer(commandQueue, buffer1, result);

                      QueryPerformanceCounter(&end);
                      double elapsed = period * (end.QuadPart - begin.QuadPart);
                      double velocity = NumElements * (NumLoops / elapsed);
                      printf("GPU velocity: %1.3fGB/s\n", velocity * 4 / (1<< 30));
                  }
              }
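              (Editor's aside, not from the original post: the ping-pong loop above gives every element the same number of increments, so a readback should be uniform.  A check like the sketch below would also have flagged the undersized launch that the next post corrects, since elements a too-small dispatch never touches stay 0.)

              [code]
              // Hypothetical sanity check: all elements should hold the same count.
              std::vector<uint> result;
              DumpBuffer(commandQueue, buffer1, result);
              for(size_t i = 1; i < result.size(); ++i)
                  if(result[i] != result[0]) {
                      printf("Mismatch at %u: %u vs %u\n", (uint)i, result[i], result[0]);
                      break;
                  }
              [/code]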

                • cheapo GT120M outperforms HD5850 in prefix sum
                  BarnacleJunior

                  Stupid me!  For the bandwidth test I accidentally passed the number of groups, not the number of threads, as the global work size.  Since ValuesPerGroup / ValuesPerThread = 64, I was overstating the memory bandwidth by a factor of 64!

                  The kernel call should look like:

                       cl::KernelFunctor f(k, commandQueue, cl::NDRange(),
                             NumElements / ValuesPerThread, ThreadsPerGroup);
                       f(buffer1, buffer2);
                       f(buffer2, buffer1);

                  With the correct parameters it's only reporting 2 GB/s of bandwidth.  Why is it so ludicrously slow?  Anyone else willing to try?  (Also set NumLoops = 1000 and NumElements = 1<< 21, or it takes forever to run.)

                  • cheapo GT120M outperforms HD5850 in prefix sum
                    genaganna

                    BarnacleJunior,

                                  Looks like you are facing a problem with the SIMDs.

                                  Could you please run the SDK samples and post the performance results?

                                  Run BinomialOption.exe with the following options and post the result:

                                         -x 1048576 -i 10 -q -t

                      • cheapo GT120M outperforms HD5850 in prefix sum
                        BarnacleJunior

                        C:\codelibs\atistream\samples\opencl\bin\x86>BinomialOption.exe -x 1048576 -i 10 -q -t
                        Executing kernel for 10 iterations
                        -------------------------------------------
                        Option Samples           Time(sec)                KernelTime(sec)          Options/sec

                        1048576                  8.15981                  7.77975                  128505

                          • cheapo GT120M outperforms HD5850 in prefix sum
                            genaganna

                             

                            Originally posted by: BarnacleJunior
                            C:\codelibs\atistream\samples\opencl\bin\x86>BinomialOption.exe -x 1048576 -i 10 -q -t
                            Executing kernel for 10 iterations
                            -------------------------------------------
                            Option Samples           Time(sec)                KernelTime(sec)          Options/sec
                            1048576                  8.15981                  7.77975                  128505

                            It looks like your SIMDs are working fine.  This is what I am getting on Juniper:

                            Executing kernel for 10 iterations
                            -------------------------------------------
                            Option Samples           Time(sec)                KernelTime(sec)          Options/sec
                            1048576                  13.575                   11.9471                  77243

                    • cheapo GT120M outperforms HD5850 in prefix sum
                      MicahVillmow
                      BarnacleJunior,
                      What is your group size and your local size?
                      • cheapo GT120M outperforms HD5850 in prefix sum
                        MicahVillmow
                        This is what I get on an HD5870:
                        GPU velocity: 2564.680M
                        GPU velocity: 2632.006M
                        GPU velocity: 2624.121M
                        GPU velocity: 2611.988M
                        GPU velocity: 2613.841M
                        GPU velocity: 2622.924M
                        GPU velocity: 2624.601M
                        GPU velocity: 2608.794M
                        GPU velocity: 2618.155M
                        GPU velocity: 2609.728M

                        After using atomics to get rid of the flow control inside the loops I get:
                        GPU velocity: 3003.810M
                        GPU velocity: 3093.773M
                        GPU velocity: 3075.038M
                        GPU velocity: 3074.808M
                        GPU velocity: 3078.235M
                        GPU velocity: 3059.833M
                        GPU velocity: 3066.295M
                        GPU velocity: 3066.350M
                        GPU velocity: 3012.033M
                        GPU velocity: 3002.686M

                        A nice 15% improvement.

                        Once this is done, the ISA looks very nice except that you are now bound by your use of barriers.

                        #ifdef USE_ATOMIC_UPDATES
                        #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
                        #pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
                        #endif

                        #ifndef NUM_LEVELS
                        #define NUM_LEVELS 7
                        #endif

                        #define NUM_THREADS (1<< NUM_LEVELS)

                        // 32 channels means shift five and add
                        #define BANK_ADDRESS(i) (i + (i>> 5))
                        #define LOCAL_SIZE (BANK_ADDRESS(NUM_THREADS))

                        void ThreadSum(uint tid, __local uint sharedSum[LOCAL_SIZE]) {
                            uint tid2 = BANK_ADDRESS(tid);
                            for(uint d = 0; d < NUM_LEVELS - 1; ++d) {
                                barrier(CLK_LOCAL_MEM_FENCE);
                                uint mask = (2<< d) - 1;
                                uint offset = 1<< d;
                        #ifndef USE_ATOMIC_UPDATES
                                if(mask == (mask & tid))
                                    sharedSum[tid2] += sharedSum[BANK_ADDRESS(tid - offset)];
                        #else
                                bool cond = (mask == (mask & tid));
                                uint ssum = sharedSum[BANK_ADDRESS(tid - offset)];
                                uint ss2 = sharedSum[tid2];
                                uint ssp = ss2 + ssum;
                                uint val = select((uint)0, ssp, cond);
                                atom_add(sharedSum + tid2, val);
                        #endif
                            }
                            barrier(CLK_LOCAL_MEM_FENCE);
                        #ifndef USE_ATOMIC_UPDATES
                            if(0 == tid) {
                                uint ai = BANK_ADDRESS(NUM_THREADS / 2 - 1);
                                uint bi = BANK_ADDRESS(NUM_THREADS - 1);
                                uint at = sharedSum[ai];
                                sharedSum[ai] += sharedSum[bi];
                                sharedSum[bi] += at + at;
                            }
                        #else
                            uint ai = BANK_ADDRESS(NUM_THREADS / 2 - 1);
                            uint bi = BANK_ADDRESS(NUM_THREADS - 1);
                            uint at = sharedSum[ai];
                            uint bt = sharedSum[bi];   // the original post reads sharedSum[bt] here, an apparent typo for [bi]
                            uint b = (0 == tid);
                            uint atpbt = at + bt;
                            uint at2pbt = at + at + bt;
                            uint at2 = select((uint)0, atpbt, b);
                            uint bt2 = select((uint)0, at2pbt, b);
                            atom_add(sharedSum + ai, at2);
                            atom_add(sharedSum + bi, bt2);
                        #endif
                            for(uint d = NUM_LEVELS - 1; d; --d) {
                                barrier(CLK_LOCAL_MEM_FENCE);
                                uint mask = (1<< d) - 1;
                                uint offset = 1<< (d - 1);
                        #ifndef USE_ATOMIC_UPDATES
                                if(mask == (mask & tid)) {
                                    uint t = sharedSum[tid2];
                                    uint r = BANK_ADDRESS(tid - offset);
                                    sharedSum[tid2] += sharedSum[r];
                                    sharedSum[r] = t;
                                }
                        #else
                                uint t = sharedSum[tid2];
                                uint r = BANK_ADDRESS(tid - offset);
                                uint rdata = sharedSum[r];
                                uint tpr = t + rdata;
                                uint cond = (mask == (mask & tid));
                                uint ss2 = select((uint)0, tpr, cond);
                                uint sr = select((uint)0, t, cond);
                                atom_add(sharedSum + tid2, ss2);
                                atom_add(sharedSum + r, sr);
                        #endif
                            }
                            barrier(CLK_LOCAL_MEM_FENCE);
                        }

                        uint4 Inclusive4Sum(uint4 vec) {
                            vec.y += vec.x;
                            vec.z += vec.y;
                            vec.w += vec.z;
                            return vec;
                        }

                        uint8 Inclusive8Sum(uint8 vec) {
                            uint8 result;
                            result.lo = Inclusive4Sum(vec.lo);
                            result.hi = Inclusive4Sum(vec.hi) + result.lo.w;
                            return result;
                        }

                        __kernel __attribute__((reqd_work_group_size(NUM_THREADS, 1, 1)))
                        void PrefixSumBlock_Pass1(
                            __global uint* pass1_values,
                            __global uint* pass1_partialSums) {

                            __local uint sharedSum[LOCAL_SIZE];
                            __global uint8* pass1_values_vec8 = (__global uint8*)pass1_values;

                            uint tid = get_local_id(0);
                            uint gid = get_group_id(0);
                            uint index = NUM_THREADS * gid + tid;

                            uint8 a = pass1_values_vec8[index];
                            uint8 aInc = Inclusive8Sum(a);

                            uint tid2 = BANK_ADDRESS(tid);
                            sharedSum[tid2] = aInc.s7;
                            ThreadSum(tid, sharedSum);

                            uint total = sharedSum[BANK_ADDRESS(0)];
                            uint aExc = sharedSum[tid2] - total;
                            uint8 sum = aInc - a + aExc;

                            pass1_values_vec8[index] = sum;
                            if(0 == tid) pass1_partialSums[gid] = total;
                        }

                        ///////////////////////////////////////////////////////////////////////////////////////////////////
                        // Finalize

                        __kernel __attribute__((reqd_work_group_size(NUM_THREADS, 1, 1)))
                        void PrefixSumBlock_Finalize(__global uint* finalize_values) {
                            __local uint sharedSum[LOCAL_SIZE];
                            __global uint8* finalize_values_vec8 = (__global uint8*)finalize_values;

                            uint tid = get_local_id(0);
                            uint gid = get_group_id(0);
                            uint index = NUM_THREADS * gid + tid;

                            uint8 a = finalize_values_vec8[index];
                            uint8 aInc = Inclusive8Sum(a);

                            uint tid2 = BANK_ADDRESS(tid);
                            sharedSum[tid2] = aInc.s7;
                            ThreadSum(tid, sharedSum);

                            uint total = sharedSum[BANK_ADDRESS(0)];
                            uint exc = sharedSum[tid2] - total;

                            finalize_values_vec8[index] = aInc - a + (uint8)exc;
                        }

                        ///////////////////////////////////////////////////////////////////////////////////////////////////
                        // Pass 2

                        __kernel __attribute__((reqd_work_group_size(NUM_THREADS, 1, 1)))
                        void PrefixSumBlock_Pass2(
                            __global uint* pass2_values,
                            __global const uint* pass2_offsets) {

                            __global uint8* pass2_values_vec8 = (__global uint8*)pass2_values;

                            uint tid = get_local_id(0);
                            uint gid = get_group_id(0);
                            uint index = NUM_THREADS * gid + tid;

                            uint8 a = pass2_values_vec8[index];
                            uint partialSum = pass2_offsets[gid];
                            pass2_values_vec8[index] = a + (uint8)partialSum;
                        }
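                        (Editor's aside on the idiom above, with illustrative names: shared, src, dst, and cond are not from the post.  Instead of a divergent if, every work-item executes the update, and select() zeroes the addend where the predicate is false, making the atom_add a no-op for those lanes.  Distilled for the first loop, whose if-form adds only the source value:)

                        [code]
                        // if(cond) shared[dst] += shared[src];  becomes:
                        uint addend = shared[src];                 // unconditional read
                        uint val = select((uint)0, addend, cond);  // 0 where the branch was not taken
                        atom_add(shared + dst, val);               // no-op for inactive lanes
                        [/code]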

                        • cheapo GT120M outperforms HD5850 in prefix sum
                          MicahVillmow
                          After looking closer at the ISA in the case I pasted above, the problem is that the ALU packing ratio is only about 2.17 out of 5.  There are 302 ALU instructions and 139 ALU bundles, so you are only utilizing about 43% of the capacity of the chip.  This is fairly low, and you need to increase the parallelization of the kernel.  If you remove barriers, this improves overall performance by about 10% and increases utilization to 2.9 out of 5, or about 58%.  That is still pretty low, as most video games average between 3.8 and 4.1.

                          Is it possible to have ThreadSum work on 2 or 4 or 8 data points in parallel? This would help with performance.
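                          (Editor's sketch of one way to do that, not code from the thread: widen the register phase so each work-item scans two uint8 vectors before the shared-memory tree, doubling the independent ALU work per work-item; ThreadSum itself is unchanged, and the dispatch would use half as many groups.  src and dst are illustrative names for the vec8 views.)

                          [code]
                          uint8 a0 = src[2 * index];
                          uint8 a1 = src[2 * index + 1];
                          uint8 inc0 = Inclusive8Sum(a0);
                          uint8 inc1 = Inclusive8Sum(a1) + inc0.s7;  // chain the two register scans
                          sharedSum[tid2] = inc1.s7;                 // one shared slot per work-item, as before
                          ThreadSum(tid, sharedSum);
                          uint exc = sharedSum[tid2] - sharedSum[BANK_ADDRESS(0)];
                          dst[2 * index]     = inc0 - a0 + exc;      // exclusive result for the first vector
                          dst[2 * index + 1] = inc1 - a1 + exc;      // ...and the second, which carries a0's total
                          [/code]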
                            • cheapo GT120M outperforms HD5850 in prefix sum
                              BarnacleJunior

                               Thanks for running that.  There must be something wrong with the driver then, not my code; the 5850 is obviously not 10x slower than the 5870.  Using your kernel code with the intrinsics, I get:

                              GPU velocity: 260.555M
                              GPU velocity: 260.693M
                              GPU velocity: 260.320M
                              GPU velocity: 258.789M
                              GPU velocity: 259.763M
                              GPU velocity: 261.208M
                              GPU velocity: 261.393M
                              GPU velocity: 261.182M
                              GPU velocity: 261.354M
                              GPU velocity: 261.292M

                              OpenCL is reporting 18 for CL_DEVICE_MAX_COMPUTE_UNITS, and the performance in D3D11 is 3200M/sec.  Here's the version info from Catalyst Control Center:

                              Driver Packaging Version    8.681-091124a-092499C-ATI   
                              Catalyst™ Version    09.12   
                              Provider    ATI Technologies Inc.   
                              2D Driver Version    8.01.01.984   
                              2D Driver File Path    /REGISTRY/MACHINE/SYSTEM/ControlSet001/Control/CLASS/{4D36E968-E325-11CE-BFC1-08002BE10318}/0000   
                              Direct3D Version    8.14.10.0716   
                              OpenGL Version    6.14.10.9232   
                              Catalyst™ Control Center Version    2009.1214.1801.32312

                               

                              • cheapo GT120M outperforms HD5850 in prefix sum
                                BarnacleJunior

                                 So interesting!  I'm on Win7 x64.  The 32-bit builds, which I've been using until now, all perform at ~260M uints/sec.  I just tried linking against your 64-bit libraries:

                                My kernel without intrinsics:

                                GPU velocity: 3822.475M
                                GPU velocity: 3913.279M
                                GPU velocity: 3915.547M
                                GPU velocity: 3918.406M
                                GPU velocity: 3911.325M
                                GPU velocity: 3911.484M
                                GPU velocity: 3910.957M
                                GPU velocity: 3908.489M
                                GPU velocity: 3912.629M
                                GPU velocity: 3905.520M

                                 

                                Your kernel with intrinsics:

                                GPU velocity: 3826.765M
                                GPU velocity: 3919.825M
                                GPU velocity: 3918.040M
                                GPU velocity: 3917.697M
                                GPU velocity: 3916.497M
                                GPU velocity: 3914.409M
                                GPU velocity: 3911.509M
                                GPU velocity: 3911.349M
                                GPU velocity: 3913.832M
                                GPU velocity: 3912.431M

                                 This is encouraging.  I'll just use the 64-bit builds exclusively until Catalyst 9.13.

                                 

                                  • cheapo GT120M outperforms HD5850 in prefix sum
                                    BarnacleJunior

                                     For comparison purposes, here is the equivalent prefix sum in each configuration:

                                    OpenCL 32bit:

                                    GPU velocity: 260.555M
                                    GPU velocity: 260.693M
                                    GPU velocity: 260.320M
                                    GPU velocity: 258.789M
                                    GPU velocity: 259.763M
                                    GPU velocity: 261.208M
                                    GPU velocity: 261.393M
                                    GPU velocity: 261.182M
                                    GPU velocity: 261.354M
                                    GPU velocity: 261.292M

                                     

                                    OpenCL 64bit:

                                     

                                    GPU velocity: 3822.475M
                                    GPU velocity: 3913.279M
                                    GPU velocity: 3915.547M
                                    GPU velocity: 3918.406M
                                    GPU velocity: 3911.325M
                                    GPU velocity: 3911.484M
                                    GPU velocity: 3910.957M
                                    GPU velocity: 3908.489M
                                    GPU velocity: 3912.629M
                                    GPU velocity: 3905.520M

                                    D3D11 32bit:

                                    GPU velocity: 3733.582M
                                    GPU velocity: 3910.886M
                                    GPU velocity: 3910.342M
                                    GPU velocity: 3911.228M
                                    GPU velocity: 3910.527M
                                    GPU velocity: 3910.804M
                                    GPU velocity: 3911.896M
                                    GPU velocity: 3911.685M
                                    GPU velocity: 3909.983M
                                    GPU velocity: 3906.159M

                                    D3D11 64bit:

                                    GPU velocity: 3741.084M
                                    GPU velocity: 3911.670M
                                    GPU velocity: 3912.159M
                                    GPU velocity: 3912.828M
                                    GPU velocity: 3912.276M
                                    GPU velocity: 3913.058M
                                    GPU velocity: 3911.648M
                                    GPU velocity: 3912.201M
                                    GPU velocity: 3912.554M
                                    GPU velocity: 3909.024M

                                • cheapo GT120M outperforms HD5850 in prefix sum
                                  MicahVillmow
                                   I've reported the performance delta issue between the 32-bit and 64-bit DLLs, so maybe we can figure out exactly what is going wrong.  Enjoy the new year!
                                    • cheapo GT120M outperforms HD5850 in prefix sum
                                      BarnacleJunior

                                       OK, Micah.  Thanks for your help.  Have a good New Year too.

                                      • cheapo GT120M outperforms HD5850 in prefix sum
                                        apollo_maverick

                                         Weird.  Here are my results:

                                        GPU velocity: 4507.252M
                                        GPU velocity: 4802.451M
                                        GPU velocity: 4789.285M
                                        GPU velocity: 4801.221M
                                        GPU velocity: 4784.729M
                                        GPU velocity: 4785.236M
                                        GPU velocity: 4784.610M
                                        GPU velocity: 4781.841M
                                        GPU velocity: 4775.066M
                                        GPU velocity: 4797.403M

                                         Significantly faster than the results you guys posted for the same 5870 chip, and I'd like to know why.  Thanks.

                                         BTW, my system config: Windows 7 x64; i7 965 EE @ 3.6 GHz; 12 GB DDR3 @ 1600 MHz; dual SATA HDD RAID 0; HD 5870 1GB.

                                         The program was compiled targeting x64.

                                          • cheapo GT120M outperforms HD5850 in prefix sum
                                            BarnacleJunior

                                             

                                             Originally posted by: apollo_maverick
                                             Weird.  Here are my results:

                                             GPU velocity: 4507.252M GPU velocity: 4802.451M GPU velocity: 4789.285M GPU velocity: 4801.221M GPU velocity: 4784.729M GPU velocity: 4785.236M GPU velocity: 4784.610M GPU velocity: 4781.841M GPU velocity: 4775.066M GPU velocity: 4797.403M

                                             Significantly faster than the results you guys posted for the same 5870 chip, and I'd like to know why.  Thanks.

                                             BTW, my system config: Windows 7 x64; i7 965 EE @ 3.6 GHz; 12 GB DDR3 @ 1600 MHz; dual SATA HDD RAID 0; HD 5870 1GB.

                                             The program was compiled targeting x64.

                                             The numbers you just posted are consistent with the ones I had been getting in D3D11 and 64-bit OpenCL: your card is about 25% faster than mine, and so are your results.

                                             Since then, though, I've written a truly optimized prefix sum (and am nearly done with an extremely efficient radix sort) for D3D11.  It gets 6000M uints/sec on small arrays (512K-1M uints) and more than 7000M uints/sec on arrays of 4M elements.  I think the OpenCL drivers just suck.

                                             Also, the code I posted here is pretty stupid, because the first pass writes the scanned array out to the UAV.  The card is very much write-bandwidth limited.  In my current code (it's a mess of macros, so I won't paste it here yet) I only write on the second pass.

                                             Have you run any memory bandwidth tests?  My HD5850 seems to do 100 GB/s read and 42 GB/s write.  They are concurrent though, so if you read and write in the same shader, the effective write bandwidth is still 42 GB/s.
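                                             (Editor's aside, not code from the thread: a hedged sketch of how read and write rates could be isolated from each other.)

                                             [code]
                                             // Write-only: stream a broadcast constant out.
                                             __kernel void WriteOnly(__global uint8* target)
                                             {
                                                 target[get_global_id(0)] = (uint8)(1);
                                             }

                                             // Read-mostly: reduce each uint8 to one word so the loads
                                             // can't be discarded; write traffic is 1/8th of read traffic.
                                             __kernel void ReadMostly(__global const uint8* source, __global uint* sums)
                                             {
                                                 uint8 x = source[get_global_id(0)];
                                                 uint4 t = x.lo + x.hi;
                                                 sums[get_global_id(0)] = t.x + t.y + t.z + t.w;
                                             }
                                             [/code]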

                                              • cheapo GT120M outperforms HD5850 in prefix sum
                                                apollo_maverick

                                                 BarnacleJunior, I've run your memory bandwidth test code from this thread; I got a result of about 56 GB/s.

                                                  • cheapo GT120M outperforms HD5850 in prefix sum
                                                    jeff_golds

                                                     I ran the test on both 32- and 64-bit and am getting about 56 GB/s on an HD5870.  Note that since you are doing reads *and* writes, you should count the total bandwidth used.  Thus, you're actually hitting 112 GB/s.

                                                      • cheapo GT120M outperforms HD5850 in prefix sum
                                                        BarnacleJunior

                                                         

                                                         Originally posted by: jeff_golds I ran the test on both 32- and 64-bit and am getting about 56 GB/s on an HD5870.  Note that since you are doing reads *and* writes, you should count the total bandwidth used.  Thus, you're actually hitting 112 GB/s.

                                                         

                                                         The reason for the poor performance was a bug in the Catalyst driver.  As for counting bandwidth (I do most of my work in D3D11, which seems to have more reliable drivers, though far from perfect): the read bandwidth on the HD5850 is 111 GB/s and the write bandwidth is 42 GB/s, and the two don't combine into a single figure in any obvious way.  I can achieve 111 GB/s of reads even when writing a significant amount of data.  I can't say for certain, but I think the reads and writes operate more or less independently, and you are bottlenecked by the slower one.  I've managed to write a radix sort that does 329M pairs/sec and 408M uints/sec by exploiting wavefront concurrency, avoiding divergent branching, and being very parsimonious with global writes.  Reads are way, way cheaper.

                                                          • cheapo GT120M outperforms HD5850 in prefix sum
                                                            jeff_golds

                                                            Reads and writes both use bandwidth.  Thus, when you are doing both, you need to count the total bandwidth used.

                                                             [code]
                                                                 double elapsed = period * (end.QuadPart - begin.QuadPart);
                                                                 double velocity = NumElements * (NumLoops / elapsed);
                                                                 printf("GPU velocity: %1.3fGB/s\n", velocity * 4 / (1<< 30));
                                                             [/code]

                                                             You need to add a "*2" someplace in that equation to report the total bandwidth used; otherwise you are misrepresenting how much bandwidth you are using.  Maybe something like this:

                                                             [code]
                                                                 int readBytes = sizeof(cl_uint);
                                                                 int writeBytes = sizeof(cl_uint);

                                                                 double elapsed = period * (end.QuadPart - begin.QuadPart);
                                                                 double velocity = NumElements * (NumLoops / elapsed);
                                                                 printf("GPU velocity: %1.3fGB/s\n", velocity * (readBytes + writeBytes) / (1<< 30));
                                                             [/code]

                                                            You can't expect to achieve the same write bandwidth while doing reads and writes as you achieve with writes only.
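                                                             (Editor's aside, for scale: the HD5870's theoretical peak is 256 bits / 8 × 4.8 GT/s = 153.6 GB/s, so 56 GB/s each way — 112 GB/s combined — is roughly 73% of peak, a normal range for a streaming kernel.)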

                                                              • cheapo GT120M outperforms HD5850 in prefix sum
                                                                hocheng

                                                                 I'm interested in this topic too.  I tested on my machine (HD 5870 + i7 2.67 GHz quad-core CPU + Win7 64-bit OS) and got:

                                                                 GPU velocity: 4704.873M
                                                                GPU velocity: 4929.441M
                                                                GPU velocity: 4930.918M
                                                                GPU velocity: 4941.142M
                                                                GPU velocity: 4932.678M
                                                                GPU velocity: 4937.237M
                                                                GPU velocity: 4942.717M
                                                                GPU velocity: 4931.638M
                                                                GPU velocity: 4906.104M
                                                                GPU velocity: 4911.905M

                                                                 BTW, there was almost no change when I used the atomics kernel from MicahVillmow instead.

                                                            • cheapo GT120M outperforms HD5850 in prefix sum
                                                              noxnet

                                                              I ran the benchmarks on an HD5750 with the following results:

                                                              Copy:

                                                              32-Bit
                                                              GPU velocity: 36.518GB/s
                                                              GPU velocity: 47.213GB/s
                                                              GPU velocity: 46.405GB/s
                                                              GPU velocity: 47.128GB/s
                                                              GPU velocity: 47.504GB/s
                                                              GPU velocity: 46.260GB/s
                                                              GPU velocity: 47.171GB/s
                                                              GPU velocity: 47.129GB/s
                                                              GPU velocity: 47.339GB/s
                                                              GPU velocity: 46.899GB/s

                                                              64-Bit
                                                              GPU velocity: 17.410GB/s
                                                              GPU velocity: 26.471GB/s
                                                              GPU velocity: 25.673GB/s
                                                              GPU velocity: 26.436GB/s
                                                              GPU velocity: 26.473GB/s
                                                              GPU velocity: 26.417GB/s
                                                              GPU velocity: 26.516GB/s
                                                              GPU velocity: 26.512GB/s
                                                              GPU velocity: 26.138GB/s
                                                              GPU velocity: 26.556GB/s

                                                              Scan:
                                                              32-Bit
                                                              GPU velocity: 116.712M
                                                              GPU velocity: 121.473M
                                                              GPU velocity: 121.298M
                                                              GPU velocity: 121.555M
                                                              GPU velocity: 121.409M
                                                              GPU velocity: 121.598M
                                                              GPU velocity: 120.943M
                                                              GPU velocity: 119.926M
                                                              GPU velocity: 121.391M
                                                              GPU velocity: 120.498M

                                                              64-Bit
                                                              GPU velocity: 95.510M
                                                              GPU velocity: 121.505M
                                                              GPU velocity: 121.431M
                                                              GPU velocity: 121.573M
                                                              GPU velocity: 119.621M
                                                              GPU velocity: 121.465M
                                                              GPU velocity: 121.393M
                                                              GPU velocity: 120.325M
                                                              GPU velocity: 120.980M
                                                              GPU velocity: 120.731M

                                                              When running the SDK sample BinomialOption with the suggested parameters (BinomialOption.exe -x 1048576 -i 10 -q -t),
                                                              my screen goes black and then I get a Windows message that the display driver stopped working
                                                              and has recovered.  When running with other parameters (BinomialOption.exe -x 524288 -i 10 -q -t) I get the following result:

                                                              Executing kernel for 10 iterations
                                                              -------------------------------------------
                                                              Option Samples           Time(sec)                KernelTime(sec)          Options/sec

                                                              524288                  8.58333                  8.0323                  61082.1

                                                              Any ideas why the performance of the HD5750 is so bad on my system?


                                                              System:
                                                              ----------------------------------------------------------------
                                                              Windows 7 64-Bit
                                                              Stream SDK 2.0.1 64-Bit
                                                              Visual Studio VC++ Express
                                                              Catalyst 10.3


                                                              OpenCL Query results:
                                                              ----------------------------------------------------------------
                                                              Platform Name:   ATI Stream
                                                              Platform Version:  OpenCL 1.0 ATI-Stream-v2.0.1
                                                              Vendor:   Advanced Micro Devices, Inc.
                                                              Device Name:   Juniper

                                                              Profile:   FULL_PROFILE
                                                              Supported Extensions:  cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics

                                                              Local Mem Type (Local=1, Global=2): 1
                                                              Local Mem Size (KB):    32
                                                              Global Mem Size (MB):   256
                                                              Global Mem Cache Size (Bytes):  0
                                                              Clock Frequency (MHz):   700
                                                              Max Work Group Size:   256
                                                              Address Bits:    32
                                                              Max Compute Units:   9

                                                              Vector type width for: char =  16
                                                              Vector type width for: short =  8
                                                              Vector type width for: int =  4
                                                              Vector type width for: long =  2
                                                              Vector type width for: float =  4
                                                              Vector type width for: double =  0