BarnacleJunior

cheapo GT120M outperforms HD5850 in prefix sum

Discussion created by BarnacleJunior on Dec 31, 2009
Latest reply on Apr 2, 2010 by noxnet

I've been wrestling with this for a long time, and my cheapo 4 SM/32 core GT 120 Mobile is outperforming my HD 5850 in a basic prefix sum in OpenCL.  I'm using the NV beta 3 GPU SDK with the 159.62 driver, and the 2.0 Stream SDK with the 9.12 hotfix driver.  I'm running Win7 x64.

On this benchmark, the GT120M sums 275M uints/sec and the HD5850 sums 253M uints/sec.  Please don't point out bank conflicts or weird serialization things.  The 5850 should be at least 10x faster than my GT120M on the provided code.  Also, I've benchmarked parallel code in DX11 cs_5_0, and get a throughput of 3200M uints/sec on my Radeon (and I know even that is coming in too low).  There must be something messed up with the driver or shader compiler.  I'll contribute any more code that would be useful in diagnosing these issues, as I have an application that I really need to get going.

Thanks,

.sean

 

--- scan.cl #define NUM_THREADS (1<< NUM_LEVELS) // 32 channels means shift five and add #define BANK_ADDRESS(i) (i + (i>> 5)) #define LOCAL_SIZE (BANK_ADDRESS(NUM_THREADS)) void ThreadSum(uint tid, __local uint sharedSum[LOCAL_SIZE]) { uint tid2 = BANK_ADDRESS(tid); for(uint d = 0; d < NUM_LEVELS - 1; ++d) { barrier(CLK_LOCAL_MEM_FENCE); uint mask = (2<< d) - 1; uint offset = 1<< d; if(mask == (mask & tid)) sharedSum[tid2] += sharedSum[BANK_ADDRESS(tid - offset)]; } barrier(CLK_LOCAL_MEM_FENCE); if(0 == tid) { uint ai = BANK_ADDRESS(NUM_THREADS / 2 - 1); uint bi = BANK_ADDRESS(NUM_THREADS - 1); uint at = sharedSum[ai]; sharedSum[ai] += sharedSum[bi]; sharedSum[bi] += at + at; } for(uint d = NUM_LEVELS - 1; d; --d) { barrier(CLK_LOCAL_MEM_FENCE); uint mask = (1<< d) - 1; uint offset = 1<< (d - 1); if(mask == (mask & tid)) { uint t = sharedSum[tid2]; uint r = BANK_ADDRESS(tid - offset); sharedSum[tid2] += sharedSum[r]; sharedSum[r] = t; } } barrier(CLK_LOCAL_MEM_FENCE); } uint4 Inclusive4Sum(uint4 vec) { vec.y += vec.x; vec.z += vec.y; vec.w += vec.z; return vec; } uint8 Inclusive8Sum(uint8 vec) { uint8 result; result.lo = Inclusive4Sum(vec.lo); result.hi = Inclusive4Sum(vec.hi) + result.lo.w; return result; } __kernel __attribute__((reqd_work_group_size(NUM_THREADS, 1, 1))) void PrefixSumBlock_Pass1( __global uint* pass1_values, __global uint* pass1_partialSums) { __local uint sharedSum[LOCAL_SIZE]; global uint8* pass1_values_vec8 = (__global uint8*)pass1_values; uint tid = get_local_id(0); uint gid = get_group_id(0); uint index = NUM_THREADS * gid + tid; uint8 a = pass1_values_vec8[index]; uint8 aInc = Inclusive8Sum(a); uint tid2 = BANK_ADDRESS(tid); sharedSum[tid2] = aInc.s7; ThreadSum(tid, sharedSum); uint total = sharedSum[BANK_ADDRESS(0)]; uint aExc = sharedSum[tid2] - total; uint8 sum = aInc - a + aExc; pass1_values_vec8[index] = sum; if(0 == tid) pass1_partialSums[gid] = total; } /////////////////////////////////////////////////////////////////////////////////////////////////// // Finalize __kernel __attribute__((reqd_work_group_size(NUM_THREADS, 1, 1))) void PrefixSumBlock_Finalize( __global uint* finalize_values) { __local uint sharedSum[LOCAL_SIZE]; __global uint8* finalize_values_vec8 = (__global uint8*)finalize_values; uint tid = get_local_id(0); uint gid = get_group_id(0); uint index = NUM_THREADS * gid + tid; uint8 a = finalize_values_vec8[index]; uint8 aInc = Inclusive8Sum(a); uint tid2 = BANK_ADDRESS(tid); sharedSum[tid2] = aInc.s7; ThreadSum(tid, sharedSum); uint total = sharedSum[BANK_ADDRESS(0)]; uint exc = sharedSum[tid2] - total; finalize_values_vec8[index] = aInc - a + (uint8)exc; } /////////////////////////////////////////////////////////////////////////////////////////////////// // Pass 2 __kernel __attribute__((reqd_work_group_size(NUM_THREADS, 1, 1))) void PrefixSumBlock_Pass2( __global uint* pass2_values, __global const uint* pass2_offsets) { __global uint8* pass2_values_vec8 = (__global uint8*)pass2_values; uint tid = get_local_id(0); uint gid = get_group_id(0); uint index = NUM_THREADS * gid + tid; uint8 a = pass2_values_vec8[index]; uint partialSum = pass2_offsets[gid]; pass2_values_vec8[index] = a + (uint8)partialSum; } -- main.cpp #pragma comment(lib, "opencl") #include <windows.h> #include <vector> #include <string> #include <fstream> #include <iostream> #include <cmath> #include <cassert> #define __CL_ENABLE_EXCEPTIONS #include <cl/cl.hpp> typedef unsigned int uint; const size_t NumLoops = 5000; const size_t NumElements = 1<< 20; template<typename T> bool DumpBuffer(cl::CommandQueue& queue, cl::Buffer& buffer, std::vector<T>& data) { size_t size; buffer.getInfo(CL_MEM_SIZE, &size); if(size % sizeof(T)) return false; data.resize(size / sizeof(T)); queue.enqueueReadBuffer(buffer, true, 0, size, &data[0]); return true; } class PrefixSumCache { public: size_t Capacity() const { return _capacity; } cl::Buffer& BaseLevel() { return _levels[0]; } private: friend class CLPrefixSum; std::vector<cl::Buffer> _levels; size_t _capacity, _valuesPerGroup; }; class CLPrefixSum { public: void CreatePrefixSumCache(size_t capacity, std::auto_ptr<PrefixSumCache>* ppCache); void Execute(cl::CommandQueue& queue, size_t numElements, PrefixSumCache* cache); private: friend void CreateCLPrefixSum(cl::Context& context, size_t valuesPerGroup, size_t threadsPerGroup, cl::Kernel& pass1, cl::Kernel& finalize, cl::Kernel& pass2, std::auto_ptr<CLPrefixSum>* ppPrefixSum); static void LevelCounts(size_t numElements, size_t valuesPerGroup, std::vector<size_t>& levelCounts); cl::Context _context; cl::Kernel _pass1, _finalize, _pass2; size_t _valuesPerGroup, _threadsPerGroup; }; void CreateCLPrefixSum(cl::Context& context, size_t valuesPerGroup, size_t threadsPerGroup, cl::Kernel& pass1, cl::Kernel& finalize, cl::Kernel& pass2, std::auto_ptr<CLPrefixSum>* ppPrefixSum) { std::auto_ptr<CLPrefixSum> prefixSum(new CLPrefixSum); prefixSum->_context = context; prefixSum->_pass1 = pass1; prefixSum->_finalize = finalize; prefixSum->_pass2 = pass2; prefixSum->_valuesPerGroup = valuesPerGroup; prefixSum->_threadsPerGroup = threadsPerGroup; *ppPrefixSum = prefixSum; } void CLPrefixSum::LevelCounts(size_t numElements, size_t valuesPerGroup, std::vector<size_t>& levelCounts) { // write the dispatch count for each pass1/pass2 level. while(numElements > valuesPerGroup) { numElements = (numElements + valuesPerGroup - 1) / valuesPerGroup; levelCounts.push_back(numElements); } } void CLPrefixSum::CreatePrefixSumCache(size_t capacity, std::auto_ptr<PrefixSumCache>* ppCache) { std::auto_ptr<PrefixSumCache> cache(new PrefixSumCache); cache->_capacity = capacity; cache->_valuesPerGroup = _valuesPerGroup; std::vector<size_t> levelCounts; LevelCounts(capacity, _valuesPerGroup, levelCounts); capacity = ~(_valuesPerGroup - 1) & (capacity + _valuesPerGroup - 1); cache->_levels.push_back(cl::Buffer(_context, CL_MEM_READ_WRITE, 4 * capacity)); for(size_t i(0); i < levelCounts.size(); ++i) { size_t count = levelCounts[i]; capacity = ~(_valuesPerGroup - 1) & (count + _valuesPerGroup - 1); cache->_levels.push_back(cl::Buffer(_context, CL_MEM_READ_WRITE, 4 * capacity)); } *ppCache = cache; } void CLPrefixSum::Execute(cl::CommandQueue& queue, size_t numElements, PrefixSumCache* cache) { assert(cache->_valuesPerGroup == _valuesPerGroup); std::vector<size_t> levelCounts; LevelCounts(numElements, _valuesPerGroup, levelCounts); for(size_t level(0); level < levelCounts.size(); ++level) { cl::KernelFunctor f(_pass1, queue, cl::NDRange(), levelCounts[level] * _threadsPerGroup, _threadsPerGroup); f(cache->_levels[level], cache->_levels[level + 1]); } cl::KernelFunctor f(_finalize, queue, cl::NDRange(), _threadsPerGroup, _threadsPerGroup); f(cache->_levels.back()); for(int level(levelCounts.size() - 1); level >= 0; --level) { cl::KernelFunctor f(_pass2, queue, cl::NDRange(), levelCounts[level] * _threadsPerGroup, _threadsPerGroup); f(cache->_levels[level], cache->_levels[level + 1]); } } int main(int argc, char** argv) { // read in the file std::ifstream f("scan.cl"); std::string s(std::istreambuf_iterator<char>(f), std::istreambuf_iterator<char>(0)); std::vector<cl::Platform> platforms; cl::Platform::get(&platforms); std::vector<cl::Device> devices; platforms.front().getDevices(CL_DEVICE_TYPE_GPU, &devices); cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[0](), 0 }; cl::Context context(CL_DEVICE_TYPE_GPU, props); cl::CommandQueue commandQueue(context, devices[0]); cl::Program program(context, cl::Program::Sources(1, std::make_pair(s.c_str(), s.size()))); int err = program.build(devices, "-D NUM_LEVELS=7"); if(err) { std::string buildError; program.getBuildInfo(devices[0], CL_PROGRAM_BUILD_LOG, &buildError); std::cout<< buildError; return 0; } cl::Kernel pass1(program, "PrefixSumBlock_Pass1", &err); cl::Kernel finalize(program, "PrefixSumBlock_Finalize", &err); cl::Kernel pass2(program, "PrefixSumBlock_Pass2", &err); const int NumLevels = 7; const int NumThreads = 1<< NumLevels; const int ValuesPerThread = 8; const int ValuesPerGroup = 8 * NumThreads; std::auto_ptr<CLPrefixSum> prefixSum; CreateCLPrefixSum(context, ValuesPerGroup, NumThreads, pass1, finalize, pass2, &prefixSum); std::auto_ptr<PrefixSumCache> prefixSumCache; prefixSum->CreatePrefixSumCache(NumElements, &prefixSumCache); std::vector<uint> values(NumElements, 1); commandQueue.enqueueWriteBuffer(prefixSumCache->BaseLevel(), true, 0, NumElements * 4, &values[0]); commandQueue.finish(); LARGE_INTEGER freq; LARGE_INTEGER begin, end; QueryPerformanceFrequency(&freq); double period = 1.0 / freq.QuadPart; for(int j(0); j < 10; ++j) { QueryPerformanceCounter(&begin); for(int i(0); i < NumLoops; ++i) prefixSum->Execute(commandQueue, NumElements, prefixSumCache.get()); std::vector<uint> values2; DumpBuffer(commandQueue, prefixSumCache->BaseLevel(), values2); QueryPerformanceCounter(&end); double elapsed = period * (end.QuadPart - begin.QuadPart); double velocity = NumElements * (NumLoops / elapsed); printf("GPU velocity: %1.3fM\n", velocity / 1.0e6); } }

Outcomes