BarnacleJunior
Journeyman III

cheapo GT120M outperforms HD5850 in prefix sum

I've been wrestling with this for a long time, and my cheapo 4 SM/32 core GT 120 Mobile is outperforming my HD 5850 in a basic prefix sum in OpenCL.  I'm using the NV beta 3 GPU SDK with the 159.62 driver, and the 2.0 Stream SDK with the 9.12 hotfix driver.  I'm running Win7 x64.

On this benchmark, the GT120M sums 275M uints/sec and the HD5850 sums 253M uints/sec.  Please don't point out bank conflicts or weird serialization things.  The 5850 should be at least 10x faster than my GT120M on the provided code.  Also, I've benchmarked parallel code in DX11 cs_5_0, and get a throughput of 3200M uints/sec on my Radeon (and I know even that is coming in too low).  There must be something messed up with the driver or shader compiler.  I'll contribute any more code that would be useful in diagnosing these issues, as I have an application that I really need to get going.

Thanks,

.sean

 

--- scan.cl

#define NUM_THREADS (1 << NUM_LEVELS)

// 32 channels means shift five and add
#define BANK_ADDRESS(i) (i + (i >> 5))
#define LOCAL_SIZE (BANK_ADDRESS(NUM_THREADS))

void ThreadSum(uint tid, __local uint sharedSum[LOCAL_SIZE]) {
    uint tid2 = BANK_ADDRESS(tid);

    // reduction (up-sweep) phase
    for(uint d = 0; d < NUM_LEVELS - 1; ++d) {
        barrier(CLK_LOCAL_MEM_FENCE);
        uint mask = (2 << d) - 1;
        uint offset = 1 << d;
        if(mask == (mask & tid))
            sharedSum[tid2] += sharedSum[BANK_ADDRESS(tid - offset)];
    }

    barrier(CLK_LOCAL_MEM_FENCE);
    if(0 == tid) {
        uint ai = BANK_ADDRESS(NUM_THREADS / 2 - 1);
        uint bi = BANK_ADDRESS(NUM_THREADS - 1);
        uint at = sharedSum[ai];
        sharedSum[ai] += sharedSum[bi];
        sharedSum[bi] += at + at;
    }

    // distribution (down-sweep) phase
    for(uint d = NUM_LEVELS - 1; d; --d) {
        barrier(CLK_LOCAL_MEM_FENCE);
        uint mask = (1 << d) - 1;
        uint offset = 1 << (d - 1);
        if(mask == (mask & tid)) {
            uint t = sharedSum[tid2];
            uint r = BANK_ADDRESS(tid - offset);
            sharedSum[tid2] += sharedSum[r];
            sharedSum[r] = t;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}

uint4 Inclusive4Sum(uint4 vec) {
    vec.y += vec.x;
    vec.z += vec.y;
    vec.w += vec.z;
    return vec;
}

uint8 Inclusive8Sum(uint8 vec) {
    uint8 result;
    result.lo = Inclusive4Sum(vec.lo);
    result.hi = Inclusive4Sum(vec.hi) + result.lo.w;
    return result;
}

__kernel __attribute__((reqd_work_group_size(NUM_THREADS, 1, 1)))
void PrefixSumBlock_Pass1(__global uint* pass1_values,
                          __global uint* pass1_partialSums) {
    __local uint sharedSum[LOCAL_SIZE];
    __global uint8* pass1_values_vec8 = (__global uint8*)pass1_values;

    uint tid = get_local_id(0);
    uint gid = get_group_id(0);
    uint index = NUM_THREADS * gid + tid;

    uint8 a = pass1_values_vec8[index];
    uint8 aInc = Inclusive8Sum(a);

    uint tid2 = BANK_ADDRESS(tid);
    sharedSum[tid2] = aInc.s7;
    ThreadSum(tid, sharedSum);

    uint total = sharedSum[BANK_ADDRESS(0)];
    uint aExc = sharedSum[tid2] - total;
    uint8 sum = aInc - a + aExc;

    pass1_values_vec8[index] = sum;
    if(0 == tid) pass1_partialSums[gid] = total;
}

///////////////////////////////////////////////////////////////////////////////
// Finalize

__kernel __attribute__((reqd_work_group_size(NUM_THREADS, 1, 1)))
void PrefixSumBlock_Finalize(__global uint* finalize_values) {
    __local uint sharedSum[LOCAL_SIZE];
    __global uint8* finalize_values_vec8 = (__global uint8*)finalize_values;

    uint tid = get_local_id(0);
    uint gid = get_group_id(0);
    uint index = NUM_THREADS * gid + tid;

    uint8 a = finalize_values_vec8[index];
    uint8 aInc = Inclusive8Sum(a);

    uint tid2 = BANK_ADDRESS(tid);
    sharedSum[tid2] = aInc.s7;
    ThreadSum(tid, sharedSum);

    uint total = sharedSum[BANK_ADDRESS(0)];
    uint exc = sharedSum[tid2] - total;
    finalize_values_vec8[index] = aInc - a + (uint8)exc;
}

///////////////////////////////////////////////////////////////////////////////
// Pass 2

__kernel __attribute__((reqd_work_group_size(NUM_THREADS, 1, 1)))
void PrefixSumBlock_Pass2(__global uint* pass2_values,
                          __global const uint* pass2_offsets) {
    __global uint8* pass2_values_vec8 = (__global uint8*)pass2_values;

    uint tid = get_local_id(0);
    uint gid = get_group_id(0);
    uint index = NUM_THREADS * gid + tid;

    uint8 a = pass2_values_vec8[index];
    uint partialSum = pass2_offsets[gid];
    pass2_values_vec8[index] = a + (uint8)partialSum;
}

-- main.cpp

#pragma comment(lib, "opencl")

#include <windows.h>
#include <vector>
#include <string>
#include <fstream>
#include <iostream>
#include <cmath>
#include <cassert>

#define __CL_ENABLE_EXCEPTIONS
#include <cl/cl.hpp>

typedef unsigned int uint;

const size_t NumLoops = 5000;
const size_t NumElements = 1 << 20;

template<typename T>
bool DumpBuffer(cl::CommandQueue& queue, cl::Buffer& buffer, std::vector<T>& data) {
    size_t size;
    buffer.getInfo(CL_MEM_SIZE, &size);
    if(size % sizeof(T)) return false;
    data.resize(size / sizeof(T));
    queue.enqueueReadBuffer(buffer, true, 0, size, &data[0]);
    return true;
}

class PrefixSumCache {
public:
    size_t Capacity() const { return _capacity; }
    cl::Buffer& BaseLevel() { return _levels[0]; }

private:
    friend class CLPrefixSum;
    std::vector<cl::Buffer> _levels;
    size_t _capacity, _valuesPerGroup;
};

class CLPrefixSum {
public:
    void CreatePrefixSumCache(size_t capacity, std::auto_ptr<PrefixSumCache>* ppCache);
    void Execute(cl::CommandQueue& queue, size_t numElements, PrefixSumCache* cache);

private:
    friend void CreateCLPrefixSum(cl::Context& context, size_t valuesPerGroup,
        size_t threadsPerGroup, cl::Kernel& pass1, cl::Kernel& finalize,
        cl::Kernel& pass2, std::auto_ptr<CLPrefixSum>* ppPrefixSum);

    static void LevelCounts(size_t numElements, size_t valuesPerGroup,
        std::vector<size_t>& levelCounts);

    cl::Context _context;
    cl::Kernel _pass1, _finalize, _pass2;
    size_t _valuesPerGroup, _threadsPerGroup;
};

void CreateCLPrefixSum(cl::Context& context, size_t valuesPerGroup, size_t threadsPerGroup,
    cl::Kernel& pass1, cl::Kernel& finalize, cl::Kernel& pass2,
    std::auto_ptr<CLPrefixSum>* ppPrefixSum) {

    std::auto_ptr<CLPrefixSum> prefixSum(new CLPrefixSum);
    prefixSum->_context = context;
    prefixSum->_pass1 = pass1;
    prefixSum->_finalize = finalize;
    prefixSum->_pass2 = pass2;
    prefixSum->_valuesPerGroup = valuesPerGroup;
    prefixSum->_threadsPerGroup = threadsPerGroup;
    *ppPrefixSum = prefixSum;
}

void CLPrefixSum::LevelCounts(size_t numElements, size_t valuesPerGroup,
    std::vector<size_t>& levelCounts) {
    // write the dispatch count for each pass1/pass2 level.
    while(numElements > valuesPerGroup) {
        numElements = (numElements + valuesPerGroup - 1) / valuesPerGroup;
        levelCounts.push_back(numElements);
    }
}

void CLPrefixSum::CreatePrefixSumCache(size_t capacity, std::auto_ptr<PrefixSumCache>* ppCache) {
    std::auto_ptr<PrefixSumCache> cache(new PrefixSumCache);
    cache->_capacity = capacity;
    cache->_valuesPerGroup = _valuesPerGroup;

    std::vector<size_t> levelCounts;
    LevelCounts(capacity, _valuesPerGroup, levelCounts);

    // round each level's capacity up to a multiple of _valuesPerGroup.
    capacity = ~(_valuesPerGroup - 1) & (capacity + _valuesPerGroup - 1);
    cache->_levels.push_back(cl::Buffer(_context, CL_MEM_READ_WRITE, 4 * capacity));

    for(size_t i(0); i < levelCounts.size(); ++i) {
        size_t count = levelCounts[i];
        capacity = ~(_valuesPerGroup - 1) & (count + _valuesPerGroup - 1);
        cache->_levels.push_back(cl::Buffer(_context, CL_MEM_READ_WRITE, 4 * capacity));
    }
    *ppCache = cache;
}

void CLPrefixSum::Execute(cl::CommandQueue& queue, size_t numElements, PrefixSumCache* cache) {
    assert(cache->_valuesPerGroup == _valuesPerGroup);

    std::vector<size_t> levelCounts;
    LevelCounts(numElements, _valuesPerGroup, levelCounts);

    for(size_t level(0); level < levelCounts.size(); ++level) {
        cl::KernelFunctor f(_pass1, queue, cl::NDRange(),
            levelCounts[level] * _threadsPerGroup, _threadsPerGroup);
        f(cache->_levels[level], cache->_levels[level + 1]);
    }

    cl::KernelFunctor f(_finalize, queue, cl::NDRange(), _threadsPerGroup, _threadsPerGroup);
    f(cache->_levels.back());

    for(int level(levelCounts.size() - 1); level >= 0; --level) {
        cl::KernelFunctor f(_pass2, queue, cl::NDRange(),
            levelCounts[level] * _threadsPerGroup, _threadsPerGroup);
        f(cache->_levels[level], cache->_levels[level + 1]);
    }
}

int main(int argc, char** argv) {
    // read in the file
    std::ifstream f("scan.cl");
    std::string s(std::istreambuf_iterator<char>(f), std::istreambuf_iterator<char>(0));

    std::vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);

    std::vector<cl::Device> devices;
    platforms.front().getDevices(CL_DEVICE_TYPE_GPU, &devices);

    cl_context_properties props[3] = {
        CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[0](), 0 };
    cl::Context context(CL_DEVICE_TYPE_GPU, props);
    cl::CommandQueue commandQueue(context, devices[0]);

    cl::Program program(context,
        cl::Program::Sources(1, std::make_pair(s.c_str(), s.size())));

    int err = program.build(devices, "-D NUM_LEVELS=7");
    if(err) {
        std::string buildError;
        program.getBuildInfo(devices[0], CL_PROGRAM_BUILD_LOG, &buildError);
        std::cout << buildError;
        return 0;
    }

    cl::Kernel pass1(program, "PrefixSumBlock_Pass1", &err);
    cl::Kernel finalize(program, "PrefixSumBlock_Finalize", &err);
    cl::Kernel pass2(program, "PrefixSumBlock_Pass2", &err);

    const int NumLevels = 7;
    const int NumThreads = 1 << NumLevels;
    const int ValuesPerThread = 8;
    const int ValuesPerGroup = 8 * NumThreads;

    std::auto_ptr<CLPrefixSum> prefixSum;
    CreateCLPrefixSum(context, ValuesPerGroup, NumThreads, pass1, finalize, pass2, &prefixSum);

    std::auto_ptr<PrefixSumCache> prefixSumCache;
    prefixSum->CreatePrefixSumCache(NumElements, &prefixSumCache);

    std::vector<uint> values(NumElements, 1);
    commandQueue.enqueueWriteBuffer(prefixSumCache->BaseLevel(), true, 0,
        NumElements * 4, &values[0]);
    commandQueue.finish();

    LARGE_INTEGER freq;
    LARGE_INTEGER begin, end;
    QueryPerformanceFrequency(&freq);
    double period = 1.0 / freq.QuadPart;

    for(int j(0); j < 10; ++j) {
        QueryPerformanceCounter(&begin);
        for(int i(0); i < NumLoops; ++i)
            prefixSum->Execute(commandQueue, NumElements, prefixSumCache.get());

        std::vector<uint> values2;
        DumpBuffer(commandQueue, prefixSumCache->BaseLevel(), values2);

        QueryPerformanceCounter(&end);
        double elapsed = period * (end.QuadPart - begin.QuadPart);
        double velocity = NumElements * (NumLoops / elapsed);
        printf("GPU velocity: %1.3fM\n", velocity / 1.0e6);
    }
}
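For reference, here is a minimal host-side correctness check (just a sketch, reusing the headers already in main.cpp and assuming the kernels implement an exclusive scan): since the buffer is filled with 1s, element i of the scanned output should simply equal i. It only applies to a single Execute on a freshly written buffer, since the timing loop above scans the same buffer in place repeatedly.

// Sketch of a correctness check (run once on fresh input, outside the timing
// loop): with all-1s input and exclusive-scan semantics, result[i] == i.
bool VerifyExclusiveScanOfOnes(const std::vector<uint>& result, size_t count) {
    for(size_t i = 0; i < count; ++i)
        if(result[i] != (uint)i) {
            printf("Mismatch at %u: got %u, expected %u\n",
                (uint)i, result[i], (uint)i);
            return false;
        }
    return true;
}

// e.g. after a single Execute and DumpBuffer(commandQueue, prefixSumCache->BaseLevel(), values2):
// assert(VerifyExclusiveScanOfOnes(values2, NumElements));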

genaganna
Journeyman III

cheapo GT120M outperforms HD5850 in prefix sum

Originally posted by: BarnacleJunior I've been wrestling with this for a long time, and my cheapo 4 SM/32 core GT 120 Mobile is outperforming my HD 5850 in a basic prefix sum in OpenCL.  I'm using the NV beta 3 GPU SDK with the 159.62 driver, and the 2.0 Stream SDK with the 9.12 hotfix driver.  I'm running Win7 x64.

 

On this benchmark, the GT120M sums 275M uints/sec and the HD5850 sums 253M uints/sec.  Please don't point out bank conflicts or weird serialization things.  The 5850 should be at least 10x faster than my GT120M on the provided code.  Also, I've benchmarked parallel code in DX11 cs_5_0, and get a throughput of 3200M uints/sec on my Radeon (and I know even that is coming in too low).  There must be something messed up with the driver or shader compiler.  I'll contribute any more code that would be useful in diagnosing these issues, as I have an application that I really need to get going.

 

Thanks,

 

.sean

BarnacleJunior,

I am getting 280M uints/sec with Juniper on XP64. I am sure you should be getting far more than this.

BarnacleJunior
Journeyman III

cheapo GT120M outperforms HD5850 in prefix sum

Thanks for testing that, genaganna.  It almost seems like with that set of kernels, only one of my 18 SIMD engines is running.  Your Juniper runs at a slightly higher clockrate and that could explain why your throughput is a bit better.
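One other quick sanity check (just a sketch using standard clGetDeviceInfo queries through the C++ wrapper, and reusing the includes already in the posted .cpp): dump what the runtime reports for the device and make sure it actually exposes all the compute units.

// Print what the OpenCL runtime reports for a device; a Cypress-class
// HD 5850 should report 18 compute units (SIMD engines).
void PrintDeviceStats(const cl::Device& device) {
    std::string name = device.getInfo<CL_DEVICE_NAME>();
    cl_uint computeUnits = device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
    cl_uint clockMHz = device.getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>();
    size_t maxGroupSize = device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
    std::cout << name << ": " << computeUnits << " compute units @ "
              << clockMHz << " MHz, max work-group size " << maxGroupSize << "\n";
}

// e.g. PrintDeviceStats(devices[0]); right after getDevices in main.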

To make sure it wasn't memory bandwidth, I tested an extremely simple kernel: just read from one array, increment, and write to another. I'm getting a consistent 116 GB/s of throughput, just shy of the 128 GB/s theoretical bandwidth.

I'm including that code in case anyone else wants to test theirs.

 

-- copy.cl

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void IncAndCopy(__global const uint* source, __global uint* target) {
    __global const uint8* source_vec8 = (__global const uint8*)source;
    __global uint8* target_vec8 = (__global uint8*)target;

    uint gid = get_group_id(0);
    uint tid = get_local_id(0);
    uint offset = 64 * gid + tid;

    uint8 x = source_vec8[offset];
    target_vec8[offset] = x + (uint8)1;
}

-- copy.cpp

#pragma comment(lib, "opencl")

#include <windows.h>
#include <vector>
#include <string>
#include <fstream>
#include <iostream>
#include <cmath>
#include <cassert>

#define __CL_ENABLE_EXCEPTIONS
#include <cl/cl.hpp>

typedef unsigned int uint;

const size_t NumLoops = 10000;
const size_t NumElements = 1 << 24;

template<typename T>
bool DumpBuffer(cl::CommandQueue& queue, cl::Buffer& buffer, std::vector<T>& data) {
    size_t size;
    buffer.getInfo(CL_MEM_SIZE, &size);
    if(size % sizeof(T)) return false;
    data.resize(size / sizeof(T));
    queue.enqueueReadBuffer(buffer, true, 0, size, &data[0]);
    return true;
}

int main(int argc, char** argv) {
    // read in the file
    std::ifstream f("copy.cl");
    std::string s(std::istreambuf_iterator<char>(f), std::istreambuf_iterator<char>(0));

    std::vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);

    std::vector<cl::Device> devices;
    platforms.front().getDevices(CL_DEVICE_TYPE_GPU, &devices);

    cl_context_properties props[3] = {
        CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[0](), 0 };
    cl::Context context(CL_DEVICE_TYPE_GPU, props);
    cl::CommandQueue commandQueue(context, devices[0]);

    cl::Program program(context,
        cl::Program::Sources(1, std::make_pair(s.c_str(), s.size())));

    int err = program.build(devices);
    if(err) {
        std::string buildError;
        program.getBuildInfo(devices[0], CL_PROGRAM_BUILD_LOG, &buildError);
        std::cout << buildError;
        return 0;
    }

    cl::Kernel k(program, "IncAndCopy", &err);

    const size_t ValuesPerThread = 8;
    const size_t ThreadsPerGroup = 64;
    const size_t ValuesPerGroup = ValuesPerThread * ThreadsPerGroup;

    std::vector<uint> values1(NumElements, 0), values2(NumElements);
    cl::Buffer buffer1(context, CL_MEM_READ_WRITE, 4 * NumElements);
    cl::Buffer buffer2(context, CL_MEM_READ_WRITE, 4 * NumElements);
    commandQueue.enqueueWriteBuffer(buffer1, true, 0, NumElements * 4, &values1[0]);
    commandQueue.finish();

    LARGE_INTEGER freq, begin, end;
    QueryPerformanceFrequency(&freq);
    double period = 1.0 / freq.QuadPart;

    for(int j(0); j < 10; ++j) {
        QueryPerformanceCounter(&begin);
        for(int i(0); i < NumLoops; i += 2) {
            cl::KernelFunctor f(k, commandQueue, cl::NDRange(),
                NumElements / ValuesPerGroup, ThreadsPerGroup);
            f(buffer1, buffer2);
            f(buffer2, buffer1);
        }

        std::vector<uint> result;
        DumpBuffer(commandQueue, buffer1, result);

        QueryPerformanceCounter(&end);
        double elapsed = period * (end.QuadPart - begin.QuadPart);
        double velocity = NumElements * (NumLoops / elapsed);
        printf("GPU velocity: %1.3fGB/s\n", velocity * 4 / (1 << 30));
    }
}

BarnacleJunior
Journeyman III

cheapo GT120M outperforms HD5850 in prefix sum

Stupid me!  For the bandwidth test I accidentally passed the number of groups, not the number of threads, as the global work size, so I was overstating the memory bandwidth by a factor of 64!

The kernel call should look like
    cl::KernelFunctor f(k, commandQueue, cl::NDRange(),
        NumElements / ValuesPerThread, ThreadsPerGroup);
    f(buffer1, buffer2);
    f(buffer2, buffer1);

With the correct parameters it's only reporting 2 GB/s of bandwidth.  Why is it so ludicrously slow?  Is anyone else willing to try?  (Also set NumLoops = 1000 and NumElements = 1 << 21, or it takes forever to run.)

genaganna
Journeyman III

cheapo GT120M outperforms HD5850 in prefix sum

BarnacleJunior,

Looks like you are facing a problem with the SIMDs.

Could you please run the SDK samples and send the performance results?

Run BinomialOption.exe with the following options and post the result:

                     -x 1048576 -i 10 -q -t

BarnacleJunior
Journeyman III

cheapo GT120M outperforms HD5850 in prefix sum

C:\codelibs\atistream\samples\opencl\bin\x86>BinomialOption.exe -x 1048576 -i 10 -q -t
Executing kernel for 10 iterations
-------------------------------------------
Option Samples           Time(sec)                KernelTime(sec)          Options/sec

1048576                  8.15981                  7.77975                  128505

genaganna
Journeyman III

cheapo GT120M outperforms HD5850 in prefix sum

Originally posted by: BarnacleJunior

C:\codelibs\atistream\samples\opencl\bin\x86>BinomialOption.exe -x 1048576 -i 10 -q -t
Executing kernel for 10 iterations
-------------------------------------------
Option Samples           Time(sec)                KernelTime(sec)          Options/sec
1048576                  8.15981                  7.77975                  128505

It looks like your SIMDs are working fine. This is what I am getting on Juniper:

Executing kernel for 10 iterations
-------------------------------------------
Option Samples           Time(sec)                KernelTime(sec)          Options/sec
1048576                  13.575                   11.9471                  77243

MicahVillmow
Staff

cheapo GT120M outperforms HD5850 in prefix sum

BarnacleJunior,
What is your group size and your local size?
BarnacleJunior
Journeyman III

cheapo GT120M outperforms HD5850 in prefix sum

Local size is 64 threads, and the global size (for 1 << 23 elements in the memory bandwidth test) is 1 << 20 threads; each thread writes 8 values.

cl::KernelFunctor f(k, commandQueue, cl::NDRange(), NumElements / ValuesPerThread, ThreadsPerGroup);
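For what it's worth, here is the same dispatch spelled out with enqueueNDRangeKernel instead of the KernelFunctor wrapper, just so the global and local sizes are explicit (a sketch reusing the names from copy.cpp above):

// Explicit form of the launch above: 1<<20 work-items total (global),
// 64 work-items per group (local), each work-item handling one uint8.
k.setArg(0, buffer1);
k.setArg(1, buffer2);
commandQueue.enqueueNDRangeKernel(k, cl::NullRange,
    cl::NDRange(NumElements / ValuesPerThread),  // global work size
    cl::NDRange(ThreadsPerGroup));               // local work size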

MicahVillmow
Staff

cheapo GT120M outperforms HD5850 in prefix sum

This is what I get on an HD5870:
GPU velocity: 2564.680M
GPU velocity: 2632.006M
GPU velocity: 2624.121M
GPU velocity: 2611.988M
GPU velocity: 2613.841M
GPU velocity: 2622.924M
GPU velocity: 2624.601M
GPU velocity: 2608.794M
GPU velocity: 2618.155M
GPU velocity: 2609.728M

After using atomics to get rid of the flow control inside the loops, I get:
GPU velocity: 3003.810M
GPU velocity: 3093.773M
GPU velocity: 3075.038M
GPU velocity: 3074.808M
GPU velocity: 3078.235M
GPU velocity: 3059.833M
GPU velocity: 3066.295M
GPU velocity: 3066.350M
GPU velocity: 3012.033M
GPU velocity: 3002.686M

A nice 15% improvement.

Once this is done, the ISA looks very nice except that you are now bound by your use of barriers.

#ifdef USE_ATOMIC_UPDATES
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
#endif

#ifndef NUM_LEVELS
#define NUM_LEVELS 7
#endif

#define NUM_THREADS (1 << NUM_LEVELS)

// 32 channels means shift five and add
#define BANK_ADDRESS(i) (i + (i >> 5))
#define LOCAL_SIZE (BANK_ADDRESS(NUM_THREADS))

void ThreadSum(uint tid, __local uint sharedSum[LOCAL_SIZE]) {
    uint tid2 = BANK_ADDRESS(tid);

    // reduction (up-sweep) phase
    for(uint d = 0; d < NUM_LEVELS - 1; ++d) {
        barrier(CLK_LOCAL_MEM_FENCE);
        uint mask = (2 << d) - 1;
        uint offset = 1 << d;
#ifndef USE_ATOMIC_UPDATES
        if(mask == (mask & tid))
            sharedSum[tid2] += sharedSum[BANK_ADDRESS(tid - offset)];
#else
        bool cond = (mask == (mask & tid));
        uint ssum = sharedSum[BANK_ADDRESS(tid - offset)];
        uint ss2 = sharedSum[tid2];
        uint ssp = ss2 + ssum;
        uint val = select((uint)0, ssp, cond);
        atom_add(sharedSum + tid2, val);
#endif
    }

    barrier(CLK_LOCAL_MEM_FENCE);
#ifndef USE_ATOMIC_UPDATES
    if(0 == tid) {
        uint ai = BANK_ADDRESS(NUM_THREADS / 2 - 1);
        uint bi = BANK_ADDRESS(NUM_THREADS - 1);
        uint at = sharedSum[ai];
        sharedSum[ai] += sharedSum[bi];
        sharedSum[bi] += at + at;
    }
#else
    uint ai = BANK_ADDRESS(NUM_THREADS / 2 - 1);
    uint bi = BANK_ADDRESS(NUM_THREADS - 1);
    uint at = sharedSum[ai];
    uint bt = sharedSum[bi];
    uint b = (0 == tid);
    uint atpbt = at + bt;
    uint at2pbt = at + at + bt;
    uint at2 = select((uint)0, atpbt, b);
    uint bt2 = select((uint)0, at2pbt, b);
    atom_add(sharedSum + ai, at2);
    atom_add(sharedSum + bi, bt2);
#endif

    // distribution (down-sweep) phase
    for(uint d = NUM_LEVELS - 1; d; --d) {
        barrier(CLK_LOCAL_MEM_FENCE);
        uint mask = (1 << d) - 1;
        uint offset = 1 << (d - 1);
#ifndef USE_ATOMIC_UPDATES
        if(mask == (mask & tid)) {
            uint t = sharedSum[tid2];
            uint r = BANK_ADDRESS(tid - offset);
            sharedSum[tid2] += sharedSum[r];
            sharedSum[r] = t;
        }
#else
        uint t = sharedSum[tid2];
        uint r = BANK_ADDRESS(tid - offset);
        uint rdata = sharedSum[r];
        uint tpr = t + rdata;
        uint cond = (mask == (mask & tid));
        uint ss2 = select((uint)0, tpr, cond);
        uint sr = select((uint)0, t, cond);
        atom_add(sharedSum + tid2, ss2);
        atom_add(sharedSum + r, sr);
#endif
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}

uint4 Inclusive4Sum(uint4 vec) {
    vec.y += vec.x;
    vec.z += vec.y;
    vec.w += vec.z;
    return vec;
}

uint8 Inclusive8Sum(uint8 vec) {
    uint8 result;
    result.lo = Inclusive4Sum(vec.lo);
    result.hi = Inclusive4Sum(vec.hi) + result.lo.w;
    return result;
}

__kernel __attribute__((reqd_work_group_size(NUM_THREADS, 1, 1)))
void PrefixSumBlock_Pass1(__global uint* pass1_values,
                          __global uint* pass1_partialSums) {
    __local uint sharedSum[LOCAL_SIZE];
    __global uint8* pass1_values_vec8 = (__global uint8*)pass1_values;

    uint tid = get_local_id(0);
    uint gid = get_group_id(0);
    uint index = NUM_THREADS * gid + tid;

    uint8 a = pass1_values_vec8[index];
    uint8 aInc = Inclusive8Sum(a);

    uint tid2 = BANK_ADDRESS(tid);
    sharedSum[tid2] = aInc.s7;
    ThreadSum(tid, sharedSum);

    uint total = sharedSum[BANK_ADDRESS(0)];
    uint aExc = sharedSum[tid2] - total;
    uint8 sum = aInc - a + aExc;

    pass1_values_vec8[index] = sum;
    if(0 == tid) pass1_partialSums[gid] = total;
}

///////////////////////////////////////////////////////////////////////////////
// Finalize

__kernel __attribute__((reqd_work_group_size(NUM_THREADS, 1, 1)))
void PrefixSumBlock_Finalize(__global uint* finalize_values) {
    __local uint sharedSum[LOCAL_SIZE];
    __global uint8* finalize_values_vec8 = (__global uint8*)finalize_values;

    uint tid = get_local_id(0);
    uint gid = get_group_id(0);
    uint index = NUM_THREADS * gid + tid;

    uint8 a = finalize_values_vec8[index];
    uint8 aInc = Inclusive8Sum(a);

    uint tid2 = BANK_ADDRESS(tid);
    sharedSum[tid2] = aInc.s7;
    ThreadSum(tid, sharedSum);

    uint total = sharedSum[BANK_ADDRESS(0)];
    uint exc = sharedSum[tid2] - total;
    finalize_values_vec8[index] = aInc - a + (uint8)exc;
}

///////////////////////////////////////////////////////////////////////////////
// Pass 2

__kernel __attribute__((reqd_work_group_size(NUM_THREADS, 1, 1)))
void PrefixSumBlock_Pass2(__global uint* pass2_values,
                          __global const uint* pass2_offsets) {
    __global uint8* pass2_values_vec8 = (__global uint8*)pass2_values;

    uint tid = get_local_id(0);
    uint gid = get_group_id(0);
    uint index = NUM_THREADS * gid + tid;

    uint8 a = pass2_values_vec8[index];
    uint partialSum = pass2_offsets[gid];
    pass2_values_vec8[index] = a + (uint8)partialSum;
}
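If anyone wants to try the atomic path, the only host-side change should be adding the define when building the program (a sketch against the main.cpp posted above; the device also has to support the cl_khr_local_int32 atomics extensions):

// Build scan.cl with the atomic-update variant of ThreadSum enabled.
int err = program.build(devices, "-D NUM_LEVELS=7 -D USE_ATOMIC_UPDATES");
if(err) {
    std::string buildError;
    program.getBuildInfo(devices[0], CL_PROGRAM_BUILD_LOG, &buildError);
    std::cout << buildError;
}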
