See attached code.

Logically, the first version should run about as fast as the second version.

Instead I'm seeing that the first version can do 20M kernels/second and the second version can do 32M kernels/second on a 6970. This seems to be a compiler optimization bug, because the assembly code for the first version is a LOT bigger than the second version, a lot more than 4x (according to Kernel Analyzer). To be specific, the second version compiles into 76 VLIW instructions per loop, and the first version compiles into 520 instructions per loop. Hence the performance difference: 32/20 = 1.6, 520 / (76*4) ~ 1.7.

/*
 * Version 1: the loop body is manually unrolled 4x and the induction
 * variable steps by 4, so each iteration consumes d_p[idx .. idx+3].
 * Total work is therefore identical to test_kernel_2: all 128 ulong4
 * vectors are read and 512 mul_hi products are accumulated per work-item.
 */
__kernel void test_kernel_1(__global long* out, __global ulong4* const d_p, ulong seed)
{
    ulong acc = 0;
    /* Perturb the multiplier per work-item so results differ across the NDRange. */
    seed += get_global_id(0);

    for (int idx = 0; idx < 128; idx += 4) {
        ulong4 v = d_p[idx];
        acc += mul_hi(seed, v.x);
        acc += mul_hi(seed, v.y);
        acc += mul_hi(seed, v.z);
        acc += mul_hi(seed, v.w);

        v = d_p[idx + 1];
        acc += mul_hi(seed, v.x);
        acc += mul_hi(seed, v.y);
        acc += mul_hi(seed, v.z);
        acc += mul_hi(seed, v.w);

        v = d_p[idx + 2];
        acc += mul_hi(seed, v.x);
        acc += mul_hi(seed, v.y);
        acc += mul_hi(seed, v.z);
        acc += mul_hi(seed, v.w);

        v = d_p[idx + 3];
        acc += mul_hi(seed, v.x);
        acc += mul_hi(seed, v.y);
        acc += mul_hi(seed, v.z);
        acc += mul_hi(seed, v.w);
    }

    out[get_global_id(0)] = acc;
}

/*
 * Version 2: the straightforward rolled loop. One ulong4 load and four
 * mul_hi accumulations per iteration, 128 iterations — the same 512
 * mul_hi operations as test_kernel_1, just without manual unrolling.
 */
__kernel void test_kernel_2(__global long* out, __global ulong4* const d_p, ulong seed)
{
    ulong acc = 0;
    /* Perturb the multiplier per work-item so results differ across the NDRange. */
    seed += get_global_id(0);

    for (int idx = 0; idx < 128; idx++) {
        ulong4 v = d_p[idx];
        acc += mul_hi(seed, v.x);
        acc += mul_hi(seed, v.y);
        acc += mul_hi(seed, v.z);
        acc += mul_hi(seed, v.w);
    }

    out[get_global_id(0)] = acc;
}

eugnek,

Shouldn't the first kernel be taking more time, since it is doing more work inside the loop?