zpdixon

Measuring HD 4850 performance

Discussion created by zpdixon on Jan 15, 2009
Latest reply on Jan 16, 2009 by zpdixon

I wrote the following IL kernel to benchmark the MAD instruction on my HD 4850. It's a simple loop of 0x20000 iterations over 120 MAD instructions working on registers only. When disassembled to R700 asm, I can see it is translated to 480 MULADD instructions using the 5 SPUs (X, Y, Z, W, T). Anyway, even assuming the T SPU is not used, it should be capable of executing at least 1 MAD (4 MULADD) per clock, right? The HD 4850 is clocked at 625 MHz, so the loop should execute in at most 120*0x20000/625e6 = 0.025 sec. However, on my system I measure almost 10 times that number: 0.220 sec. I am using the SDK 1.3-beta on Linux x86-64. I have confirmed that I am measuring the time correctly; it is not a question of some fixed overhead, because if I execute 10 times more instructions, the kernel takes exactly 10 times longer to complete (2.2 sec). What could be the reason for achieving only 1/10th of the theoretical performance of the HD 4850?

 

; MAD throughput benchmark kernel.
;
; Runs a counted loop whose body is 120 register-only MAD instructions
; (40 repetitions of one MAD each on r2, r3 and r4), then folds the three
; result registers into the output.
;
; Register roles:
;   r0.x       loop counter, starts at l0.y and is decremented to zero
;   r1         running total that is written to o0 at the end
;   r2,r3,r4   MAD operands/results; each MAD reads and writes one register
;   l0         literal pool: x = 0x0, y = 0x20000 (iteration count),
;              z = 0xffffffff (added to the counter as -1), w = 0x0
il_ps
dcl_output o0
dcl_literal l0, 0x0, 0x20000, 0xffffffff, 0x0

mov r0.x, l0.y ; counter = 0x20000 iterations
mov r1.x, l0.x ; total = 0 (only the x component is written here)

; XOR of a register with itself yields all-zero bits whatever it held
; before, so r2, r3 and r4 all start out as zero.
ixor r2, r2, r2
ixor r3, r3, r3
ixor r4, r4, r4

whileloop
; Leave the loop once the counter has reached zero.
; NOTE(review): r0 is used without a component selector although only
; r0.x is ever written -- confirm which component(s) this tests.
break_logicalz r0

; Loop body: 120 MADs. Every instruction uses the same register as
; destination and as all three sources, so each one depends on the previous
; MAD issued on that register; r2, r3 and r4 form three separate chains.
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4
mad r2, r2, r2, r2
mad r3, r3, r3, r3
mad r4, r4, r4, r4

iadd r0.x, r0.x, l0.z ; counter-- (l0.z = 0xffffffff, i.e. adds -1)
endloop

; Fold the three MAD result registers into the total and write it out.
; Presumably this keeps the loop's results live so the compiler cannot
; discard the MADs as dead code -- confirm against the disassembly.
; NOTE(review): only r1.x was initialized above, but these adds read and
; write all four components of r1 -- confirm r1.yzw are defined as intended.
iadd r1, r1, r2
iadd r1, r1, r3
iadd r1, r1, r4
mov o0, r1
end

 

Outcomes