Archives Discussions

ryta1203 · ‎02-05-2009

I have two very simple kernels:

// step1: for each element, writes the sum of the four components of `a` to `c`
// and the sum of the four components of `b` to `d` (two scalar outputs).
kernel void step1(float4 a<>, float4 b<>, out float c<>, out float d<>)
{
c = a.x + a.y + a.z + a.w;
d = b.x + b.y + b.z + b.w;
}

// step2: same two sums as step1, but packed into one float4 output:
// e.x receives the component sum of `a`, e.y the component sum of `b`.
// e.z and e.w are not assigned in this kernel.
kernel void step2(float4 a<>, float4 b<>, out float4 e<>)
{
e.x = a.x + a.y + a.z + a.w;
e.y = b.x + b.y + b.z + b.w;
}

The size of all the streams is the same, let's say 2048. I iterate over the kernels 2048 times (just to get longer timing results for the GPU).

My question is this:

Why does the first kernel run significantly faster than the second kernel?

Looking at the KSA, the GPR count is lower for the 2nd kernel, and the ALU:Fetch ratio is 1.25 for the 2nd kernel and 2.5 for the 1st kernel. Since the GPR count is higher for the 1st kernel, fewer wavefronts are going to be in the run queue. The KSA says the throughput should be higher for the 2nd kernel, along with threads/clock (which is another reason the KSA "measurables" don't say much about performance).

My only guess would be that the higher ALU:Fetch ratio is allowing latency hiding across wavefronts in the GPU, is that an accurate statement?

MicahVillmow · ‎02-05-2009

Ryta,
That could be the case, but you might also be having issues with it doing two writes to e instead of one. Try copying the data to a float4 before writing to e and see if that improves performance. KSA is correct when it states that the second one should have higher throughput, but there might be other issues causing it to go slower. I would actually expect the step2 kernel to run close to full speed and step1 to run at 1/2-3/4 of step2 because it has two output writes compared to one.

ryta1203 · ‎02-05-2009

// step2 (temp variant): builds both sums in a local float4 and then writes
// the output `e` with a single whole-vector assignment.
// Only temp.x and temp.y are assigned; temp.z and temp.w are never set
// before the copy.
kernel void step2(float4 a<>, float4 b<>, out float4 e<>)
{
float4 temp;
temp.x = a.x + a.y + a.z + a.w;
temp.y = b.x + b.y + b.z + b.w;
// One store of the whole vector (statement terminator added; it was missing).
e = temp;
}

This did not increase the performance.

MicahVillmow · ‎02-05-2009

Can you post your IL kernels here for those two kernels so I can see what is really going on?

ryta1203 · ‎02-05-2009

il_ps_2_0
; Overview of this listing (two outputs, o0 and o1):
;   main    -> copies the input position into r281.xy, then calls func 38 and func 0
;   func 38 -> fetches one texel from resource 0 and one from resource 1 (via func 11),
;              calls func 37 to sum their components, widens each scalar sum to a
;              4-component register and leaves the results in r279 / r280
;   func 0  -> copies r279 / r280 to the outputs o0 / o1
; NOTE(review): swizzles such as `r269.y000` are read below as "y component moved
; into the x slot, 0 in the other slots" - confirm against the IL specification.
; Only literals l0 and l1 are referenced in this listing; l2..l11 are declared but unused here.
; l0 = (0.000000f 0.000000f 0.000000f 0.000000f)
dcl_literal l0, 0x00000000, 0x00000000, 0x00000000, 0x00000000
; l1 = 0x00000001 in every component (integer 1; as a float this denormal prints as 0.000000f)
dcl_literal l1, 0x00000001, 0x00000001, 0x00000001, 0x00000001
; l2 = (-1.#QNAN0f -1.#QNAN0f -1.#QNAN0f -1.#QNAN0f)
dcl_literal l2, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
; l3 = (1.#QNAN0f 1.#QNAN0f 1.#QNAN0f 1.#QNAN0f)
dcl_literal l3, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
; l4 = (1.#INF00f 1.#INF00f 1.#INF00f 1.#INF00f)
dcl_literal l4, 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000
; l5 = 0x80000000 in every component (sign bit only; -0.0f as a float)
dcl_literal l5, 0x80000000, 0x80000000, 0x80000000, 0x80000000
; l6 = (0.301030f 0.301030f 0.301030f 0.301030f)
dcl_literal l6, 0x3E9A209B, 0x3E9A209B, 0x3E9A209B, 0x3E9A209B
; l7 = (0.693147f 0.693147f 0.693147f 0.693147f)
dcl_literal l7, 0x3F317218, 0x3F317218, 0x3F317218, 0x3F317218
; l8 = (3.141593f 3.141593f 3.141593f 3.141593f)
dcl_literal l8, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
; l9 = (1.570796f 1.570796f 1.570796f 1.570796f)
dcl_literal l9, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB
; l10 = 0x00000003 in every component (integer 3; prints as 0.000000f when read as a float)
dcl_literal l10, 0x00000003, 0x00000003, 0x00000003, 0x00000003
; l11 = 0x00000002 in every component (integer 2; prints as 0.000000f when read as a float)
dcl_literal l11, 0x00000002, 0x00000002, 0x00000002, 0x00000002
; Two output registers are declared.
dcl_output_generic o0
dcl_output_generic o1
; Two 2D, unnormalized-coordinate resources with float x/y/z/w channels.
dcl_resource_id(0)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)
dcl_resource_id(1)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)
dcl_input_position_interp(linear_noperspective) v0.xy__
; main: stash the input position in r281.xy, compute results (func 38), write outputs (func 0).
mov r281.xy__, v0
call 38
call 0
endmain
; func 0: copy the computed results to the declared outputs.
func 0
mov o0, r279
mov o1, r280
ret
; func 11: fetch helper.
;   in:  r62.x = resource selector (compared against l0.x and l1.x), r63.xy = coordinate
;   out: r61   = sampled value (via r64)
; Both comparisons are always evaluated, one after the other; r64 is written only
; when the selector matches one of them.
func 11
ieq r0.x___, r62.x000, l0.x000
if_logicalnz r0.x000
sample_resource(0)_sampler(0) r64, r63.xy00
endif
ieq r0.x___, r62.x000, l1.x000
if_logicalnz r0.x000
sample_resource(1)_sampler(0) r64, r63.xy00
endif
mov r61, r64
ret_dyn
ret
; func 37: component sums.
;   in:  r269, r270
;   out: r271.x = r269.x + r269.y + r269.z + r269.w
;        r272.x = r270.x + r270.y + r270.z + r270.w
; Each sum is three chained adds; every operand is first moved into the .x slot of a temporary.
func 37
mov r294.x___, r269.y000
add r273.x___, r269.x000, r294.x000
mov r295.x___, r269.z000
add r274.x___, r273.x000, r295.x000
mov r296.x___, r269.w000
add r275.x___, r274.x000, r296.x000
mov r271.x___, r275.x000
mov r297.x___, r270.y000
add r276.x___, r270.x000, r297.x000
mov r298.x___, r270.z000
add r277.x___, r276.x000, r298.x000
mov r299.x___, r270.w000
add r278.x___, r277.x000, r299.x000
mov r272.x___, r278.x000
ret
; func 38: kernel body wrapper.
func 38
; Fetch from resource 0 (selector = l0.x) at the position held in r281.xy -> r284.
mov r288.xy__, r281.xy00
mov r62.x___, l0.x000
mov r63.xy__, r288.xy00
call 11
mov r300, r61
mov r284, r300
; Fetch from resource 1 (selector = l1.x) at the same position -> r285.
mov r289.xy__, r281.xy00
mov r62.x___, l1.x000
mov r63.xy__, r289.xy00
call 11
mov r301, r61
mov r285, r301
; Pass both fetched values to func 37 and collect the two scalar sums.
mov r269, r284
mov r270, r285
call 37
mov r286.x___, r271.x000
mov r287.x___, r272.x000
; Widen the first sum to four components: x = sum, y/z/w taken from l0 (zero).
mov r290.x___, r286.x000
mov r290._y__, l0.0x00
mov r290.__z_, l0.00x0
mov r290.___w, l0.000x
mov r282, r290
; Widen the second sum the same way.
mov r291.x___, r287.x000
mov r291._y__, l0.0x00
mov r291.__z_, l0.00x0
mov r291.___w, l0.000x
mov r283, r291
; Leave the results where func 0 reads them.
mov r279, r282
mov r280, r283
ret_dyn
ret
end

il_ps_2_0
; Overview of this listing (single output, o0):
;   main    -> copies the input position into r280.xy, then calls func 38 and func 0
;   func 38 -> fetches one texel from resource 0 and one from resource 1 (via func 11),
;              calls func 37 to sum their components, and leaves the result in r279
;   func 0  -> copies r279 to the output o0
; NOTE(review): swizzles such as `r269.y000` are read below as "y component moved
; into the x slot, 0 in the other slots" - confirm against the IL specification.
; Only literals l0 and l1 are referenced in this listing; l2..l11 are declared but unused here.
; l0 = (0.000000f 0.000000f 0.000000f 0.000000f)
dcl_literal l0, 0x00000000, 0x00000000, 0x00000000, 0x00000000
; l1 = 0x00000001 in every component (integer 1; as a float this denormal prints as 0.000000f)
dcl_literal l1, 0x00000001, 0x00000001, 0x00000001, 0x00000001
; l2 = (-1.#QNAN0f -1.#QNAN0f -1.#QNAN0f -1.#QNAN0f)
dcl_literal l2, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
; l3 = (1.#QNAN0f 1.#QNAN0f 1.#QNAN0f 1.#QNAN0f)
dcl_literal l3, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
; l4 = (1.#INF00f 1.#INF00f 1.#INF00f 1.#INF00f)
dcl_literal l4, 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000
; l5 = 0x80000000 in every component (sign bit only; -0.0f as a float)
dcl_literal l5, 0x80000000, 0x80000000, 0x80000000, 0x80000000
; l6 = (0.301030f 0.301030f 0.301030f 0.301030f)
dcl_literal l6, 0x3E9A209B, 0x3E9A209B, 0x3E9A209B, 0x3E9A209B
; l7 = (0.693147f 0.693147f 0.693147f 0.693147f)
dcl_literal l7, 0x3F317218, 0x3F317218, 0x3F317218, 0x3F317218
; l8 = (3.141593f 3.141593f 3.141593f 3.141593f)
dcl_literal l8, 0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB
; l9 = (1.570796f 1.570796f 1.570796f 1.570796f)
dcl_literal l9, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB
; l10 = 0x00000003 in every component (integer 3; prints as 0.000000f when read as a float)
dcl_literal l10, 0x00000003, 0x00000003, 0x00000003, 0x00000003
; l11 = 0x00000002 in every component (integer 2; prints as 0.000000f when read as a float)
dcl_literal l11, 0x00000002, 0x00000002, 0x00000002, 0x00000002
; A single output register is declared.
dcl_output_generic o0
; Two 2D, unnormalized-coordinate resources with float x/y/z/w channels.
dcl_resource_id(0)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)
dcl_resource_id(1)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)
dcl_input_position_interp(linear_noperspective) v0.xy__
; main: stash the input position in r280.xy, compute the result (func 38), write the output (func 0).
mov r280.xy__, v0
call 38
call 0
endmain
; func 0: copy the computed result to the declared output.
func 0
mov o0, r279
ret
; func 11: fetch helper.
;   in:  r62.x = resource selector (compared against l0.x and l1.x), r63.xy = coordinate
;   out: r61   = sampled value (via r64)
; Both comparisons are always evaluated, one after the other; r64 is written only
; when the selector matches one of them.
func 11
ieq r0.x___, r62.x000, l0.x000
if_logicalnz r0.x000
sample_resource(0)_sampler(0) r64, r63.xy00
endif
ieq r0.x___, r62.x000, l1.x000
if_logicalnz r0.x000
sample_resource(1)_sampler(0) r64, r63.xy00
endif
mov r61, r64
ret_dyn
ret
; func 37: component sums packed into one register.
;   in:  r269, r270
;   out: r271 = copy of r272, where
;        r272.x = r269.x + r269.y + r269.z + r269.w
;        r272.y = r270.x + r270.y + r270.z + r270.w
; r272.z and r272.w are not written anywhere in this listing.
func 37
mov r288.x___, r269.y000
add r273.x___, r269.x000, r288.x000
mov r289.x___, r269.z000
add r274.x___, r273.x000, r289.x000
mov r290.x___, r269.w000
add r275.x___, r274.x000, r290.x000
mov r272.x___, r275.x000
mov r291.x___, r270.y000
add r276.x___, r270.x000, r291.x000
mov r292.x___, r270.z000
add r277.x___, r276.x000, r292.x000
mov r293.x___, r270.w000
add r278.x___, r277.x000, r293.x000
mov r272._y__, r278.0x00
mov r271, r272
ret
; func 38: kernel body wrapper.
func 38
; Fetch from resource 0 (selector = l0.x) at the position held in r280.xy -> r282.
mov r285.xy__, r280.xy00
mov r62.x___, l0.x000
mov r63.xy__, r285.xy00
call 11
mov r294, r61
mov r282, r294
; Fetch from resource 1 (selector = l1.x) at the same position -> r283.
mov r286.xy__, r280.xy00
mov r62.x___, l1.x000
mov r63.xy__, r286.xy00
call 11
mov r295, r61
mov r283, r295
; Pass both fetched values to func 37, then forward its 4-component result to r279,
; which func 0 reads.
mov r269, r282
mov r270, r283
call 37
mov r284, r271
mov r281, r284
mov r279, r281
ret_dyn
ret
end

respectively.

MicahVillmow · ‎02-05-2009

If you look at the resulting ISA, you can see that there is a BURSTCNT(1) on the first kernel. This allows the kernel to push the data out to the memory at a higher rate than a single write which the second kernel does. This might be the reason why.

ryta1203 · ‎02-05-2009

Micah,

So why does the first kernel have this and the second one does not?

MicahVillmow · ‎02-05-2009

The first kernel has two outputs, and the compiler is combining them into a single write of twice the length, but the second one has only a single write, so it is impossible to burst. Also, the extra ALU work might be helping by allowing more of the latency to be hidden.

ryta1203 · ‎02-06-2009

Micah,

Ok, so I now have these two kernels:

// step1: for each element, writes the sum of the four components of `a` to `c`
// and the sum of the four components of `b` to `d` (two scalar outputs).
kernel void step1(float4 a<>, float4 b<>, out float c<>, out float d<>)
{
c = a.x + a.y + a.z + a.w;
d = b.x + b.y + b.z + b.w;
}

// step2: same two sums as step1, but each is written to the .x component of its
// own float4 output (out1.x from `a`, out2.x from `b`). The .y/.z/.w components
// of both outputs are not assigned. The commented-out lines are the remains of a
// variant that went through a local `temp`.
kernel void step2(float4 a<>, float4 b<>, out float4 out1<>, out float4 out2<>)
{
//float4 temp;
out1.x = a.x + a.y + a.z + a.w;
out2.x = b.x + b.y + b.z + b.w;
//out1 = temp;
//out2 = temp;
}

I have tried step2 both ways, using temp as an intermediate and not (shown above). They both have a Burst Count of 1 according to the ISA in KSA. They both have exactly the same attributes according to the KSA: same ALU:Fetch, same GPR, same Avg Cycles, Threads/Clock, Throughput, CF, ALU, TEX, etc.

The 1st kernel runs almost twice as fast (6.5s) as the second kernel (11.5s). The ISAs have the same cycle count.

My only guess is that this is a memory issue, and I'm trying a little harder to understand the memory model in these GPUs.

ryta1203 · ‎02-09-2009

So writing out to a float is faster than writing out to a float4?

This seems to contradict the AMD docs. Anyone have any idea about this?

MicahVillmow · ‎02-09-2009

Ryta,
Writing out the float should be faster overall because less data is being written to memory. However, on a per-element basis, the second kernel writes out data faster: it writes 4x the amount of data but takes only about twice as long, so it is effectively writing twice as fast. This drastically oversimplifies it, but explains why you are seeing the discrepancies.

ryta1203 · ‎02-09-2009

Micah,

Thanks again. This helps. In case anyone is wondering or reading this thread, I ran these two kernels:

// step1 (eight scalar outputs): for each element, writes eight signed
// combinations of the components of `a` and `b`, one per float output.
// Odd-numbered outputs (f1, f3, f5, f7) use `a`; even-numbered ones use `b`
// with the same sign pattern as the preceding `a` output.
kernel void step1(float4 a<>, float4 b<>, out float f1<>, out float f2<>, out float f3<>, out float f4<>,
out float f5<>, out float f6<>, out float f7<>, out float f8<>)
{
f1 = a.x + a.y + a.z + a.w;
f2 = b.x + b.y + b.z + b.w;
f3 = a.x - a.y - a.z - a.w;
f4 = b.x - b.y - b.z - b.w;
f5 = a.x + a.y - a.z - a.w;
f6 = b.x + b.y - b.z - b.w;
f7 = a.x - a.y + a.z + a.w;
f8 = b.x - b.y + b.z + b.w;
}

// step2 (two float4 outputs declared).
// NOTE: as written, every component of out1 is assigned twice, so the first four
// assignments are overwritten by the last four, and out2 is never assigned.
// NOTE(review): the second out1.y expression repeats the first (+ + + +) instead
// of following the (+ + - -) pattern of the out1.x line just above it - possibly
// unintended.
kernel void step2(float4 a<>, float4 b<>, out float4 out1<>, out float4 out2<>)
{
out1.x = a.x + a.y + a.z + a.w;
out1.y = b.x + b.y + b.z + b.w;
out1.z = a.x - a.y - a.z - a.w;
out1.w = b.x - b.y - b.z - b.w;
out1.x = a.x + a.y - a.z - a.w;
out1.y = b.x + b.y + b.z + b.w;
out1.z = a.x - a.y + a.z + a.w;
out1.w = b.x - b.y + b.z + b.w;
}

Both write out the same equations to EIGHT outputs. The 2nd kernel was much faster, by a ratio of 20s:11s (1st:2nd). I did not include any streamWrites in my timing.

EDIT: Although, I think some of this could be attributed to the number of wavefronts in the run queue, since the 1st kernel GPR is 17 and the second is 4. IS THAT AN ACCURATE STATEMENT?

ryta1203 · ‎02-09-2009

Sorry, the 2nd kernel should be:

// step2 (two float4 outputs): for each element, writes eight signed combinations
// of the components of `a` and `b` into the eight components of out1 and out2.
// .x/.z components use `a`; .y/.w components use `b` with the same sign pattern
// as the `a` line directly above them.
kernel void step2(float4 a<>, float4 b<>, out float4 out1<>, out float4 out2<>)
{
out1.x = a.x + a.y + a.z + a.w;
out1.y = b.x + b.y + b.z + b.w;
out1.z = a.x - a.y - a.z - a.w;
out1.w = b.x - b.y - b.z - b.w;
out2.x = a.x + a.y - a.z - a.w;
// Sign pattern (+ + - -) matches out2.x; previously this line duplicated out1.y.
out2.y = b.x + b.y - b.z - b.w;
out2.z = a.x - a.y + a.z + a.w;
out2.w = b.x - b.y + b.z + b.w;
}

In my first post, all the assignments were to out1; this was a typo, my apologies.

Archives Discussions

Another Performance Question