cancel
Showing results for 
Search instead for 
Did you mean: 

Archives Discussions

Raistmer
Adept II

Brook+ to IL translation - why such big differences?

small modification in Brook+ kernel and big changes in IL code

2 kernels differ only by one additional float4 value load and one add operation.
But their IL translations differ much more.
Why were so many operations added?
Code:

Kernel1: kernel void GPU_coadd_kernel6_size4_5_o2(float4 src[][],out float4 dest[][]) {//R: size in function name is size of input array //R:will perform last coadd level int threadID=instance().y; float4 i1; float4 o11; i1=src[threadID][0]; o11.xy=i1.xz+i1.yw; dest[threadID][0]=o11; } kernel2: kernel void GPU_coadd_kernel6_size6_7_o3(float4 src[][],out float4 dest[][]) {//R: size in function name is size of input array //R:will perform last coadd level int threadID=instance().y; float4 i1,i2; float4 o11; i1=src[threadID][0]; i2=src[threadID][1]; o11.xy=i1.xz+i1.yw; o11.z=i2.x+i2.y; dest[threadID][0]=o11; } IL for kernel 1: const char __GPU_coadd_kernel6_size4_5_o2_cal_desc_tech0_pass0[] = "il_ps_2_0\n" "dcl_literal l0,0x00000000,0x00000000,0x00000000,0x00000000\n" "dcl_literal l1,0x00000001,0x00000001,0x00000001,0x00000001\n" "dcl_literal l2,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF\n" "dcl_literal l3,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF\n" "dcl_literal l4,0x7F800000,0x7F800000,0x7F800000,0x7F800000\n" "dcl_literal l5,0x80000000,0x80000000,0x80000000,0x80000000\n" "dcl_literal l6,0x3E9A209B,0x3E9A209B,0x3E9A209B,0x3E9A209B\n" "dcl_literal l7,0x3F317218,0x3F317218,0x3F317218,0x3F317218\n" "dcl_literal l8,0x40490FDB,0x40490FDB,0x40490FDB,0x40490FDB\n" "dcl_literal l9,0x3FC90FDB,0x3FC90FDB,0x3FC90FDB,0x3FC90FDB\n" "dcl_literal l10,0x00000003,0x00000003,0x00000003,0x00000003\n" "dcl_literal l11,0x00000002,0x00000002,0x00000002,0x00000002\n" ";global (g) declared, size = 4096\n" "dcl_literal l12,0x00000000,0x00000000,0x00000000,0x00000000\n" "dcl_literal l13,0x00000001,0x00000001,0x00000001,0x00000001\n" "dcl_literal l14,0x00000000,0x3FE00000,0x00000000,0x3FE00000\n" "dcl_literal l15,0x3F800000,0x3F800000,0x3F800000,0x3F800000\n" "dcl_literal l16,0x00000002,0x00000002,0x00000002,0x00000002\n" "dcl_literal l17,0x00000002,0x00000002,0x00000002,0x00000002\n" "dcl_literal l18,0x00000003,0x00000003,0x00000003,0x00000003\n" "dcl_literal 
l19,0x00000001,0x00000001,0x00000001,0x00000001\n" "dcl_literal l20,0x00000004,0x00000004,0x00000004,0x00000004\n" "dcl_literal l21,0x00000003,0x00000003,0x00000003,0x00000003\n" "dcl_resource_id(0)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)\n" "dcl_input_position_interp(linear_noperspective) v0.xy__\n" "dcl_literal l22,0x3F000000,0x3F000000,0x3F000000,0x3F000000\n" "dcl_literal l23,0x3F000000,0x3F000000,0x3F000000,0x3F000000\n" "dcl_cb cb0[1]\n" "mov r454.xy__,v0.xyzw\n" "mov r269.x___,cb0[l0.x + 0].x000\n" "mov r270.xyz_,cb0[l0.x + 0].yzw0\n" "call 49 \n" "call 0 \n" "endmain\n" "\n" "func 0\n" "ret\n" "\n" "func 2\n" "ieq r0.x___,r17.x000,l0.x000\n" "if_logicalnz r0.x000\n" "sample_resource(0)_sampler(0) r19.xyzw,r18.xy00\n" "endif\n" "mov r16.x___,r19.x000\n" "ret_dyn\n" "ret\n" "\n" "func 11\n" "ieq r0.x___,r62.x000,l0.x000\n" "if_logicalnz r0.x000\n" "sample_resource(0)_sampler(0) r64.xyzw,r63.xy00\n" "endif\n" "mov r61.xyzw,r64.xyzw\n" "ret_dyn\n" "ret\n" "\n" "func 14\n" "ieq r0.x___,r80.x000,l0.x000\n" "if_logicalnz r0.x000\n" "sample_resource(0)_sampler(0) r82.xyzw,r81.xy00\n" "endif\n" "mov r83.xy__,r82.xy00\n" "mov r84.xy__,r82.xy00\n" "mov r79.xy__,r83.xy00\n" "ret_dyn\n" "ret\n" "\n" "func 20\n" "ieq r0.x___,r144.x000,l0.x000\n" "if_logicalnz r0.x000\n" "sample_resource(0)_sampler(0) r146.xyzw,r145.xy00\n" "endif\n" "mov r147.x___,r146.x000\n" "mov r143.x___,r147.x000\n" "ret_dyn\n" "ret\n" "\n" "func 36\n" "mov r458.x___,r261.z000\n" "imul r263.x___,r458.x000,r262.x000\n" "mov r459.x___,r262.y000\n" "imul r264.x___,r263.x000,r459.x000\n" "mov r460.x___,r261.y000\n" "imul r265.x___,r460.x000,r262.x000\n" "iadd r266.x___,r264.x000,r265.x000\n" "iadd r267.x___,r266.x000,r261.x000\n" "mov r260.x___,r267.x000\n" "ret_dyn\n" "ret\n" "\n" "func 46\n" "mov r461.x___,r366.y000\n" "mov r367.x___,r461.x000\n" "itof r462.x___,l12.x000\n" "mov r370.x___,r462.x000\n" "itof r463.x___,r367.x000\n" "mov r370._y__,r463.0x00\n" "mov 
r62.x___,r365.x000\n" "mov r63.xy__,r370.xy00\n" "call 11 \n" "mov r464.xyzw,r61.xyzw\n" "mov r368.xyzw,r464.xyzw\n" "mov r465.xy__,r368.xz00\n" "mov r466.xy__,r368.yw00\n" "add r371.xy__,r465.xy00,r466.xy00\n" "mov r369.xy__,r371.xy00\n" "mov r372.x___,l12.x000\n" "mov r372._y__,r367.0x00\n" "mov r467.xyzw,r372.xy00\n" "mov r261.xyzw,r467.xyzw\n" "mov r468.xyzw,r269.xxxx\n" "mov r262.xyzw,r468.xyzw\n" "call 36 \n" "mov r469.x___,r260.x000\n" "umul r373.x___,l19.x000,r469.x000\n" "iadd r373.x___,r373.x000,l0.x000\n" "mov g[r373.x + 0].xyzw,r369.xyzw\n" "ret\n" "\n" "func 49\n" "mov r456.x___,l22.x000\n" "mov r456._y__,l23.0x00\n" "sub r457.xy__,r454.xy00,r456.xy00\n" "mov r470.xyzw,r457.xy00\n" "mov r455.xyzw,r470.xyzw\n" "mov r365.x___,l0.x000\n" "ftoi r471.xyzw,r455.xyzw\n" "mov r366.xyzw,r471.xyzw\n" "call 46 \n" "ret\n" "\n" "end\n" ""; IL for kernel2: const char __GPU_coadd_kernel6_size6_7_o3_cal_desc_tech0_pass0[] = "il_ps_2_0\n" "dcl_literal l0,0x00000000,0x00000000,0x00000000,0x00000000\n" "dcl_literal l1,0x00000001,0x00000001,0x00000001,0x00000001\n" "dcl_literal l2,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF\n" "dcl_literal l3,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF\n" "dcl_literal l4,0x7F800000,0x7F800000,0x7F800000,0x7F800000\n" "dcl_literal l5,0x80000000,0x80000000,0x80000000,0x80000000\n" "dcl_literal l6,0x3E9A209B,0x3E9A209B,0x3E9A209B,0x3E9A209B\n" "dcl_literal l7,0x3F317218,0x3F317218,0x3F317218,0x3F317218\n" "dcl_literal l8,0x40490FDB,0x40490FDB,0x40490FDB,0x40490FDB\n" "dcl_literal l9,0x3FC90FDB,0x3FC90FDB,0x3FC90FDB,0x3FC90FDB\n" "dcl_literal l10,0x00000003,0x00000003,0x00000003,0x00000003\n" "dcl_literal l11,0x00000002,0x00000002,0x00000002,0x00000002\n" ";global (g) declared, size = 4096\n" "dcl_literal l12,0x00000000,0x00000000,0x00000000,0x00000000\n" "dcl_literal l13,0x00000001,0x00000001,0x00000001,0x00000001\n" "dcl_literal l14,0x00000000,0x3FE00000,0x00000000,0x3FE00000\n" "dcl_literal 
l15,0x3F800000,0x3F800000,0x3F800000,0x3F800000\n" "dcl_literal l16,0x00000002,0x00000002,0x00000002,0x00000002\n" "dcl_literal l17,0x00000003,0x00000003,0x00000003,0x00000003\n" "dcl_literal l18,0x00000004,0x00000004,0x00000004,0x00000004\n" "dcl_resource_id(0)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)\n" "dcl_input_position_interp(linear_noperspective) v0.xy__\n" "dcl_literal l19,0x3F000000,0x3F000000,0x3F000000,0x3F000000\n" "dcl_literal l20,0x3F000000,0x3F000000,0x3F000000,0x3F000000\n" "dcl_cb cb0[1]\n" "mov r457.xy__,v0.xyzw\n" "mov r269.x___,cb0[l0.x + 0].x000\n" "mov r270.xyz_,cb0[l0.x + 0].yzw0\n" "call 49 \n" "call 0 \n" "endmain\n" "\n" "func 0\n" "ret\n" "\n" "func 2\n" "ieq r0.x___,r17.x000,l0.x000\n" "if_logicalnz r0.x000\n" "sample_resource(0)_sampler(0) r19.xyzw,r18.xy00\n" "endif\n" "mov r16.x___,r19.x000\n" "ret_dyn\n" "ret\n" "\n" "func 11\n" "ieq r0.x___,r62.x000,l0.x000\n" "if_logicalnz r0.x000\n" "sample_resource(0)_sampler(0) r64.xyzw,r63.xy00\n" "endif\n" "mov r61.xyzw,r64.xyzw\n" "ret_dyn\n" "ret\n" "\n" "func 14\n" "ieq r0.x___,r80.x000,l0.x000\n" "if_logicalnz r0.x000\n" "sample_resource(0)_sampler(0) r82.xyzw,r81.xy00\n" "endif\n" "mov r83.xy__,r82.xy00\n" "mov r84.xy__,r82.xy00\n" "mov r79.xy__,r83.xy00\n" "ret_dyn\n" "ret\n" "\n" "func 20\n" "ieq r0.x___,r144.x000,l0.x000\n" "if_logicalnz r0.x000\n" "sample_resource(0)_sampler(0) r146.xyzw,r145.xy00\n" "endif\n" "mov r147.x___,r146.x000\n" "mov r143.x___,r147.x000\n" "ret_dyn\n" "ret\n" "\n" "func 36\n" "mov r461.x___,r261.z000\n" "imul r263.x___,r461.x000,r262.x000\n" "mov r462.x___,r262.y000\n" "imul r264.x___,r263.x000,r462.x000\n" "mov r463.x___,r261.y000\n" "imul r265.x___,r463.x000,r262.x000\n" "iadd r266.x___,r264.x000,r265.x000\n" "iadd r267.x___,r266.x000,r261.x000\n" "mov r260.x___,r267.x000\n" "ret_dyn\n" "ret\n" "\n" "func 46\n" "mov r464.x___,r366.y000\n" "mov r367.x___,r464.x000\n" "itof r465.x___,l12.x000\n" "mov r371.x___,r465.x000\n" "itof 
r466.x___,r367.x000\n" "mov r371._y__,r466.0x00\n" "mov r62.x___,r365.x000\n" "mov r63.xy__,r371.xy00\n" "call 11 \n" "mov r467.xyzw,r61.xyzw\n" "mov r369.xyzw,r467.xyzw\n" "itof r468.x___,l13.x000\n" "mov r372.x___,r468.x000\n" "itof r469.x___,r367.x000\n" "mov r372._y__,r469.0x00\n" "mov r62.x___,r365.x000\n" "mov r63.xy__,r372.xy00\n" "call 11 \n" "mov r470.xyzw,r61.xyzw\n" "mov r368.xyzw,r470.xyzw\n" "mov r471.xy__,r369.xz00\n" "mov r472.xy__,r369.yw00\n" "add r373.xy__,r471.xy00,r472.xy00\n" "mov r370.xy__,r373.xy00\n" "mov r473.x___,r368.y000\n" "add r374.x___,r368.x000,r473.x000\n" "mov r370.__z_,r374.00x0\n" "mov r375.x___,l12.x000\n" "mov r375._y__,r367.0x00\n" "mov r474.xyzw,r375.xy00\n" "mov r261.xyzw,r474.xyzw\n" "mov r475.xyzw,r269.xxxx\n" "mov r262.xyzw,r475.xyzw\n" "call 36 \n" "mov r476.x___,r260.x000\n" "umul r376.x___,l13.x000,r476.x000\n" "iadd r376.x___,r376.x000,l0.x000\n" "mov g[r376.x + 0].xyzw,r370.xyzw\n" "ret\n" "\n" "func 49\n" "mov r459.x___,l19.x000\n" "mov r459._y__,l20.0x00\n" "sub r460.xy__,r457.xy00,r459.xy00\n" "mov r477.xyzw,r460.xy00\n" "mov r458.xyzw,r477.xyzw\n" "mov r365.x___,l0.x000\n" "ftoi r478.xyzw,r458.xyzw\n" "mov r366.xyzw,r478.xyzw\n" "call 46 \n" "ret\n" "\n" "end\n" "";

0 Likes
6 Replies
the729
Journeyman III

Hi Raistmer,

I used to try to read the IL code generated by brook, and gave up in the end.

The shader compiler, which compiles IL to machine code, performs heavy optimization. So do not worry about the length of the Brook-generated IL, especially those useless mov instructions, which will be rewritten completely by the shader compiler.

Although Brook+ is sometimes unstable, sometimes buggy, and lacks useful features, trust the brcc-generated IL code, in which I personally have not found any bugs yet.

 

0 Likes

Originally posted by: the729

Hi Raistmer,




I used to try to read the IL code generated by brook, and gave up in the end.



 


Same here

0 Likes

Raistmer,
IL is an intermediate step and the length of that can vary dramatically by the time it gets executed on the hardware. A better judge of changes would be to look at the disassembled ISA either via calclDisassemble(image/object) or by pasting the shader in Stream Kernel Analyzer and setting the output format to ISA.
0 Likes

Originally posted by: MicahVillmow

Raistmer,

IL is an intermediate step and the length of that can vary dramatically by the time it gets executed on the hardware. A better judge of changes would be to look at the disassembled ISA either via calclDisassemble(image/object) or by pasting the shader in Stream Kernel Analyzer and setting the output format to ISA.


Thanks.
In the absence of good IL guides, I wanted to improve my IL knowledge by looking at generated IL code, but with such dramatic code changes it seems that was a bad approach.
0 Likes

Raistmer,
Your best bet is to study the hand-written IL in the samples, as it is cleaner than compiler-generated IL. The problem is that the SSA form used internally in the compiler generates a lot of unneeded IL, which gets optimized away by the lower-level compiler.
0 Likes

Originally posted by: MicahVillmow

Raistmer,

Your best bet is to study the hand-written IL in the samples, as it is cleaner than compiler-generated IL. The problem is that the SSA form used internally in the compiler generates a lot of unneeded IL, which gets optimized away by the lower-level compiler.


Thanks, will look that way.
0 Likes