Loop unrolling affects GPR in SKA 1.6

Discussion created by roddomi on Sep 1, 2010
Latest reply on Sep 19, 2010 by himanshu.gautam


I am using the Stream KernelAnalyzer 1.6 to profile a matrix multiplication IL shader. I am getting a suspiciously high GPR usage: 31 registers on HD5870.

Looking more closely, I noticed that the problem comes from the column-row multiplication loop being unrolled. The compiler does not reuse registers from one unrolled iteration to the next, even though it could, since the live ranges of those registers do not interfere. It seems other people have found a similar problem:

This problem can be confirmed by deleting some of the unrolled iterations and running the SKA again. I can get the shader down to 14 registers by doing this.

I am attaching the code in case it helps. I am running SDK 2.2 (SKA 1.6) on a 32-bit Windows Vista machine without any graphics card.

Thank you.


; -----------------------------------------------------------------------------
; AMD IL compute shader (il_cs_2_0), described in the accompanying post as a
; matrix multiplication kernel whose column-row multiplication loop has been
; unrolled. Posted as a reproduction case for high GPR usage in SKA 1.6.
;
; The instruction stream is unchanged; it is only laid out one instruction per
; line so it can be annotated. Comments describe what the instructions show.
; Notes marked NOTE(review) are assumptions to confirm against the IL spec.
;
; NOTE(review): operands are read here as "op dst, src0, src1[, src2]" - confirm.
; -----------------------------------------------------------------------------
il_cs_2_0
dcl_max_thread_per_group 512
dcl_raw_uav_id(0)                        ; raw UAV 0: used by every uav_raw_load/store below
dcl_arena_uav_id(1)                      ; declared but not referenced by any instruction below
dcl_lds_id(1) 32768                      ; LDS 1: used by every lds_store/lds_load below
dcl_cb cb0[2]                            ; declared but not referenced by any instruction below
dcl_cb cb1[5]                            ; cb1[0]..cb1[4] are all read below

; --- Literals (each value replicated in all four components) -----------------
dcl_literal l2, 0, 0, 0, 0               ; initial accumulator value (same value as l3)
dcl_literal l4, 1024, 1024, 1024, 1024   ; base offset added to the second set of LDS addresses
; Offsets 960 down to 64 in steps of 64: added to r28 in the unrolled steps.
dcl_literal l35, 960, 960, 960, 960
dcl_literal l33, 896, 896, 896, 896
dcl_literal l31, 832, 832, 832, 832
dcl_literal l29, 768, 768, 768, 768
dcl_literal l27, 704, 704, 704, 704
dcl_literal l25, 640, 640, 640, 640
dcl_literal l23, 576, 576, 576, 576
dcl_literal l21, 512, 512, 512, 512
dcl_literal l19, 448, 448, 448, 448
dcl_literal l17, 384, 384, 384, 384
dcl_literal l15, 320, 320, 320, 320
dcl_literal l13, 256, 256, 256, 256
dcl_literal l12, 192, 192, 192, 192
dcl_literal l10, 128, 128, 128, 128
dcl_literal l8, 64, 64, 64, 64           ; also used as a multiplier and as the r50 loop stride
; Offsets 60 down to 4 in steps of 4: added to r30 in the unrolled steps
; (l6 = 31 and l5 = 15 are interleaved here but serve other purposes).
dcl_literal l34, 60, 60, 60, 60
dcl_literal l32, 56, 56, 56, 56
dcl_literal l30, 52, 52, 52, 52
dcl_literal l28, 48, 48, 48, 48
dcl_literal l26, 44, 44, 44, 44
dcl_literal l24, 40, 40, 40, 40
dcl_literal l22, 36, 36, 36, 36
dcl_literal l20, 32, 32, 32, 32
dcl_literal l6, 31, 31, 31, 31           ; shift count for the ishr that produces r16
dcl_literal l18, 28, 28, 28, 28
dcl_literal l16, 24, 24, 24, 24
dcl_literal l14, 20, 20, 20, 20
dcl_literal l0, 16, 16, 16, 16           ; multiplier 16, also the r26 loop stride and an r30 offset
dcl_literal l5, 15, 15, 15, 15           ; rounding bias / mask in the r15..r20 sequence
dcl_literal l11, 12, 12, 12, 12
dcl_literal l9, 8, 8, 8, 8
dcl_literal l7, 4, 4, 4, 4               ; multiplier 4 (index -> offset), shift count for r20, r30 offset
dcl_literal l3, 0, 0, 0, 0               ; base offset added to the first set of LDS addresses
dcl_literal l1, -1, -1, -1, -1

; --- Setup ------------------------------------------------------------------
mov r0, vThreadGrpId.x
imul r1, r0, l0                          ; r1 = group.x * 16
mov r2, vThreadGrpId.y
mov r3, cb1[3]                           ; NOTE(review): presumably a matrix dimension - confirm with host code
imul r4, r2, r3
imul r5, r4, l0                          ; r5 = group.y * cb1[3] * 16
iadd r6, r5, r3
iadd r7, r6, l1                          ; r7 = r5 + cb1[3] - 1
mov r8, vTidInGrp.x
mov r9, vTidInGrp.y
mov r10, cb1[4]                          ; NOTE(review): presumably a matrix dimension - confirm with host code
ilt r11, r7, r5                          ; r11 = (r7 < r5)
mov r12, l2                              ; accumulator starts at 0

; Main path: entered when r11 is zero, i.e. when NOT (r7 < r5).
if_logicalz r11
    mov r13, l3                          ; r13 = 0
    mov r14, l4                          ; r14 = 1024

    ; r20 = (r15 + ((r15 >> 31) & 15)) >> 4 with r15 = cb1[3] + 15.
    ; NOTE(review): looks like a signed divide-by-16 of (cb1[3] + 15); r20 is only
    ; copied to r51 below, and r51 is never read in this listing.
    iadd r15, r3, l5
    ishr r16, r15, l6
    mov r17, l5
    iand r18, r16, r17
    iadd r19, r18, r15
    ishr r20, r19, l7

    imul r21, r9, r10                    ; r21 = tid.y * cb1[4]
    imul r22, r9, r3                     ; r22 = tid.y * cb1[3]
    mov r23, r8
    mov r24, r9
    iadd r25, r22, r5
    iadd r26, r8, r25                    ; r26 = tid.x + tid.y * cb1[3] + r5 (advanced by 16 per loop pass)

    ; LDS read addresses used by the unrolled multiply-add sequence.
    umul r27, r23, l7
    iadd r28, r14, r27                   ; r28 = 1024 + tid.x * 4
    umul r29, r24, l8
    iadd r30, r13, r29                   ; r30 = 0 + tid.y * 64

    ; LDS write addresses for the two values fetched per loop pass.
    umul r31, r24, l0
    iadd r32, r23, r31                   ; r32 = tid.x + tid.y * 16
    umul r33, r32, l7                    ; r33 = r32 * 4
    iadd r34, r22, r7
    imul r35, r10, l0
    mov r36, r35
    umul r37, r36, l7                    ; r37 = cb1[4] * 16 * 4: per-pass increment of r46
    iadd r38, r33, r13                   ; r38 = r33 + 0
    iadd r39, r33, r14                   ; r39 = r33 + 1024
    iadd r40, r34, r8                    ; r40 = tid.y * cb1[3] + r7 + tid.x: loop bound compared with r26

    ; UAV read addresses.
    mov r41, cb1[2]                      ; NOTE(review): presumably a base offset of one input - confirm
    iadd r42, r21, r1
    iadd r43, r8, r42
    mov r44, r43
    umul r45, r44, l7
    iadd r46, r41, r45                   ; r46 = cb1[2] + (tid.x + tid.y * cb1[4] + group.x * 16) * 4
    mov r47, cb1[1]                      ; NOTE(review): presumably a base offset of the other input - confirm
    mov r48, r26
    umul r49, r48, l7
    iadd r50, r47, r49                   ; r50 = cb1[1] + r26 * 4
    mov r51, r20                         ; r51 is not read anywhere in this listing

    whileloop
        ; Stage one value from each UAV address into LDS, then fence.
        uav_raw_load_id(0) r52, r50
        lds_store_id(1) r38, r52.x
        uav_raw_load_id(0) r53, r46
        lds_store_id(1) r39, r53.x
        fence_threads_lds

        ; Unrolled sequence: 16 multiply-adds into r12. Each step loads .x from an
        ; r30-based address (+4 per step) and an r28-based address (+64 per step).
        ; Every step writes fresh temporaries (r54..r85, r1000..r1029).
        ; NOTE(review): only .x of each loaded register is written, while mad is
        ; issued without a write mask or swizzle - confirm .yzw are don't-care.
        lds_load_id(1) r54.x, r30
        lds_load_id(1) r55.x, r28
        mad r12, r54, r55, r12           ; step 0: offsets +0 / +0
        iadd r1000, r30, l7
        lds_load_id(1) r56.x, r1000
        iadd r1001, r28, l8
        lds_load_id(1) r57.x, r1001
        mad r12, r56, r57, r12           ; step 1: offsets +4 / +64
        iadd r1002, r30, l9
        lds_load_id(1) r58.x, r1002
        iadd r1003, r28, l10
        lds_load_id(1) r59.x, r1003
        mad r12, r58, r59, r12           ; step 2: offsets +8 / +128
        iadd r1004, r30, l11
        lds_load_id(1) r60.x, r1004
        iadd r1005, r28, l12
        lds_load_id(1) r61.x, r1005
        mad r12, r60, r61, r12           ; step 3: offsets +12 / +192
        iadd r1006, r30, l0
        lds_load_id(1) r62.x, r1006
        iadd r1007, r28, l13
        lds_load_id(1) r63.x, r1007
        mad r12, r62, r63, r12           ; step 4: offsets +16 / +256
        iadd r1008, r30, l14
        lds_load_id(1) r64.x, r1008
        iadd r1009, r28, l15
        lds_load_id(1) r65.x, r1009
        mad r12, r64, r65, r12           ; step 5: offsets +20 / +320
        iadd r1010, r30, l16
        lds_load_id(1) r66.x, r1010
        iadd r1011, r28, l17
        lds_load_id(1) r67.x, r1011
        mad r12, r66, r67, r12           ; step 6: offsets +24 / +384
        iadd r1012, r30, l18
        lds_load_id(1) r68.x, r1012
        iadd r1013, r28, l19
        lds_load_id(1) r69.x, r1013
        mad r12, r68, r69, r12           ; step 7: offsets +28 / +448
        iadd r1014, r30, l20
        lds_load_id(1) r70.x, r1014
        iadd r1015, r28, l21
        lds_load_id(1) r71.x, r1015
        mad r12, r70, r71, r12           ; step 8: offsets +32 / +512
        iadd r1016, r30, l22
        lds_load_id(1) r72.x, r1016
        iadd r1017, r28, l23
        lds_load_id(1) r73.x, r1017
        mad r12, r72, r73, r12           ; step 9: offsets +36 / +576
        iadd r1018, r30, l24
        lds_load_id(1) r74.x, r1018
        iadd r1019, r28, l25
        lds_load_id(1) r75.x, r1019
        mad r12, r74, r75, r12           ; step 10: offsets +40 / +640
        iadd r1020, r30, l26
        lds_load_id(1) r76.x, r1020
        iadd r1021, r28, l27
        lds_load_id(1) r77.x, r1021
        mad r12, r76, r77, r12           ; step 11: offsets +44 / +704
        iadd r1022, r30, l28
        lds_load_id(1) r78.x, r1022
        iadd r1023, r28, l29
        lds_load_id(1) r79.x, r1023
        mad r12, r78, r79, r12           ; step 12: offsets +48 / +768
        iadd r1024, r30, l30
        lds_load_id(1) r80.x, r1024
        iadd r1025, r28, l31
        lds_load_id(1) r81.x, r1025
        mad r12, r80, r81, r12           ; step 13: offsets +52 / +832
        iadd r1026, r30, l32
        lds_load_id(1) r82.x, r1026
        iadd r1027, r28, l33
        lds_load_id(1) r83.x, r1027
        mad r12, r82, r83, r12           ; step 14: offsets +56 / +896
        iadd r1028, r30, l34
        lds_load_id(1) r84.x, r1028
        iadd r1029, r28, l35
        lds_load_id(1) r85.x, r1029
        mad r12, r84, r85, r12           ; step 15: offsets +60 / +960
        fence_threads_lds                ; fence again before the next pass stores to LDS

        ; Advance the addresses and leave the loop once r40 < r26.
        iadd r46, r37, r46               ; r46 += cb1[4] * 64
        iadd r26, r26, l0                ; r26 += 16
        iadd r50, r50, l8                ; r50 += 64
        ige r86, r40, r26                ; r86 = (r40 >= r26)
        if_logicalz r86
            break
        endif
    endloop
else
    ; r21 is read after endif, so it is also computed on the path that skips the
    ; loop (same expression as inside the if-branch).
    imul r21, r9, r10
endif

; --- Write the accumulated result --------------------------------------------
mov r87, cb1[0]                          ; NOTE(review): presumably the output base offset - confirm
imul r88, r10, r2
iadd r89, r0, r88                        ; r89 = group.x + cb1[4] * group.y
imul r90, r89, l0                        ; r90 = r89 * 16
iadd r91, r21, r90
iadd r92, r8, r91                        ; r92 = tid.x + tid.y * cb1[4] + r90
mov r93, r92
umul r94, r93, l7                        ; r94 = r92 * 4
iadd r95, r87, r94                       ; r95 = cb1[0] + r94
uav_raw_store_id(0) mem.x, r95, r12      ; store the accumulator at address r95
end