1 Reply Latest reply on Dec 14, 2011 9:52 AM by MicahVillmow

    Incorrect address registers generated for GDS_WRITE when compiling from IL code



      I have been using GDS memory at the IL code level in the OpenCL environment. Since OpenCL does not yet support GDS, I'm not sure how to classify this, but it seems to be purely an IL compile issue.

       The GDS works correctly most of the time using dcl_gds_id(n), gds_store() and gds_load(), but fails randomly, often writing to wrong locations. When the failure occurs, the binary output from APP Kernel Analyzer (1.9) shows that the compiler is not using the correct register for the address part of the GDS_WRITE instruction, even in simple cases where both address and data are constants. In the example below, the address constant is moved into R1.x, but GDS_WRITE then reads its address from R0.x. It usually looks something like this, although other forms occur:

      7  x: MOV         R1.x,  (0x00000020, 4.484155086e-44f).x            

          y: MOV         R0.y,  (0x55500000, 1.429365116e13f).y     

      8  GDS_WRITE R0.xy                                          

       This may occur when the compiler chooses a register that does not have both x and y components free. I am using Win7-64, Cayman and Barts cards, AMD APP SDK 2.5 and, most recently, Catalyst 11.11c.  I just installed 11.12 (thanks!) and find the same issue, but the exact code sequences that cause it may have changed a little.

       I'm not sure where this fits in the scheme of things, but it would be nice to have it fixed. I have included code snippets for both the IL and the binary code, and marked the appropriate locations.

       Many thanks, and keep up the good work.


      ; -------- Disassembly -------------------- 00 ALU_PUSH_BEFORE: ADDR(32) CNT(18) KCACHE0(CB0:0-15) 0 x: MULLO_INT R2.x, R1.x, KC0[1].x y: MULLO_INT ____, R1.x, KC0[1].x z: MULLO_INT ____, R1.x, KC0[1].x w: MULLO_INT ____, R1.x, KC0[1].x 1 x: MULLO_INT ____, R1.y, KC0[1].y y: MULLO_INT ____, R1.y, KC0[1].y z: MULLO_INT ____, R1.y, KC0[1].y w: MULLO_INT ____, R1.y, KC0[1].y 2 y: ADD_INT ____, R0.y, PV1.w z: ADD_INT ____, R0.x, R2.x 3 x: ADD_INT R0.x, PV2.z, KC0[6].x y: ADD_INT R0.y, PV2.y, KC0[6].y z: MOV R0.z, 0.0f 4 x: LSHL ____, PV3.y, 8 5 w: ADD_INT R0.w, R0.x, PV4.x 6 x: PREDE_INT ____, R0.w, 2 UPDATE_EXEC_MASK UPDATE_PRED 01 JUMP POP_CNT(1) ADDR(5) 02 ALU: ADDR(50) CNT(3) 7 x: MOV R1.x, (0x00000020, 4.484155086e-44f).x <<-----------here------------- y: MOV R0.y, (0x55500000, 1.429365116e13f).y <<-----------here------------- 03 GDS: ADDR(96) CNT(1) 8 GDS_WRITE R0.xy <<-----------here------------- 04 POP (1) ADDR(5) 05 ALU: ADDR(53) CNT(1) 9 x: GROUP_BARRIER ____ 06 ALU: ADDR(54) CNT(1) 10 x: GROUP_BARRIER ____ 07 GDS: ADDR(98) CNT(1) 11 GDS_READ_RET R4.x___, R0.z 08 ALU: ADDR(55) CNT(2) 12 y: MOV R0.y, (0x00000020, 4.484155086e-44f).x 09 GDS: ADDR(100) CNT(1) 13 GDS_READ_RET R5.x___, R0.y 10 ALU: ADDR(57) CNT(1) 14 x: GROUP_BARRIER ____ 11 ALU: ADDR(58) CNT(18) KCACHE0(CB1:0-15) 15 x: AND_INT ____, R0.x, 0x0000000F y: LSHL ____, R0.w, 5 16 x: LSHL ____, PV15.x, 2 z: ADD_INT ____, KC0[0].x, PV15.y 17 x: LSHR R1.x, PV16.z, 2 y: ADD_INT ____, PV16.z, 16 z: ADD_INT ____, PV16.x, 4096 18 x: LDS_READ_RET QA, PV17.z y: LSHR R0.y, PV17.y, 4 w: LSHR R0.w, PV17.x, 2 19 x: ADD_INT R3.x, R1.x, 4 20 w: MOV R1.w, QA[18].pop 12 TEX: ADDR(102) CNT(2) 21 VFETCH R2, R0.w, fc175 FORMAT(32_32_32_32_FLOAT) FETCH_TYPE(NO_INDEX_OFFSET) 22 VFETCH R0, R0.y, fc175 FORMAT(32_32_32_32_FLOAT) FETCH_TYPE(NO_INDEX_OFFSET) 13 ALU: ADDR(76) CNT(2) 23 y: MOV R0.y, R5.x z: MOV R2.z, R1.w 14 MEM_RAT_CACHELESS_STORE_DWORD__NI: RAT(11)[R1], R2, ARRAY_SIZE(4) MARK VPM 15 ALU: ADDR(78) CNT(5) 24 
z: MOV R0.z, 0.0f 25 x: MOV R0.x, R4.x y: MOV R0.y, R0.y z: MOV R0.z, PV24.z w: MOV R0.w, R0.w 16 MEM_RAT_CACHELESS_STORE_DWORD__NI: RAT(11)[R3], R0, ARRAY_SIZE(4) MARK VPM 17 END END_OF_PROGRAM ============================================================== il_cs_2_0 dcl_cb cb0[10] ; Constant buffer that holds ABI data dcl_literal l0, 4, 1, 2, 3 dcl_literal l1, 0x00FFFFFF, -1, -2, -3 dcl_literal l2, 0x0000FFFF, 0xFFFFFFFE,0x000000FF,0xFFFFFFFC dcl_literal l3, 24, 16, 8, 0xFFFFFFFF dcl_literal l4, 0xFFFFFF00, 0xFFFF0000, 0xFF00FFFF, 0xFFFF00FF dcl_literal l5, 0, 4, 8, 12 dcl_literal l6, 32, 32, 32, 32 dcl_literal l7, 24, 31, 16, 31 call 1024;$ endmain func 1024 ; __OpenCL_permute_kernel mov r1013, cb0[8].x mov r1019, l1.0 dcl_max_thread_per_group 256 dcl_gds_id(0) 16384 dcl_lds_id(1) 5648 dcl_raw_uav_id(11) dcl_arena_uav_id(8) mov r0.z, vThreadGrpIdFlat.x mov r1022.xyz0, vTidInGrp.xyz mov r1023.xyz0, vThreadGrpId.xyz imad r1021.xyz0, r1023.xyz, cb0[1].xyz, r1022.xyz iadd r1021.xyz0, r1021.xyz0, cb0[6].xyz0 iadd r1023.xyz0, r1023.xyz0, cb0[7].xyz0 mov r1023.w, r0.z ishl r1023.w, r1023.w, l0.z mov r1018.x, l0.0 udiv r1024.xyz, r1021.xyz, cb0[10].xyz imad r1025.xyz0, r1023.xyz, cb0[1].xyz, r1022.xyz dcl_literal l14, 0x00000000, 0x00000000, 0x00000000, 0x00000000; f32:i32 0 dcl_literal l11, 0x00000001, 0x00000001, 0x00000001, 0x00000001; f32:i32 1 dcl_literal l10, 0x00000002, 0x00000002, 0x00000002, 0x00000002; f32:i32 2 dcl_literal l17, 0x00000005, 0x00000005, 0x00000005, 0x00000005; f32:i32 5 dcl_literal l9, 0x00000008, 0x00000008, 0x00000008, 0x00000008; f32:i32 8 dcl_literal l15, 0x0000000f, 0x0000000f, 0x0000000f, 0x0000000f; f32:i32 15 dcl_literal l18, 0x00000010, 0x00000010, 0x00000010, 0x00000010; f32:i32 16 dcl_literal l12, 0x00000020, 0x00000020, 0x00000020, 0x00000020; f32:i32 32 dcl_literal l21, 0x00000200, 0x00000200, 0x00000200, 0x00000200; f32:i32 512 dcl_literal l19, 0x00000400, 0x00000400, 0x00000400, 0x00000400; f32:i32 1024 dcl_literal l16, 
0x00001000, 0x00001000, 0x00001000, 0x00001000; f32:i32 4096 dcl_literal l20, 0x00001400, 0x00001400, 0x00001400, 0x00001400; f32:i32 5120 dcl_literal l22, 0x00001600, 0x00001600, 0x00001600, 0x00001600; f32:i32 5632 dcl_literal l13, 0x55500000, 0x55500000, 0x55500000, 0x55500000; f32:i32 1431306240 dcl_cb cb1[10] mov r1, cb1[0] mov r2, cb1[1] mov r3, cb1[2] mov r4, cb1[3] mov r5, cb1[4] mov r6, cb1[5] mov r7, cb1[6] mov r8, cb1[7] mov r9, cb1[8] mov r10, cb1[9] call 1030 ; permute ret endfunc ; func 1030 ; permute mov r253, r1021.xyz0 mov r254, r1021.xyz0 mov r254, r254.y000 mov r255, l9.xxxx ishl r254.x___, r254.xxxx, r255.xxxx mov r253, r253.x000 iadd r254.x___, r254.xxxx, r253.xxxx mov r255, l10.xxxx ieq r256.x___, r254.xxxx, r255.xxxx mov r257, l11.xxxx if_logicalnz r256.xxxx mov r256, l12.xxxx mov r258, l13.xxxx fence_lds_memory_gds mov r1011.x___, r258.xxxx mov r1010.x___, r256.xxxx gds_store_id(0) r1010.x, r1011.x ;****LOOSES ADDRESS INFO WHEN COMPILED TO BINARY fence_lds_memory_gds else endif mov r258, l14.xxxx fence_threads_memory_lds_gds_gws fence_threads_memory_lds_gds_gws fence_lds_memory_gds mov r1010.x___, r258.xxxx gds_load_id(0) r1011.x, r1010.x mov r259.x___, r1011.xxxx fence_lds_memory_gds mov r260, l12.xxxx fence_lds_memory_gds mov r1010.x___, r260.xxxx gds_load_id(0) r1011.x, r1010.x mov r260.x___, r1011.xxxx fence_lds_memory_gds fence_threads_memory_lds_gds_gws mov r257, l15.xxxx iand r253.x___, r253.xxxx, r257.xxxx ishl r253.x___, r253.xxxx, r255.xxxx mov r255, l16.xxxx iadd r253.x___, r255.xxxx, r253.xxxx mov r1010.x___, r253.xxxx lds_load_id(1) r1011.x, r1010.x mov r253.x___, r1011.xxxx mov r255, l17.xxxx ishl r254.x___, r254.xxxx, r255.xxxx iadd r254.x___, r1.xxxx, r254.xxxx mov r1010.x___, r254.xxxx uav_raw_load_id(11)_cached r1011, r1010.xxxx mov r255, r1011 iadd r253, r255.xy0w, r253.00x0 mov r255, l18.xxxx iadd r255.x___, r254.xxxx, r255.xxxx mov r1010.x___, r255.xxxx uav_raw_load_id(11)_cached r1011, r1010.xxxx mov r257, r1011 mov 
r1011, r253 mov r1010.x___, r254.xxxx uav_raw_store_id(11) mem, r1010.xxxx, r1011 iadd r253, r257.0yzw, r259.x000 iadd r253, r253.x0zw, r260.0x00 iadd r253, r253.xy0w, r258.00x0 mov r1011, r253 mov r1010.x___, r255.xxxx uav_raw_store_id(11) mem, r1010.xxxx, r1011 ret endfunc ; permute end