For example, I want to calculate CRC32 using a 16×2 precomputed table (stored in an indexed_temp_array). It could be done with a 256×1 precomputed table, but I want the table to fit in registers. Why does the compiler generate instruction groups 3-6, 15-23 and 27-29 of the ISA listing below? Why does it emit

MOV R0.x, (0x076DC419, 1.788752624e-34f).x

and

15 x: MOV R16.x, R0.x

but not

MOV R16.x, (0x076DC419, 1.788752624e-34f).x

IL: il_cs_2_0 dcl_num_thread_per_group 64 dcl_indexed_temp_array x0[16] dcl_literal l0,0x00000000,0x00000000,0,0 dcl_literal l1,0x77073096,0x1db71064,0,0 dcl_literal l2,0xee0e612c,0x3b6e20c8,0,0 dcl_literal l3,0x990951ba,0x26d930ac,0,0 dcl_literal l4,0x076dc419,0x76dc4190,0,0 dcl_literal l5,0x706af48f,0x6b6b51f4,0,0 dcl_literal l6,0xe963a535,0x4db26158,0,0 dcl_literal l7,0x9e6495a3,0x5005713c,0,0 dcl_literal l8,0x0edb8832,0xedb88320,0,0 dcl_literal l9,0x79dcb8a4,0xf00f9344,0,0 dcl_literal l10,0xe0d5e91e,0xd6d6a3e8,0,0 dcl_literal l11,0x97d2d988,0xcb61b38c,0,0 dcl_literal l12,0x09b64c2b,0x9b64c2b0,0,0 dcl_literal l13,0x7eb17cbd,0x86d3d2d4,0,0 dcl_literal l14,0xe7b82d07,0xa00ae278,0,0 dcl_literal l15,0x90bf1d91,0xbdbdf21c,0,0 dcl_literal l20,8,4,15,0xffffffff mov x0[0].xy,l0.xy mov x0[1].xy,l1.xy mov x0[2].xy,l2.xy mov x0[3].xy,l3.xy mov x0[4].xy,l4.xy mov x0[5].xy,l5.xy mov x0[6].xy,l6.xy mov x0[7].xy,l7.xy mov x0[8].xy,l8.xy mov x0[9].xy,l9.xy mov x0[10].xy,l10.xy mov x0[11].xy,l11.xy mov x0[12].xy,l12.xy mov x0[13].xy,l13.xy mov x0[14].xy,l14.xy mov x0[15].xy,l15.xy mov r1.x,l20.w ixor r5.x,r1.x,vAbsTidFlat.x ushr r1.x,r1.x,l20.x ubit_extract r4.x,r5.x,l20.y,l20.y iand r3.x,r5.x,l20.z mov r6.x,x0[r4.x].y mov r7.x,x0[r3.x].x ixor r1.x,r1.x,r6.x ixor r1.x,r1.x,r7.x mov g[vAbsTidFlat.x].x,r1.x end ============================================= ISA: ; -------- Disassembly -------------------- 00 ALU: ADDR(32) CNT(26) 0 x: MOV R5.x, 0.0f y: MOV R2.y, 0.0f z: LSHL ____, R0.z, (0x00000006, 8.407790786e-45f).x w: LSHL ____, R0.y, (0x00000006, 8.407790786e-45f).x t: MOV R2.x, 0.0f 1 x: MOV R3.x, (0x77073096, 2.741974671e33f).x y: MOV R3.y, (0x1DB71064, 4.845664418e-21f).y z: ADD_INT ____, PV0.z, PV0.w t: MOV R4.x, (0xEE0E612C, -1.101608625e28f).z 2 x: MOV R0.x, (0x990951BA, -7.099238543e-24f).x y: MOV R4.y, (0x3B6E20C8, 0.003633545712f).y w: ADD_INT R0.w, R0.x, PV1.z t: MOV R0.y, (0x26D930AC, 1.507059231e-15f).z 3 x: MOV R12.x, R2.x y: MOV R12.y, R2.y 4 x: MOV R13.x, R3.x 
y: MOV R13.y, R3.y 5 x: MOV R14.x, R4.x y: MOV R14.y, R4.y 6 x: MOV R15.x, R0.x y: MOV R15.y, R0.y 01 TEX: ADDR(144) CNT(1) 7 VFETCH R5.xy__, R5.x, fc147 MEGA(8) FETCH_TYPE(NO_INDEX_OFFSET) 02 ALU: ADDR(58) CNT(82) 8 x: MOV R0.x, (0x076DC419, 1.788752624e-34f).x y: MOV R0.y, (0x76DC4190, 2.233662255e33f).y t: MULLO_UINT ____, R1.z, R5.x 9 x: MOV R4.x, (0x706AF48F, 2.908605820e29f).x y: MOV R4.y, (0x6B6B51F4, 2.844845803e26f).y t: MULLO_UINT T0.z, PS8, R5.y 10 x: MOV R5.x, (0xE963A535, -1.720039557e25f).x y: MOV R5.y, (0x4DB26158, 374090496.0f).y t: MULLO_UINT ____, R1.y, R5.x 11 x: MOV R3.x, (0x9E6495A3, -1.210116365e-20f).x y: MOV R3.y, (0x5005713C, 8955162624f).y z: ADD_INT ____, T0.z, PS10 t: MOV R2.x, (0x0EDB8832, 5.411881952e-30f).z 12 x: MOV R1.x, (0x79DCB8A4, 1.432562832e35f).x y: MOV R2.y, (0xEDB88320, -7.137970276e27f).y w: ADD_INT ____, R1.x, PV11.z t: MOV R1.y, (0xF00F9344, -1.777375585e29f).z 13 x: MOV R6.x, (0xE0D5E91E, -1.233110735e20f).x y: MOV R6.y, (0xD6D6A3E8, -1.179997302e14f).y z: LSHL ____, PV12.w, (0x00000006, 8.407790786e-45f).z t: MOV R7.x, (0x97D2D988, -1.362584453e-24f).w 14 x: MOV R8.x, (0x09B64C2B, 4.388653425e-33f).x y: MOV R7.y, (0xCB61B38C, -14791564.0f).y w: ADD_INT R0.w, R0.w, PV13.z t: MOV R8.y, (0x9B64C2B0, -1.892262489e-22f).z 15 x: MOV R16.x, R0.x y: MOV R16.y, R0.y 16 x: MOV R17.x, R4.x y: MOV R17.y, R4.y 17 x: MOV R18.x, R5.x y: MOV R18.y, R5.y 18 x: MOV R19.x, R3.x y: MOV R19.y, R3.y 19 x: MOV R20.x, R2.x y: MOV R20.y, R2.y 20 x: MOV R21.x, R1.x y: MOV R21.y, R1.y 21 x: MOV R22.x, R6.x y: MOV R22.y, R6.y 22 x: MOV R23.x, R7.x y: MOV R23.y, R7.y 23 x: MOV R24.x, R8.x y: MOV R24.y, R8.y 24 x: MOV R8.x, (0x7EB17CBD, 1.179605167e38f).x y: MOV R8.y, (0x86D3D2D4, -7.967916265e-35f).y z: XOR_INT T0.z, -1, R0.w t: MOV R7.x, (0xE7B82D07, -1.739492081e24f).z 25 x: MOV R6.x, (0x90BF1D91, -7.538177071e-29f).x y: MOV R7.y, (0xA00AE278, -1.176398690e-19f).y t: MOV R6.y, (0xBDBDF21C, -0.09274694324f).z 26 x: BFE_UINT R1.x, (0x00000004, 
5.605193857e-45f).x, (0x00000004, 5.605193857e-45f).x, T0.z t: AND_INT R2.x, T0.z, (0x0000000F, 2.101947696e-44f).y 27 x: MOV R25.x, R8.x y: MOV R25.y, R8.y 28 x: MOV R26.x, R7.x y: MOV R26.y, R7.y 29 x: MOV R27.x, R6.x y: MOV R27.y, R6.y 30 x: MOVA_INT ____, R1.x 31 y: MOV R6.y, R12[A0.x].y 32 x: MOVA_INT ____, R2.x 33 x: MOV R2.x, R12[A0.x].x 34 x: LSHL R1.x, R0.w, (0x00000002, 2.802596929e-45f).x y: XOR_INT ____, R6.y, (0x00FFFFFF, 2.350988562e-38f).y 35 x: XOR_INT R2.x, R2.x, PV34.y 03 MEM_EXPORT_WRITE_IND: DWORD_PTR[0+R1.x].x___, R2, ELEM_SIZE(3) END_OF_PROGRAM

Your 16 ALU ops are essentially free compared to the cost of a single access to a temp array (which takes hundreds to thousands of cycles). The shader compiler is optimizing your program for you because it is written inefficiently. A better way to do this is to use a constant buffer and set up the literals on the host side.