I've created small IL program to test how "uav_cached" works. On linux ( ubuntu maverick 64 bit ) with driver 11.3 and 11.4 it returns almost random* values.
Example has uav0 as input. Changing to uav11 doesn't improve anything. With driver 11.3 uav11 works without _cached flag, with 11.4 it gives always random* values.
* Driver 11.3 returns (int)1, Driver 11.4 returns a lot of (int)1 and some random values
CAL++ kernel void kernel_A1() { uav_raw<uint4> in(0,UAV_CACHED); uav_raw<uint4> out(1); uint1 gid = get_global_id(0); uint4 tmp; tmp = in[16*gid]; out[16*gid] = tmp+1; } generated IL il_cs dcl_num_thread_per_group 128 dcl_raw_uav_id(0) dcl_raw_uav_id(1) dcl_literal l0, 0x0, 0x0, 0x0, 0x0 dcl_literal l3, 0x1, 0x1, 0x1, 0x1 dcl_literal l2, 0x10, 0x0, 0x0, 0x0 dcl_literal l1, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff mov r1.x,vAbsTid.x umul r4.x,l2.x,r1.x uav_raw_load_id(0)_cached r7.xyzw,r4.x mov r8.xyzw,r7.xyzw mov r2,r8 iadd r11,r2,l3 umul r15.x,l2.x,r1.x uav_raw_store_id(1) mem.xyzw,r15.x,r11 end
It looks that that for driver 11.9 ( linux ) cached uav reads still don't work. The kernel always read 0 from uav.
Can you pls tell me when this issue is going to be solved ? Or if you claim that cached uavs are working please post working IL kernel.
CL: kernel void caching(global int4 *in, global int4 *out) { size_t gid = get_global_id(0); int4 tmp = in[16 * gid]; out[16 * gid] = tmp; } IL: il_cs_2_0 dcl_cb cb0[15] ; Constant buffer that holds ABI data dcl_literal l0, 0x00000004, 0x00000001, 0x00000002, 0x00000003 dcl_literal l1, 0x00FFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0xFFFFFFFD dcl_literal l2, 0x0000FFFF, 0xFFFFFFFE, 0x000000FF, 0xFFFFFFFC dcl_literal l3, 0x00000018, 0x00000010, 0x00000008, 0xFFFFFFFF dcl_literal l4, 0xFFFFFF00, 0xFFFF0000, 0xFF00FFFF, 0xFFFF00FF dcl_literal l5, 0x00000000, 0x00000004, 0x00000008, 0x0000000C dcl_literal l6, 0x00000020, 0x00000020, 0x00000020, 0x00000020 dcl_literal l7, 0x00000018, 0x0000001F, 0x00000010, 0x0000001F dcl_literal l8, 0x80000000, 0x80000000, 0x80000000, 0x80000000 call 1024;$ endmain func 1024 ; __OpenCL_caching_kernel mov r1013, cb0[8].x mov r1019, l1.0000 dcl_max_thread_per_group 256 dcl_raw_uav_id(11) dcl_arena_uav_id(8) mov r0.__z_, vThreadGrpIdFlat0.x mov r1022.xyz0, vTidInGrp0.xyz mov r1023.xyz0, vThreadGrpId0.xyz imad r1021.xyz0, r1023.xyzz, cb0[1].xyzz, r1022.xyzz iadd r1021.xyz0, r1021.xyz0, cb0[6].xyz0 iadd r1023.xyz0, r1023.xyz0, cb0[7].xyz0 mov r1023.___w, r0.z ishl r1023.___w, r1023.w, l0.z mov r1018.x___, l0.0000 udiv r1024.xyz_, r1021.xyzz, cb0[10].xyzz imad r1025.xyz0, r1023.xyzz, cb0[10].xyzz, r1022.xyzz dcl_literal l11, 0x00000008, 0x00000008, 0x00000008, 0x00000008; f32:i32 8 dcl_cb cb1[2] ; Kernel arg setup: in mov r1.x, cb1[0].x ; Kernel arg setup: out mov r1.y, cb1[1].x call 1027 ; caching ret endfunc ; __OpenCL_caching_kernel func 1027 ; caching ; @__OpenCL_caching_kernel ; BB#0: ; %entry mov r66, r1021.xyz0 mov r65.__z_, r66.00x0 mov r65.___w, l11 ishl r65.__z_, r65.z, r65.w iadd r65.x___, r1.y, r65.z iadd r65._y__, r1.x, r65.z mov r1010.x___, r65.y uav_raw_load_id(11)_cached r1011, r1010 mov r66, r1011 mov r1011, r66 mov r1010.x___, r65.x uav_raw_store_id(11) mem0, r1010.x, r1011 ret endfunc ; caching end ISA: ; -------- Disassembly -------------------- 00 ALU: ADDR(32) CNT(10) KCACHE0(CB0:0-15) KCACHE1(CB1:0-15) 0 t: MULLO_INT ____, R1.x, KC0[1].x 1 z: ADD_INT ____, R0.x, PS0 2 y: ADD_INT ____, PV1.z, KC0[6].x 3 w: LSHL ____, PV2.y, 8 4 y: ADD_INT ____, KC1[0].x, PV3.w z: ADD_INT ____, KC1[1].x, PV3.w 5 x: LSHR R0.x, PV4.y, 4 t: LSHR R1.x, PV4.z, 2 01 TEX: ADDR(48) CNT(1) 6 VFETCH R0, R0.x, fc175 FORMAT(32_32_32_32_FLOAT) MEGA(16) FETCH_TYPE(NO_INDEX_OFFSET) 02 MEM_RAT_CACHELESS_STORE_RAW: RAT(11)[R1], R0, ARRAY_SIZE(4) MARK VPM END_OF_PROGRAM