Hi,
I'm facing with the following problem: I have to use LDS for a relatively long time and also I need to gather/scatter data across all the LDS memory.
Scheduling more than one kernel is not an option because I'll have to do 1024 [paralell LDS jobs] interleaved with 1024 [LDS gather operations]. In the final thing I gonna need 192K [paralell LDS jobs] per second, so that really isn't the clEnqueue's area.
WorkGroupSize=64, Total WorkItems=4x gpu streams, all WorkItems fit in LDS: guaranteed.
I tried this way:
if(lid==0){
int dstCnt=LoopIdx*cb->GroupCnt; //value to wait for after all workgroups are done with the atomic_incs
atomic_inc(&(out->globalCntr)); //inc for this workgroup
while(out->globalCntr!=dstCnt){} //wait
}
But I'm totally not trusting this (because I don't know if caching can interfere this), and it's kinda slow.
Is there a way to use GDS for this?
Also as a side question: The gather operation will sum up float values. Is it a good idea to convert the floats to integers and sum then with atomic_adds? Or is there a way to atomically sum floats?
Thanks in advance!
Solved! Go to Solution.
And suddenly: I've found the ds_gws_barrier instructions. Unfortunately I haven't found any documentation about it. If anyone knows it please tell me how it works.I gonna check it soon. What if it can make global synch across ALL the waves present in CUes o.o That gonna be jackpot
Hi realhet,
Yes it does synchronize all waves across all CUs.
I tried many global synch schemes and finally went to (global wave sync) gws_barriers.
Below is a simple example from my c compiler (gcnc). (opencl doesn't support global synchronization.)
The method can be seen in the C code.
The ISA instruction syntax is shown in the assembly code output just below
The example was cut from a 2D wave function where output A --> B and output B --> A.
global synch is a must unless you re-issue the kernel each time, greatly reducing performance.
Basically
1. Barriers must be initialized each time they are used.
2. Be careful to initialize the first barrier with the first wave to arrive, global id=0 may not work
3. I always use alternating 2 or 3 barriers, initialize barrier n+1 just before hitting barrier n
4. Barriers are initialized with the total number of waves running.
5. This is what I do, no guarantee it's the best way.
Note the assembler allows gs_xxxx insts, just change to ds_xxxx.
#include "../gcnc.h
#define BAR0 0 //define barrier IDs
#define BAR1 1
kernel void lccwave(
__global float *restrict wav0,
__global float *restrict vel0,
__global unsigned int *restrict GLB,
const int ops
)
{
register int gx,gy,gid,i,j,ret;
gx=get_global_id(0);
gy=get_global_id(1);
gid=256*gy+gx;
ret=atomic_inc(&GLB[0],999999); //must find first wave in!!
if(ret==0)gws_init(255,BAR0); //first wave initializes barrier 0
// [some code]
for(j=0;j<32;j++){ //---------------main loop
// [block of code]
if(gid==0)gws_init(255,BAR1); //wave 0 initialzes barrier 1
gws_barrier(BAR0); // hit barrier 0
// [block of code]
if(gid==0)gws_init(255,BAR0); //wave 0 initialzes barrier 0
gws_barrier(BAR1); // hit barrier 1
} //-------------end main loop
gws_barrier(BAR0); //exit hit barrier 0 required
}
//-----------------------------------MARKED UP ASSEMBLY OUTPUT-----------------------
.user elms= 3
.user_dimsi= 2
.user_sgpr = 12
.user_dims = 2
.uax.uav12
.uax.uav10
.uax.uav13
.uax.uav11
.uax.uav9
.ue(0) PTR_UAV_TABLE slot= 0 s[2:3]
.ue(1) IMM_CONST_BUFFER slot= 0 s[4:7]
.ue(2) IMM_CONST_BUFFER slot= 1 s[8:11]
.file 2,"cl\lccwave_2buf.cl"
.nvgpr: 21
.nsgpr: 32
x_set_nvgpr 127 //assembler directives
x_set_nsgpr 104 //assembler directives
x_set_ldsmax 0x400 //assembler directives
s_mov_b32 m0,0xffff
s_movk_i32 s103, 21
s_movk_i32 s102, 32
s_buffer_load_dwordx2 s[0:1], s[4:7], 0x04
s_waitcnt lgkmcnt(0)
s_mul_i32 s0, s12, s0
v_add_i32 v7, vcc, s0, v0
v_mov_b32 v8, v0
s_mul_i32 s1, s13, s1
v_add_i32 v6, vcc, s1, v1
v_mov_b32 v9, v1
.uavp to s[10:11]
s_mov_b64 s[10:11], s[2:3]
s_load_dwordx4 s[16:19], s[10:11], 0x60
s_load_dwordx4 s[20:23], s[10:11], 0x50
s_buffer_load_dword s24, s[8:11], 0x00
s_buffer_load_dword s25, s[8:11], 0x04
s_buffer_load_dword s26, s[8:11], 0x08
s_buffer_load_dword s27, s[8:11], 0x0c
s_buffer_load_dword s28, s[8:11], 0x10
s_buffer_load_dword s29, s[8:11], 0x14
s_buffer_load_dword s30, s[8:11], 0x18
s_buffer_load_dword s31, s[8:11], 0x1c
s_waitcnt lgkmcnt(0)
v_mov_b32 v12, v7
v_mov_b32 v13, v6
v_lshlrev_b32 v18, 8, v13
v_add_i32 v14, vcc, v18, v12
s_load_dwordx4 s[20:23], s[10:11], 0x48
s_waitcnt lgkmcnt(0)
v_mov_b32 v0, 0xf423f
v_mov_b32 v1, s28
buffer_atomic_inc v0, v1, s[20:23], 0 offen glc // initial sync code
s_waitcnt vmcnt(0) // initial sync code
v_mov_b32 v17, v0 // initial sync code
s_mov_b64 s[32:33], exec // initial sync code
v_cmpx_eq_i32 vcc, 0, v17 // initial sync code
s_cbranch_execz label_2 // initial sync code
v_mov_b32 v1, 255 // barrier code before loop
gs_gws_init v1 offset0:0 // barrier code
s_waitcnt lgkmcnt(0) // barrier code
label_2:
s_mov_b64 exec, s[32:33]
s_mov_b64 s[32:33], exec
v_mov_b32 v16, 0
label_4:
s_mov_b64 s[34:35], exec
v_cmpx_eq_i32 vcc, 0, v14
s_cbranch_execz label_10
v_mov_b32 v1, 255 // barrier code mainloop
gs_gws_init v1 offset0:1 // barrier code
s_waitcnt lgkmcnt(0) // barrier code
label_10:
s_mov_b64 exec, s[34:35]
gs_gws_barrier offset0:0
s_waitcnt lgkmcnt(0)
s_mov_b64 s[34:35], exec
v_cmpx_eq_i32 vcc, 0, v14
s_cbranch_execz label_12
v_mov_b32 v1, 255 // barrier code main loop
gs_gws_init v1 offset0:0 // barrier code
s_waitcnt lgkmcnt(0) // barrier code
label_12:
s_mov_b64 exec, s[34:35]
gs_gws_barrier offset0:1
s_waitcnt lgkmcnt(0)
label_5:
v_add_i32 v16, vcc, 1, v16
v_cmpx_gt_i32 vcc, 32, v16
s_cbranch_execnz label_4
s_mov_b64 exec, s[32:33]
gs_gws_barrier offset0:0 // barrier code end
s_waitcnt lgkmcnt(0) // barrier code
label_1:
s_endpgm
.end lccwave
endbye..