Archives Discussions

realhet · ‎04-16-2013

Hi,

I'm facing with the following problem: I have to use LDS for a relatively long time and also I need to gather/scatter data across all the LDS memory.

Scheduling more than one kernel is not an option because I'll have to do 1024 [paralell LDS jobs] interleaved with 1024 [LDS gather operations]. In the final thing I gonna need 192K [paralell LDS jobs] per second, so that really isn't the clEnqueue's area.

WorkGroupSize=64, Total WorkItems=4x gpu streams, all WorkItems fit in LDS: guaranteed.

I tried this way:

if(lid==0){

int dstCnt=LoopIdx*cb->GroupCnt; //value to wait for after all workgroups are done with the atomic_incs

atomic_inc(&(out->globalCntr)); //inc for this workgroup

while(out->globalCntr!=dstCnt){} //wait

}

But I'm totally not trusting this (because I don't know if caching can interfere this), and it's kinda slow.

Is there a way to use GDS for this?

Also as a side question: The gather operation will sum up float values. Is it a good idea to convert the floats to integers and sum then with atomic_adds? Or is there a way to atomically sum floats?

Thanks in advance!

drallan · ‎05-23-2013

And suddenly: I've found the ds_gws_barrier instructions. Unfortunately I haven't found any documentation about it. If anyone knows it please tell me how it works.I gonna check it soon. What if it can make global synch across ALL the waves present in CUes o.o That gonna be jackpot

Hi realhet,

Yes it does synchronize all waves across all CUs.

I tried many global synch schemes and finally went to (global wave sync) gws_barriers.

Below is a simple example from my c compiler (gcnc). (opencl doesn't support global synchronization.)

The method can be seen in the C code.

The ISA instruction syntax is shown in the assembly code output just below

The example was cut from a 2D wave function where output A --> B and output B --> A.

global synch is a must unless you re-issue the kernel each time, greatly reducing performance.

Basically

1. Barriers must be initialized each time they are used.

2. Be careful to initialize the first barrier with the first wave to arrive, global id=0 may not work

3. I always use alternating 2 or 3 barriers, initialize barrier n+1 just before hitting barrier n

4. Barriers are initialized with the total number of waves running.

5. This is what I do, no guarantee it's the best way.

Note the assembler allows gs_xxxx insts, just change to ds_xxxx.

#include "../gcnc.h 
#define BAR0 0                      //define  barrier IDs
#define BAR1 1
kernel void lccwave(
     __global float *restrict wav0,
     __global float *restrict vel0,
     __global unsigned int *restrict GLB,
    const int ops
    )
{
    register int gx,gy,gid,i,j,ret;
    gx=get_global_id(0);
    gy=get_global_id(1);
    gid=256*gy+gx;
    ret=atomic_inc(&GLB[0],999999);        //must find first wave in!!
    if(ret==0)gws_init(255,BAR0);              //first wave initializes barrier 0
    // [some code]
    for(j=0;j<32;j++){                     //---------------main loop
        // [block of code]
        if(gid==0)gws_init(255,BAR1);      //wave 0 initialzes barrier 1
        gws_barrier(BAR0);                 // hit barrier 0
       // [block of code]
        if(gid==0)gws_init(255,BAR0);      //wave 0 initialzes barrier 0
        gws_barrier(BAR1);                 // hit barrier 1
    }                                      //-------------end main loop
    gws_barrier(BAR0);                     //exit hit barrier 0 required
}

//-----------------------------------MARKED UP ASSEMBLY OUTPUT-----------------------

.user elms= 3

.user_dimsi= 2

.user_sgpr = 12

.user_dims = 2

.uax.uav12

.uax.uav10

.uax.uav13

.uax.uav11

.uax.uav9

.ue(0) PTR_UAV_TABLE slot= 0 s[2:3]

.ue(1) IMM_CONST_BUFFER slot= 0 s[4:7]

.ue(2) IMM_CONST_BUFFER slot= 1 s[8:11]

.file 2,"cl\lccwave_2buf.cl"

.nvgpr: 21

.nsgpr: 32

x_set_nvgpr 127 //assembler directives

x_set_nsgpr 104 //assembler directives

x_set_ldsmax 0x400 //assembler directives

s_mov_b32 m0,0xffff

s_movk_i32 s103, 21

s_movk_i32 s102, 32

s_buffer_load_dwordx2 s[0:1], s[4:7], 0x04

s_waitcnt lgkmcnt(0)

s_mul_i32 s0, s12, s0

v_add_i32 v7, vcc, s0, v0

v_mov_b32 v8, v0

s_mul_i32 s1, s13, s1

v_add_i32 v6, vcc, s1, v1

v_mov_b32 v9, v1

.uavp to s[10:11]

s_mov_b64 s[10:11], s[2:3]

s_load_dwordx4 s[16:19], s[10:11], 0x60

s_load_dwordx4 s[20:23], s[10:11], 0x50

s_buffer_load_dword s24, s[8:11], 0x00

s_buffer_load_dword s25, s[8:11], 0x04

s_buffer_load_dword s26, s[8:11], 0x08

s_buffer_load_dword s27, s[8:11], 0x0c

s_buffer_load_dword s28, s[8:11], 0x10

s_buffer_load_dword s29, s[8:11], 0x14

s_buffer_load_dword s30, s[8:11], 0x18

s_buffer_load_dword s31, s[8:11], 0x1c

s_waitcnt lgkmcnt(0)

v_mov_b32 v12, v7

v_mov_b32 v13, v6

v_lshlrev_b32 v18, 8, v13

v_add_i32 v14, vcc, v18, v12

s_load_dwordx4 s[20:23], s[10:11], 0x48

s_waitcnt lgkmcnt(0)

v_mov_b32 v0, 0xf423f

v_mov_b32 v1, s28

buffer_atomic_inc v0, v1, s[20:23], 0 offen glc // initial sync code

s_waitcnt vmcnt(0) // initial sync code

v_mov_b32 v17, v0 // initial sync code

s_mov_b64 s[32:33], exec // initial sync code

v_cmpx_eq_i32 vcc, 0, v17 // initial sync code

s_cbranch_execz label_2 // initial sync code

v_mov_b32 v1, 255 // barrier code before loop

gs_gws_init v1 offset0:0 // barrier code

s_waitcnt lgkmcnt(0) // barrier code

label_2:

s_mov_b64 exec, s[32:33]

s_mov_b64 s[32:33], exec

v_mov_b32 v16, 0

label_4:

s_mov_b64 s[34:35], exec

v_cmpx_eq_i32 vcc, 0, v14

s_cbranch_execz label_10

v_mov_b32 v1, 255 // barrier code mainloop

gs_gws_init v1 offset0:1 // barrier code

s_waitcnt lgkmcnt(0) // barrier code

label_10:

s_mov_b64 exec, s[34:35]

gs_gws_barrier offset0:0

s_waitcnt lgkmcnt(0)

s_mov_b64 s[34:35], exec

v_cmpx_eq_i32 vcc, 0, v14

s_cbranch_execz label_12

v_mov_b32 v1, 255 // barrier code main loop

gs_gws_init v1 offset0:0 // barrier code

s_waitcnt lgkmcnt(0) // barrier code

label_12:

s_mov_b64 exec, s[34:35]

gs_gws_barrier offset0:1

s_waitcnt lgkmcnt(0)

label_5:

v_add_i32 v16, vcc, 1, v16

v_cmpx_gt_i32 vcc, 32, v16

s_cbranch_execnz label_4

s_mov_b64 exec, s[32:33]

gs_gws_barrier offset0:0 // barrier code end

s_waitcnt lgkmcnt(0) // barrier code

label_1:

s_endpgm

.end lccwave

endbye..

View solution in original post

Archives Discussions

Global synchronization inside the kernel