cancel
Showing results for 
Search instead for 
Did you mean: 

OpenCL

ktator
Adept I

Trouble with GDS reading and writing on Ellesmere GPU

I am trying to use GDS on AMD RX 580.
Listings are available here and on pastebin:
LDS version Assembler kernel (works fine) https://pastebin.com/uakfSBBi
GDS version Assembler kernel (works incorrectly) https://pastebin.com/rQS00Hf7

I started with OpenCL kernel and LDS:

__kernel __attribute__((reqd_work_group_size(64, 1, 1)))
void localVarExample(__global int *res)
{
int i = get_global_id(0);
__local int x[64];
x[i] = 0;
barrier(CLK_LOCAL_MEM_FENCE);
atom_inc(&x[i]);
barrier(CLK_LOCAL_MEM_FENCE);
res[i] = x[i];
} ‍‍‍‍‍‍‍‍‍‍‍

After that I built it and disassembled with CLRX (and build again):

.amdcl2
.gpu Iceland
.64bit
.arch_minor 0
.arch_stepping 4
.driver_version 203603
.compile_options "-fno-bin-source -fno-bin-llvmir -fno-bin-amdil -fbin-exe -D__AMD__=1 -D__Ellesmere__=1 -D__Ellesmere=1 -D__IMAGE_SUPPORT__=1 -DFP_FAST_FMA=1 -cl-denorms-are-zero -m64 -Dcl_khr_fp64=1 -Dcl_amd_fp64=1 -Dcl_khr_global_int32_base_atomics=1 -Dcl_khr_global_int32_extended_atomics=1 -Dcl_khr_local_int32_base_atomics=1 -Dcl_khr_local_int32_extended_atomics=1 -Dcl_khr_int64_base_atomics=1 -Dcl_khr_int64_extended_atomics=1 -Dcl_khr_3d_image_writes=1 -Dcl_khr_byte_addressable_store=1 -Dcl_khr_fp16=1 -Dcl_khr_gl_sharing=1 -Dcl_khr_gl_depth_images=1 -Dcl_amd_device_attribute_query=1 -Dcl_amd_vec3=1 -Dcl_amd_printf=1 -Dcl_amd_media_ops=1 -Dcl_amd_media_ops2=1 -Dcl_amd_popcnt=1 -Dcl_khr_d3d10_sharing=1 -Dcl_khr_d3d11_sharing=1 -Dcl_khr_dx9_media_sharing=1 -Dcl_khr_image2d_from_buffer=1 -Dcl_khr_spir=1 -Dcl_khr_subgroups=1 -Dcl_khr_gl_event=1 -Dcl_khr_depth_images=1 -Dcl_khr_mipmap_image=1 -Dcl_khr_mipmap_image_writes=1 -Dcl_amd_liquid_flash=1 -Dcl_amd_planar_yuv=1"
.acl_version "AMD-COMP-LIB-v0.8 (0.0.SC_BUILD_NUMBER)"
.kernel localVarExample
.config
.dims x
.cws 64, 1, 1
.sgprsnum 13
.vgprsnum 4
.localsize 256
.floatmode 0xc0
.pgmrsrc1 0x00ac0040
.pgmrsrc2 0x0000008c
.dx10clamp
.ieeemode
.useargs
.priority 0
.arg _.global_offset_0, "size_t", long
.arg _.global_offset_1, "size_t", long
.arg _.global_offset_2, "size_t", long
.arg _.printf_buffer, "size_t", void*, global, , rdonly
.arg _.vqueue_pointer, "size_t", long
.arg _.aqlwrap_pointer, "size_t", long
.arg res, "int*", int*, global,
.text
s_mov_b32 m0, 0x10000
s_lshl_b32 s0, s6, 6
v_add_u32 v0, vcc, s0, v0
s_load_dwordx2 s[0:1], s[4:5], 0x0
s_waitcnt lgkmcnt(0)
v_add_u32 v1, vcc, s0, v0
v_lshlrev_b32 v1, 2, v1
v_mov_b32 v2, 0
v_mov_b32 v3, 1
s_load_dwordx2 s[2:3], s[4:5], 0x30
ds_write_b32 v1, v2
s_waitcnt lgkmcnt(0)
ds_add_u32 v1, v3
s_waitcnt lgkmcnt(0)
ds_read_b32 v2, v1
v_add_u32 v1, s[4:5], v0, s0
v_mov_b32 v0, 0
v_ashrrev_i64 v[0:1], 30, v[0:1]
v_add_u32 v0, vcc, s2, v0
v_mov_b32 v3, s3
v_addc_u32 v1, vcc, v3, v1, vcc
s_waitcnt lgkmcnt(0)
flat_store_dword v[0:1], v2
s_endpgm‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍

I ran these kernels with with global size 64. Both kernels work fine and fill buffer res with ones.
After that I tried to change LDS to GDS. I set up m0 register with value oxFFFF and set gds flag for
ds instructions. The result kernel is following:

.amdcl2
.gpu Iceland
.64bit
.arch_minor 0
.arch_stepping 4
.driver_version 203603
.compile_options "-fno-bin-source -fno-bin-llvmir -fno-bin-amdil -fbin-exe -D__AMD__=1 -D__Ellesmere__=1 -D__Ellesmere=1 -D__IMAGE_SUPPORT__=1 -DFP_FAST_FMA=1 -cl-denorms-are-zero -m64 -Dcl_khr_fp64=1 -Dcl_amd_fp64=1 -Dcl_khr_global_int32_base_atomics=1 -Dcl_khr_global_int32_extended_atomics=1 -Dcl_khr_local_int32_base_atomics=1 -Dcl_khr_local_int32_extended_atomics=1 -Dcl_khr_int64_base_atomics=1 -Dcl_khr_int64_extended_atomics=1 -Dcl_khr_3d_image_writes=1 -Dcl_khr_byte_addressable_store=1 -Dcl_khr_fp16=1 -Dcl_khr_gl_sharing=1 -Dcl_khr_gl_depth_images=1 -Dcl_amd_device_attribute_query=1 -Dcl_amd_vec3=1 -Dcl_amd_printf=1 -Dcl_amd_media_ops=1 -Dcl_amd_media_ops2=1 -Dcl_amd_popcnt=1 -Dcl_khr_d3d10_sharing=1 -Dcl_khr_d3d11_sharing=1 -Dcl_khr_dx9_media_sharing=1 -Dcl_khr_image2d_from_buffer=1 -Dcl_khr_spir=1 -Dcl_khr_subgroups=1 -Dcl_khr_gl_event=1 -Dcl_khr_depth_images=1 -Dcl_khr_mipmap_image=1 -Dcl_khr_mipmap_image_writes=1 -Dcl_amd_liquid_flash=1 -Dcl_amd_planar_yuv=1"
.acl_version "AMD-COMP-LIB-v0.8 (0.0.SC_BUILD_NUMBER)"
.kernel localVarExample
.config
.dims x
.cws 64, 1, 1
.sgprsnum 13
.vgprsnum 4
.localsize 256
.floatmode 0xc0
.pgmrsrc1 0x00ac0040
.pgmrsrc2 0x0000008c
.dx10clamp
.ieeemode
.useargs
.priority 0
.arg _.global_offset_0, "size_t", long
.arg _.global_offset_1, "size_t", long
.arg _.global_offset_2, "size_t", long
.arg _.printf_buffer, "size_t", void*, global, , rdonly
.arg _.vqueue_pointer, "size_t", long
.arg _.aqlwrap_pointer, "size_t", long
.arg res, "int*", int*, global,
.text
s_mov_b32 m0, 0xffff
s_lshl_b32 s0, s6, 6
v_add_u32 v0, vcc, s0, v0
s_load_dwordx2 s[0:1], s[4:5], 0x0
s_waitcnt lgkmcnt(0)
v_add_u32 v1, vcc, s0, v0
v_lshlrev_b32 v1, 2, v1
v_mov_b32 v2, 0
v_mov_b32 v3, 1
s_load_dwordx2 s[2:3], s[4:5], 0x30
ds_write_b32 v1, v2 gds
s_waitcnt lgkmcnt(0)
ds_add_u32 v1, v3 gds
s_waitcnt lgkmcnt(0)
ds_read_b32 v2, v1 gds
v_add_u32 v1, s[4:5], v0, s0
v_mov_b32 v0, 0
v_ashrrev_i64 v[0:1], 30, v[0:1]
v_add_u32 v0, vcc, s2, v0
v_mov_b32 v3, s3
v_addc_u32 v1, vcc, v3, v1, vcc
s_waitcnt lgkmcnt(0)
flat_store_dword v[0:1], v2
s_endpgm‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍‍

I ran this kernel and it filled buffer res with zeroes instead of ones. I tried several variants of this kernel and at all times 

I got zeroes from GDS.

I ran these kernels in CodeXL and it showed that GDS version really uses GDS instructions and does not use LDS instructions.

I tried on Windows 10 and Ubuntu 18 with latest drivers and previous version and every time I got zeroes from GDS.

What I am doing wrong?

0 Likes
4 Replies
kbala
Adept I

Which disassembly options do you use?

0 Likes

kbala

clrxdisasm with -mdCfs .

clrxasm with a single option -o to set output file.

0 Likes
lolliedieb
Adept II

I have to admit I got not experience with the GDS under Windows.
But in Linux there is a difference in the size of the usable GDS between rocm drivers and the amdgpu-pro ones. I personally tried to use a GDS fraction of 32k (0x8000 to m0) and it works well on rocm, while amdgpu-pro only returns 0 when trying to access it.  The maximum value I was able to put to m0 with the gpu-pro was 0x1000, so 4kByte - then it seems to return correct values. Could you test if similar restriction apply to your Windows drivers?

I tested with different values (from 0x100 to 0xffff to m0) but it always returns zeros.

0 Likes