cancel
Showing results for 
Search instead for 
Did you mean: 

Archives Discussions

zoomzoom
Adept I

Returned size of kernarg segment is not a multiple of 16

According to the HSA specifications: "The size of the kernel's kernarg segment variables is the size required for the kernarg segment variables and padding, rounded up to be a multiple of 16. The alignment of the base address of the kernel's kernarg segment variables is the larger of 16 bytes and the maximum alignment of the kernel's kernarg segment variables." (HSA-PRM-1.02.pdf page).

Also, HSA-Runtime-1.0.pdf page says that: "Alignment (in bytes) of the buffer used to pass arguments to the kernel, which is the maximum of 16 and the maximum alignment of any of the kernel arguments." This would imply that the kernel argument size should also be a multiple of max(16, max alignment of any of the kernel arguments).

However, querying HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE does not seem to follow either of the above specifications. The size of the kernarg segment is always:

max(alignment of the kernel arguments) * (kernel arguments count - 1) + sizeof(last kernel argument)

For this reason, the vector_copy sample that comes with the HSA Runtime or CLOC does not work when adding new parameters to the kernel. The reason is that the updated args data structure and the kernarg segment have different sizes.

Is the value returned by HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE incorrect or am I missing something in the HSA specifications?

Below are some examples of values returned for kernarg segment size for kernels with different number of parameters:

args | sizeof(args) | kernel parameters | kernarg segment size
    struct __attribute__ ((aligned(16))) args_t {
        uint64_t global_offset_0;
        uint64_t global_offset_1;
        uint64_t global_offset_2;
        uint64_t printf_buffer;
        uint64_t vqueue_pointer;
        uint64_t aqlwrap_pointer;
        void* in;
        void* out;
    } args;
64
prog kernel &__OpenCL_vector_copy_kernel(
        kernarg_u64 %__global_offset_0,
        kernarg_u64 %__global_offset_1,
        kernarg_u64 %__global_offset_2,
        kernarg_u64 %__printf_buffer,
        kernarg_u64 %__vqueue_pointer,
        kernarg_u64 %__aqlwrap_pointer,
        kernarg_u64 %in,
        kernarg_u64 %out)
64
    struct __attribute__ ((aligned(16))) args_t {
        uint64_t global_offset_0;
        uint64_t global_offset_1;
        uint64_t global_offset_2;
        uint64_t printf_buffer;
        uint64_t vqueue_pointer;
        uint64_t aqlwrap_pointer;
        void* in1;
        void* in2;
        void* out;
    } args;
80
prog kernel &__OpenCL_vector_copy_kernel(
        kernarg_u64 %__global_offset_0,
        kernarg_u64 %__global_offset_1,
        kernarg_u64 %__global_offset_2,
        kernarg_u64 %__printf_buffer,
        kernarg_u64 %__vqueue_pointer,
        kernarg_u64 %__aqlwrap_pointer,
        kernarg_u64 %in1,
        kernarg_u64 %in2,
        kernarg_u64 %out)
72
    struct __attribute__ ((aligned(16))) args_t {
        uint64_t global_offset_0;
        uint64_t global_offset_1;
        uint64_t global_offset_2;
        uint64_t printf_buffer;
        uint64_t vqueue_pointer;
        uint64_t aqlwrap_pointer;
        void* in1;
        void* in2;
        void* out;
        int offset;
    } args;
80
prog kernel &__OpenCL_vector_copy_kernel(
        kernarg_u64 %__global_offset_0,
        kernarg_u64 %__global_offset_1,
        kernarg_u64 %__global_offset_2,
        kernarg_u64 %__printf_buffer,
        kernarg_u64 %__vqueue_pointer,
        kernarg_u64 %__aqlwrap_pointer,
        kernarg_u64 %in1,
        kernarg_u64 %in2,
        kernarg_u64 %out,
        kernarg_u32 %offset)
76
    struct __attribute__ ((aligned(16))) args_t {
        uint64_t global_offset_0;
        uint64_t global_offset_1;
        uint64_t global_offset_2;
        uint64_t printf_buffer;
        uint64_t vqueue_pointer;
        uint64_t aqlwrap_pointer;
        void* in1;
        void* in2;
        void* in3;
        int offset;
        void* out;
    } args;
96
prog kernel &__OpenCL_vector_copy_kernel(
        kernarg_u64 %__global_offset_0,
        kernarg_u64 %__global_offset_1,
        kernarg_u64 %__global_offset_2,
        kernarg_u64 %__printf_buffer,
        kernarg_u64 %__vqueue_pointer,
        kernarg_u64 %__aqlwrap_pointer,
        kernarg_u64 %in1,
        kernarg_u64 %in2,
        kernarg_u64 %in3,
        kernarg_u32 %offset,
        kernarg_u64 %out)
88
0 Likes
1 Solution
jedwards
Staff

This is fixed in release version release-v1.0f.2.

View solution in original post

0 Likes
2 Replies
jedwards
Staff

You are not missing anything. There is a defect in the HSA runtime that will be corrected in a future release.

jedwards
Staff

This is fixed in release version release-v1.0f.2.

0 Likes