2 Replies Latest reply on Jun 16, 2015 11:37 AM by jedwards

    Returned size of kernarg segment is not a multiple of 16

    zoomzoom

      According to the HSA specifications: "The size of the kernel's kernarg segment variables is the size required for the kernarg segment variables and padding, rounded up to be a multiple of 16. The alignment of the base address of the kernel's kernarg segment variables is the larger of 16 bytes and the maximum alignment of the kernel's kernarg segment variables." (HSA-PRM-1.02.pdf).

       

      Also, HSA-Runtime-1.0.pdf says: "Alignment (in bytes) of the buffer used to pass arguments to the kernel, which is the maximum of 16 and the maximum alignment of any of the kernel arguments." This would imply that the kernel argument size should also be a multiple of max(16, maximum alignment of any of the kernel arguments).

       

      However, the value returned when querying HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE does not seem to follow either of the above specifications. Instead, the size of the kernarg segment is always:

      max(alignment of the kernel arguments) * (kernel arguments count - 1) + sizeof(last kernel argument)

       

      As a result, the vector_copy sample that comes with the HSA Runtime or CLOC stops working when new parameters are added to the kernel, because the updated args data structure and the kernarg segment then have different sizes.

       

      Is the value returned by HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE incorrect or am I missing something in the HSA specifications?

       

      Below are some examples of values returned for kernarg segment size for kernels with different number of parameters:

      args | sizeof(args) | kernel parameters | kernarg segment size
          struct __attribute__ ((aligned(16))) args_t {
              uint64_t global_offset_0;
              uint64_t global_offset_1;
              uint64_t global_offset_2;
              uint64_t printf_buffer;
              uint64_t vqueue_pointer;
              uint64_t aqlwrap_pointer;
              void* in;
              void* out;
          } args;
      
      64
      prog kernel &__OpenCL_vector_copy_kernel(
              kernarg_u64 %__global_offset_0,
              kernarg_u64 %__global_offset_1,
              kernarg_u64 %__global_offset_2,
              kernarg_u64 %__printf_buffer,
              kernarg_u64 %__vqueue_pointer,
              kernarg_u64 %__aqlwrap_pointer,
              kernarg_u64 %in,
              kernarg_u64 %out)
      
      64
          struct __attribute__ ((aligned(16))) args_t {
              uint64_t global_offset_0;
              uint64_t global_offset_1;
              uint64_t global_offset_2;
              uint64_t printf_buffer;
              uint64_t vqueue_pointer;
              uint64_t aqlwrap_pointer;
              void* in1;
              void* in2;
              void* out;
          } args;
      
      80
      prog kernel &__OpenCL_vector_copy_kernel(
              kernarg_u64 %__global_offset_0,
              kernarg_u64 %__global_offset_1,
              kernarg_u64 %__global_offset_2,
              kernarg_u64 %__printf_buffer,
              kernarg_u64 %__vqueue_pointer,
              kernarg_u64 %__aqlwrap_pointer,
              kernarg_u64 %in1,
              kernarg_u64 %in2,
              kernarg_u64 %out)
      
      72
          struct __attribute__ ((aligned(16))) args_t {
              uint64_t global_offset_0;
              uint64_t global_offset_1;
              uint64_t global_offset_2;
              uint64_t printf_buffer;
              uint64_t vqueue_pointer;
              uint64_t aqlwrap_pointer;
              void* in1;
              void* in2;
              void* out;
              int offset;
          } args;
      
      80
      prog kernel &__OpenCL_vector_copy_kernel(
              kernarg_u64 %__global_offset_0,
              kernarg_u64 %__global_offset_1,
              kernarg_u64 %__global_offset_2,
              kernarg_u64 %__printf_buffer,
              kernarg_u64 %__vqueue_pointer,
              kernarg_u64 %__aqlwrap_pointer,
              kernarg_u64 %in1,
              kernarg_u64 %in2,
              kernarg_u64 %out,
              kernarg_u32 %offset)
      
      76
          struct __attribute__ ((aligned(16))) args_t {
              uint64_t global_offset_0;
              uint64_t global_offset_1;
              uint64_t global_offset_2;
              uint64_t printf_buffer;
              uint64_t vqueue_pointer;
              uint64_t aqlwrap_pointer;
              void* in1;
              void* in2;
              void* in3;
              int offset;
              void* out;
          } args;
      
      96
      prog kernel &__OpenCL_vector_copy_kernel(
              kernarg_u64 %__global_offset_0,
              kernarg_u64 %__global_offset_1,
              kernarg_u64 %__global_offset_2,
              kernarg_u64 %__printf_buffer,
              kernarg_u64 %__vqueue_pointer,
              kernarg_u64 %__aqlwrap_pointer,
              kernarg_u64 %in1,
              kernarg_u64 %in2,
              kernarg_u64 %in3,
              kernarg_u32 %offset,
              kernarg_u64 %out)
      
      88