Answered (Assumed Answered)

read_image performance

Question asked by ivan on Feb 13, 2013
Latest reply on Feb 13, 2013 by ivan

Hi there,


Why is the read_imageui API always translated into two sample_id()_sampler() calls guarded by a conditional branch? For example, this simple code:


__private int2 coords;

__read_only image2d_t img;





is compiled into the following sequence in the IL file:



dcl_literal l42, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF; f128:i128 1844674407370955161518446744073709551615


if_logicalnz l42

round_neginf r83, r83

mov r84.x___, l16

sample_id(0)_sampler(0)_coordtype(unnormalized) r83, r83


mov r84, cb1[0]

itof r84, r84

mov r84, r84

mov r88, r84

mov r89, l43

mov r84, l11

cmov_logical r88, r84, r88, r89

mov r88, r88

mul_ieee r88, r88, r83

mov r83, r88

mov r83, r83

round_neginf r88, r88

mov r88, r88

mov r88, r88

cmov_logical r83, r84, r83, r88

mov r83, r83

mov r84, cb1[1]

mov r84, r84

mul_ieee r83, r83, r84

mov r84.x___, l16

sample_id(0)_sampler(0)_coordtype(normalized) r83, r83




I'm porting CUDA code and have multiple calls to read_imageui in my kernel, but it runs much slower than the CUDA version. I wonder where the bottleneck is.