AnsweredAssumed Answered

read_image performance

Question asked by ivan on Feb 13, 2013
Latest reply on Feb 13, 2013 by ivan

Hi there,

 

Why is the read_imageui API always translated into 2 calls of sample_id()_sampler() with a conditional branch? For example, this simple code

=========

__private int2 coords;

__read_only image2d_t img;

const sampler_t sampler=CLK_NORMALIZED_COORDS_FALSE|CLK_ADDRESS_NONE|CLK_FILTER_NEAREST;

read_imageui(img,sampler,coords);

=========

 

is compiled into the following sequence in IL file:

 

=========

dcl_literal l42, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF; f128:i128 1844674407370955161518446744073709551615

...

if_logicalnz l42

round_neginf r83, r83

mov r84.x___, l16

sample_id(0)_sampler(0)_coordtype(unnormalized) r83, r83

else

mov r84, cb1[0]

itof r84, r84

mov r84, r84

mov r88, r84

mov r89, l43

mov r84, l11

cmov_logical r88, r84, r88, r89

mov r88, r88

mul_ieee r88, r88, r83

mov r83, r88

mov r83, r83

round_neginf r88, r88

mov r88, r88

mov r88, r88

cmov_logical r83, r84, r83, r88

mov r83, r83

mov r84, cb1[1]

mov r84, r84

mul_ieee r83, r83, r84

mov r84.x___, l16

sample_id(0)_sampler(0)_coordtype(normalized) r83, r83

endif

==========

 

I'm porting CUDA code and have multiple calls to read_imageui in my kernel, but it runs much slower than the CUDA version. Where is the bottleneck?

Outcomes