michelrouzic

OpenCL LLVM quits when code accesses float3 elements

Discussion created by michelrouzic on Jun 7, 2020
Latest reply on Jun 10, 2020 by michelrouzic

When I try to compile my OpenCL code, clBuildProgram() only prints

LLVM ERROR: call to an undefined function "__floatuntisf"

and exits the whole program! I'm not able to get the compilation log because the program exists during the execution of clBuildProgram(). It seems that the problem has to do with my code that accesses elements from a float3 to put it in a float4, however this isn't very clear. I tried making a really succinct code that would reproduce the problem to no avail, so instead the best I can do is produce a cut-down version of my code that reproduces the problem. It seems like a compiler bug to me, like it produces a call to __floatuntisf() but can't link it. The problem seems to be in the lines 110 to 113.

 

I'm using the Radeon software version 20.5.1 but I also had the problem with an older version from February. GPU is an RX 570.

 

float Lab_L_to_linear(float t)
{
const float stn=6.f/29.f;
t = (t+0.16f) / 1.16f;
if (t > stn)
return t*t*t;
else
return 3.f*stn*stn*(t - 4.f/29.f);
}
float linear_to_Lab_L(float t)
{
const float thr = 6.f/29.f, thr3 = thr*thr*thr;
if (t > thr3)
t = cbrt(t);
else
t = t * 841.f/108.f + 4.f/29.f;
return 1.16f * t - 0.16f;
}
float hue_to_channel(float oh)
{
float t;
oh -= 3.f*floor((oh+1.f) * (1.f/3.f));
t = fabs(clamp(oh, -1.f, 1.f));
if (t <= 0.5f)
return 1.f;
else
return Lab_L_to_linear(2.f * (1.f - t));
}
float3 hsl_to_rgb_cw(float3 w, float3 hsl)
{
float3 rgb, rgbw;
float Y;
float4 rgba;
rgb.x = hue_to_channel(hsl.x);
rgb.y = hue_to_channel(hsl.x-1.f);
rgb.z = hue_to_channel(hsl.x-2.f);
rgbw = rgb * w;
Y = rgbw.x + rgbw.y + rgbw.z;
Y = hsl.z / Y;
rgb *= Y;
rgb = mix(rgb, hsl.z, hsl.y);
return rgb;
}
int idiv_ceil(int a, int b)
{
int d = a / b;
if (d*b < a)
d += 1;
return d;
}
ulong get_bits_in_stream(global uchar *stream, ulong start_bit, uint bit_count)
{
ulong r=0, b, start_byte, actual_start_bit;
int bits_to_read, b_sh;
uchar mask;
if (bit_count==0)
return 0;
start_byte = start_bit >> 3;
start_bit &= 7;
bits_to_read = min((int) (8-start_bit), (int) bit_count);
b_sh = max((int) 0, (int) (bit_count - bits_to_read));
while (bit_count > 0)
{
bits_to_read = min((int) (8-start_bit), (int) bit_count);
actual_start_bit = 8-start_bit - bits_to_read;
mask = (((1<<bits_to_read)-1) << actual_start_bit);
b = (stream[start_byte] & mask) >> actual_start_bit;
r |= b << b_sh;
b_sh = max((int) 0, (int) (b_sh-8));
bit_count -= bits_to_read;
start_bit = 0;
start_byte++;
}
return r;
}
ulong get_bits_in_stream_inc(global uchar *stream, ulong *start_bit, uint bit_count)
{
ulong r = get_bits_in_stream(stream, *start_bit, bit_count);
*start_bit += bit_count;
return r;
}
uint bits_to_mask(uint bits)
{
return (1UL << bits) - 1;
}
float bits_to_mul(uint bits)
{
return (float) ((1ULL << bits) - 1);
}
float3 compr_hsl_to_float3(int bits_ch, int bits_cs, int bits_cl, int ih, int is, int il)
{
float3 hsl;
hsl.x = (float) ih / (bits_to_mul(bits_ch)-1.f);
hsl.y = (float) (is+1) / (bits_to_mul(bits_cs)+1.f);
hsl.z = (float) il / bits_to_mul(bits_cl);
if (ih == bits_to_mask(bits_ch))
{
hsl.x = 0.f;
hsl.y = 0.f;
}
return hsl;
}
float4 compr_hsl_to_float4(float3 hsl)
{
float4 rgb;
float3 w = (float3) (0.124f, 0.686f, 0.19f), rgb3;
hsl.x *= 3.f;
hsl.z = Lab_L_to_linear(hsl.z);
rgb3 = hsl_to_rgb_cw(w, hsl);
rgb.x = linear_to_Lab_L(rgb3.x);
rgb.y = linear_to_Lab_L(rgb3.y);
rgb.z = linear_to_Lab_L(rgb3.z);
rgb.w = 1.f;
return rgb;
}
float4 read_compressed_texture1_pixel(global uchar *d8, int2 im_dim, int2 i)
{
global ushort *d16 = (global ushort *) d8;
float4 pv;
ulong di, blocks_start = 80;
int block_size, bits_per_block, quincunx, bits_ch, bits_cs, bits_cl, bits_per_pixel;
int linew0, linew1, line_count0, line_count1;
int h0, s0, l0, h1, s1, l1, pix, qoff;
int2 block_start, ib;
float4 col0, col1;
block_size = d16[0];
bits_per_block = d16[1];
quincunx = d8[5];
bits_ch = d8[6];
bits_cs = d8[7];
bits_cl = d8[8];
bits_per_pixel = d8[9];
qoff = (i.y&1) * quincunx * (block_size>>1);
block_start.y = (i.y / block_size);
block_start.x = (i.x + qoff) / block_size;
linew0 = idiv_ceil(im_dim.x, block_size);
linew1 = idiv_ceil(im_dim.x + quincunx*(block_size>>1), block_size);
line_count0 = block_start.y+1 >> 1;
line_count1 = block_start.y >> 1;
di = line_count0*linew0 + line_count1*linew1;
di += block_start.x;
di = di*block_size + blocks_start;
block_start *= block_size;
block_start.x -= qoff;
ib = i - block_start;
h0 = get_bits_in_stream_inc(d8, &di, bits_ch);
s0 = get_bits_in_stream_inc(d8, &di, bits_cs);
l0 = get_bits_in_stream_inc(d8, &di, bits_cl);
h1 = get_bits_in_stream_inc(d8, &di, bits_ch);
s1 = get_bits_in_stream_inc(d8, &di, bits_cs);
l1 = get_bits_in_stream_inc(d8, &di, bits_cl);
di += (ib.y*block_size + ib.x) * bits_per_pixel;
pix = get_bits_in_stream(d8, di, bits_per_pixel);
col0 = compr_hsl_to_float4(compr_hsl_to_float3(bits_ch, bits_cs, bits_cl, h0, s0, l0));
col1 = compr_hsl_to_float4(compr_hsl_to_float3(bits_ch, bits_cs, bits_cl, h1, s1, l1));
pv = mix(col0, col1, convert_float(pix) / bits_to_mul(bits_per_pixel));
return pv;
}
float4 read_fmt_pixel(const int fmt, global uchar *im, int2 im_dim, int2 i)
{
switch (fmt)
{
case 20:
return read_compressed_texture1_pixel(im, im_dim, i);
}
return 0.f;
}
float calc_flattop_weight(float2 pif, float2 i, float2 knee, float2 slope, float2 pscale)
{
float2 d, w;
d = fabs(pif - i);
d = max(d, knee);
w = slope * (d - pscale);
return w.x * w.y;
}
float4 image_filter_flattop(global float4 *im, int2 im_dim, const int fmt, float2 pif, float2 pscale, float2 slope)
{
float4 pv = 0.f;
float2 knee, i, start, end;
knee = 0.5f - fabs(fmod(pscale, 1.f) - 0.5f);
start = max(0.f, ceil(pif - pscale));
end = min(convert_float2(im_dim - 1), floor(pif + pscale));
for (i.y = start.y; i.y <= end.y; i.y+=1.f)
for (i.x = start.x; i.x <= end.x; i.x+=1.f)
pv += read_fmt_pixel(fmt, im, im_dim, convert_int2(i)) * calc_flattop_weight(pif, i, knee, slope, pscale);
return pv;
}
float4 blit_sprite_flattop(global uint *lei, global uchar *data_cl, float4 pv)
{
const int2 p = (int2) (get_global_id(0), get_global_id(1));
const float2 pf = convert_float2(p);
global float *lef = lei;
global float4 *im;
int2 im_dim;
int fmt;
float2 pscale, pos, pif, slope;
im = (global float4 *) &data_cl[lei[0]+(lei[1]<<32)];
im_dim.x = lei[2];
im_dim.y = lei[3];
pscale.x = lef[4];
pscale.y = lef[5];
pos.x = lef[6];
pos.y = lef[7];
fmt = lei[8];
slope.x = lef[9];
slope.y = lef[10];
pif = pscale * (pf + pos);
pscale = max(1.f, pscale);
pv += image_filter_flattop(im, im_dim, fmt, pif, pscale, slope);
return pv;
}
float4 draw_queue(global float *df, global int *poslist, global int *entrylist, global uchar *data_cl, const int sector_w, const int sector_size)
{
const int2 p = (int2) (get_global_id(0), get_global_id(1));
const int sec = (p.y >> sector_size) * sector_w + (p.x >> sector_size);
global int *di = (global int *) df;
int i, eli, entry_count, qi;
float4 pv = 0.f;
int brlvl = 0;
float4 br[4];
eli = poslist[sec];
if (eli < 0)
return pv;
entry_count = entrylist[eli];
for (i=0; i < entry_count; i++)
{
qi = entrylist[eli + i + 1];
pv = blit_sprite_flattop(&df[qi+1], data_cl, pv);
}
return pv;
}
kernel void draw_queue_srgb_kernel(const ulong df_index, const ulong poslist_index, const ulong entrylist_index, global uchar *data_cl, write_only image2d_t srgb, const int sector_w, const int sector_size, const int randseed)
{
const int2 p = (int2) (get_global_id(0), get_global_id(1));
const int fbi = p.y * get_global_size(0) + p.x;
float4 pv;
global float *df = &data_cl[df_index];
global int *poslist = &data_cl[poslist_index];
global int *entrylist = &data_cl[entrylist_index];
pv = draw_queue(df, poslist, entrylist, data_cl, sector_w, sector_size);
write_imagef(srgb, p, pv);
}

Outcomes