Hi,
I'm writing a deferred renderer using the tile based deferred shading approach. I've written my kernel to compute the lighting, but it freezes my video output.
I guess the video card is overloaded, because only the video output freezes, if I listen to some music in the background, then it keeps playing.
If I remove the atomic operations from the kernel, then it runs fine, but the lights aren't culled this way.
so here's the kernel:
const float far = -10000.0f; //far plane distance
const float near = -1.0f; //near plane distance
const sampler_t the_sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
const float cutoff = 0.25f; //0.005f
const int attenuation_type = 0; //linear or full attenuation?
float my_abs(float var) //these floating point operations aren't supported in opencl 1.1
{
if(var < 0)
{
return -var;
}
else
{
return var;
}
}
float my_mix(float x, float y, float weigth)
{
return x * (1.0f - weigth) + y * weigth;
}
float3 my_reflect(float3 incident, float3 normal)
{
return incident - 2.0 * dot(normal, incident) * normal;
}
float3 decode_normals_spheremap(float4 n) //decode normals from spheremap encoding
{
float4 nn = n * (float4)(2.0, 2.0, 0.0, 0.0) + (float4)(-1.0, -1.0, 1.0, -1.0);
float l = dot(nn.xyz, -nn.xyw);
nn.z = l;
nn.xy *= sqrt(l);
return nn.xyz * 2.0 + (float3)(0.0, 0.0, -1.0);
}
float3 decode_linear_depth(float4 linear_depth, float4 position) //decode linear depth into view space position
{
return (float3)(position.xy * (far / position.z), far) * linear_depth.x;
}
__kernel void main(__read_only image2d_t albedo, //diffuse surface color from the g-buffer
__read_only image2d_t normals, //normals encoded using spheremap encoding
__read_only image2d_t depth, //linear depth
__write_only image2d_t result, //the output buffer that stores lighting data
__global const float* far_plane, //the lower left and upper right corners of the far plane
__global const float* in_view_pos, //view space camera position
__global const float* in_lights, //1024 lights {light pos[3], diffuse_color[3], radius[1], specular intensity[1] }
__global const float* in_num_of_lights, //number of incoming lights (1024)
__global const float* in_projection_matrix) //the projection matrix is used for frustum culling
{
/*
* Per pixel calculations (global)
*/
int2 coords = (int2)(get_global_id(0), get_global_id(1));
float4 raw_albedo = read_imagef(albedo, the_sampler, coords);
float4 raw_normal = read_imagef(normals, the_sampler, coords); //this will store the decoded normals
if(length(raw_normal.xy) == 0.0f)
{
write_imagef(result, coords, raw_albedo);
return;
}
int2 global_size = (int2)(get_global_size(0), get_global_size(1));
float4 ll, ur;
float4 raw_depth = read_imagef(depth, the_sampler, coords); //this will store the decoded view space position
ll = (float4)(far_plane[0], far_plane[1], far_plane[2], 1.0f);
ur = (float4)(far_plane[3], far_plane[4], far_plane[5], 1.0f);
float2 texel = (float2)((float)(coords.x) / (float)(global_size.x), (float)(coords.y) / (float)(global_size.y)); //texture coordinate [0...1] for input processing
raw_depth.xyz = decode_linear_depth(raw_depth, (float4)(my_mix(ll.x, ur.x, texel.x), my_mix(ll.y, ur.y, texel.y), ll.z, 1.0f));
raw_normal.xyz = decode_normals_spheremap(raw_normal);
float4 out_color = (float4)(0.0f); //this will store the resulting color
out_color.w = 1.0f;
/*
* Per workgroup (tile) calculations (local)
*/
__local float3 view_pos; //this will store the view space position (uniform among the workgroups, but stored as local for speedup)
__local int num_of_lights; //num of lights (same here)
__local int2 local_coords;
local_coords = (int2)(get_local_id(0), get_local_id(1));
__local int2 local_size;
local_size = (int2)(get_local_size(0), get_local_size(1));
__local int workgroup_index;
workgroup_index = local_coords.y * local_size.x + local_coords.x;
__local int tile_lights[1024]; //index of the lights visible per tile
__local int num_of_tile_lights; //number of lights per tile
__local float2 tile_scale; //used for calculating frustum culling, taken from Intel's sample
__local float2 tile_bias;
__local float4 column_1;
__local float4 column_2;
__local float4 column_4;
__local float4 frustum_planes[6];
if(workgroup_index == 0)
{
view_pos = vload3(0, in_view_pos);
num_of_lights = (int)in_num_of_lights[0];
num_of_lights = 32;
num_of_tile_lights = 0;
//I'm not sure if the maths here is correct due to OpenGL and DirectX using different matrices, but this shouldnt matter
tile_scale = (float2)(global_size.x, global_size.y) * (1.0f / (float)(2.0f * local_size.x));
tile_bias = tile_scale - (float2)(local_coords.x, local_coords.y);
column_1 = (float4)(in_projection_matrix[5] * tile_scale.x, 0.0f, tile_bias.x, 0.0f);
column_2 = (float4)(0.0f, -in_projection_matrix[10] * tile_scale.y, tile_bias.y, 0.0f);
column_4 = (float4)(0.0f, 0.0f, 1.0f, 0.0f);
frustum_planes[0] = column_4 - column_1;
frustum_planes[1] = column_4 + column_1;
frustum_planes[2] = column_4 - column_2;
frustum_planes[3] = column_4 + column_2;
frustum_planes[4] = (float4)(0.0f, 0.0f, -1.0f, near);
frustum_planes[5] = (float4)(0.0f, 0.0f, 1.0f, far);
for(int c = 0; c < 4; c++) //normalize frustum plane normals
{
frustum_planes
.xyz *= 1.0f / length(frustum_planes .xyz); }
}
barrier(CLK_LOCAL_MEM_FENCE);
for(int c = workgroup_index; c < num_of_lights; c += local_size.x * local_size.y) //cull each light per tile, each thread in a tile processes one light
{
bool in_frustum = true;
float attenuation_end = 0.0f;
if(attenuation_type == 0)
{
attenuation_end = (float)(in_lights[c * 8 + 6]) / (float)(cutoff); //radius / cutoff
}
else
{
attenuation_end = (float)(in_lights[c * 8 + 6]); //radius
}
for(int d = 0; d < 6; d++) //cull each light based on the distance where it will shine and the frustum defined by the tile
{
float e = dot(frustum_planes
, (float4)(in_lights[c * 8 + 0], in_lights[c * 8 + 1], in_lights[c * 8 + 2], 1.0f)); in_frustum = in_frustum && (e >= -attenuation_end);
}
if(in_frustum) //if the light is in the frustum, then store its index (if I comment this out, the kernel runs, but doesn't cull lights)
{
int index = atomic_inc(&num_of_tile_lights);
tile_lights[index] = c;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
/*
* Per light calculations
*/
for(int c = 0; c < num_of_tile_lights; c++) //draw each light per tile
{
int index = tile_lights
; //get back the light index float3 light_pos = (float3)(in_lights[index * 8 + 0], in_lights[index * 8 + 1], in_lights[index * 8 + 2]); //gather light data using the index
float light_radius = in_lights[index * 8 + 6];
//calculate blinn-phong lighting with custom attenuation
float3 light_dir = light_pos - raw_depth.xyz;
float distance = length(light_dir);
light_dir /= distance;
float coeff, attenuation;
if(attenuation_type == 0)
{
coeff = max(distance - light_radius, 0.0f) / light_radius + 1.0f;
attenuation = max((1.0f / (coeff * coeff) - cutoff) / (1.0f - cutoff), 0.0f);
}
else
{
attenuation = (light_radius - distance) / (light_radius * 0.01f) * 0.01f;
}
if(attenuation > 0.0f)
{
float3 light_diffuse_color = (float3)(in_lights[index * 8 + 3], in_lights[index * 8 + 4], in_lights[index * 8 + 5]);
float light_specular_power = (float)in_lights[index * 8 + 3];
float3 view_dir = normalize(view_pos - raw_depth.xyz);
float3 half_vector = (light_dir + view_dir) * 0.5f;
float n_dot_l = max(dot(raw_normal.xyz, light_dir), 0.0f);
out_color.xyz += raw_albedo.xyz * light_diffuse_color * n_dot_l * attenuation;
float n_dot_h = pow(max(dot(raw_normal.xyz, half_vector), 0.0f), light_specular_power);
out_color.xyz += light_diffuse_color * n_dot_h * attenuation;
}
}
write_imagef(result, coords, out_color); //write the calculated light data to the result buffer (texture)
}
Any ideas what going wrong?
Best regards,
Yours3!f
Solved! Go to Solution.
This kernel violates the OpenCL specification.
All work-items in a work-group must hit a barrier. However at lines 61-65, you have a conditional data dependent early exit, but a barrier after the early exit at line 137. This produces undefined behavior on the hardware, including hardware hangs.
This kernel violates the OpenCL specification.
All work-items in a work-group must hit a barrier. However at lines 61-65, you have a conditional data dependent early exit, but a barrier after the early exit at line 137. This produces undefined behavior on the hardware, including hardware hangs.
thanks MicahVillmow for the quick reply!
I tried putting a barrier(CLK_LOCAL_MEM_FENCE); after 61-65 but it didn't solve the problem.
yours3lf,
Yeah, because now only the threads that leave early are hitting that barrier and not the other barrier. The only way to fix this is re-organizing your algorithm so that ALL threads hit every barrier.
So instead of doing
if (a) return
....
barrier
...
barrier
...
Do:
if (!a) {
}
barrier
if (!a) {
}
barrier
if (!a) {
}
ok, so I removed the early rejection just to see if it's working, and it didn't freeze (and I got 70FPS, hurray!)
but I still don't understand how should I implement the barriers and conditionals
so that's clear that doing this is wrong because the early rejected threads still wouldn't reach the barrier:
[...]
if(length(raw_normal.xy) == 0.0f)
{
write_imagef(result, coords, raw_albedo);
return;
}
barrier(CLK_LOCAL_MEM_FENCE); //threads entering the if conditional are not going to reach this line, because of the return instruction
[...]
so I tried to reorganize the way you suggested, but it doesn't solve anything:
[...]
if(length(raw_normal.xy) != 0.0f)
{
//logically here would come the lighting part, but then the rejected threads wouldn't reach the barriers at the lighting part...
}
barrier(CLK_LOCAL_MEM_FENCE); //this way the conditional is useless, isn't it?
[...]
or I must have misunderstood something...
What you need is something like this:
thanks, now I understand!
I tried it out, and now it works
I shall now proceed to make culling actually work, so that lights are visible xD
Thanks again for the help!!!