Archives Discussions

yours3lf · ‎02-02-2012

Hi,

I'm writing a deferred renderer using the tile based deferred shading approach. I've written my kernel to compute the lighting, but it freezes my video output.

I guess the video card is overloaded, because only the video output freezes, if I listen to some music in the background, then it keeps playing.

If I remove the atomic operations from the kernel, then it runs fine, but the lights aren't culled this way.

so here's the kernel:

const float far = -10000.0f; //far plane distance
const float near = -1.0f; //near plane distance
const sampler_t the_sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
const float cutoff = 0.25f; //0.005f
const int attenuation_type = 0; //linear or full attenuation?
float my_abs(float var) //these floating point operations aren't supported in opencl 1.1
{
  if(var < 0)
  {
    return -var;
  }
  else
  {
    return var;
  }
}
float my_mix(float x, float y, float weigth)
{
  return x * (1.0f - weigth) + y * weigth;
}
float3 my_reflect(float3 incident, float3 normal)
{
  return incident - 2.0 * dot(normal, incident) * normal;
}
float3 decode_normals_spheremap(float4 n) //decode normals from spheremap encoding
{
  float4 nn = n * (float4)(2.0, 2.0, 0.0, 0.0) + (float4)(-1.0, -1.0, 1.0, -1.0);
  float l = dot(nn.xyz, -nn.xyw);
  nn.z = l;
  nn.xy *= sqrt(l);
  return nn.xyz * 2.0 + (float3)(0.0, 0.0, -1.0);
}
float3 decode_linear_depth(float4 linear_depth, float4 position) //decode linear depth into view space position
{
  return (float3)(position.xy * (far / position.z), far) * linear_depth.x;
}
__kernel void main(__read_only image2d_t albedo, //diffuse surface color from the g-buffer
           __read_only image2d_t normals,  //normals encoded using spheremap encoding
           __read_only image2d_t depth, //linear depth
           __write_only image2d_t result, //the output buffer that stores lighting data
           __global const float* far_plane, //the lower left and upper right corners of the far plane
           __global const float* in_view_pos, //view space camera position
           __global const float* in_lights, //1024 lights {light pos[3], diffuse_color[3], radius[1], specular intensity[1] }
           __global const float* in_num_of_lights, //number of incoming lights (1024)
           __global const float* in_projection_matrix) //the projection matrix is used for frustum culling
{
  /*
   * Per pixel calculations (global)
   */
  int2 coords = (int2)(get_global_id(0), get_global_id(1));
  float4 raw_albedo = read_imagef(albedo, the_sampler, coords);
  float4 raw_normal = read_imagef(normals, the_sampler, coords); //this will store the decoded normals
  if(length(raw_normal.xy) == 0.0f)
  {
    write_imagef(result, coords, raw_albedo);
    return;
  }
  int2 global_size = (int2)(get_global_size(0), get_global_size(1));
  float4 ll, ur;
  float4 raw_depth = read_imagef(depth, the_sampler, coords); //this will store the decoded view space position
  ll = (float4)(far_plane[0], far_plane[1], far_plane[2], 1.0f);
  ur = (float4)(far_plane[3], far_plane[4], far_plane[5], 1.0f);
  float2 texel = (float2)((float)(coords.x) / (float)(global_size.x), (float)(coords.y) / (float)(global_size.y)); //texture coordinate [0...1] for input processing
  raw_depth.xyz = decode_linear_depth(raw_depth, (float4)(my_mix(ll.x, ur.x, texel.x), my_mix(ll.y, ur.y, texel.y), ll.z, 1.0f));
  raw_normal.xyz = decode_normals_spheremap(raw_normal);
  float4 out_color = (float4)(0.0f); //this will store the resulting color
  out_color.w = 1.0f;
  /*
   * Per workgroup (tile) calculations (local)
   */
  __local float3 view_pos; //this will store the view space position (uniform among the workgroups, but stored as local for speedup)
  __local int num_of_lights; //num of lights (same here)
  __local int2 local_coords;
  local_coords = (int2)(get_local_id(0), get_local_id(1));
  __local int2 local_size;
  local_size = (int2)(get_local_size(0), get_local_size(1));
  __local int workgroup_index;
  workgroup_index = local_coords.y * local_size.x + local_coords.x;
  __local int tile_lights[1024]; //index of the lights visible per tile
  __local int num_of_tile_lights; //number of lights per tile
  __local float2 tile_scale; //used for calculating frustum culling, taken from Intel's sample
  __local float2 tile_bias;
  __local float4 column_1;
  __local float4 column_2;
  __local float4 column_4;
  __local float4 frustum_planes[6];
  if(workgroup_index == 0)
  {
    view_pos = vload3(0, in_view_pos);
    num_of_lights = (int)in_num_of_lights[0];
    num_of_lights = 32;
    num_of_tile_lights = 0;
    //I'm not sure if the maths here is correct due to OpenGL and DirectX using different matrices, but this shouldnt matter
    tile_scale = (float2)(global_size.x, global_size.y) * (1.0f / (float)(2.0f * local_size.x));
    tile_bias = tile_scale - (float2)(local_coords.x, local_coords.y);
    column_1 = (float4)(in_projection_matrix[5] * tile_scale.x, 0.0f, tile_bias.x, 0.0f);
    column_2 = (float4)(0.0f, -in_projection_matrix[10] * tile_scale.y, tile_bias.y, 0.0f);
    column_4 = (float4)(0.0f, 0.0f, 1.0f, 0.0f);
    frustum_planes[0] = column_4 - column_1;
    frustum_planes[1] = column_4 + column_1;
    frustum_planes[2] = column_4 - column_2;
    frustum_planes[3] = column_4 + column_2;
    frustum_planes[4] = (float4)(0.0f, 0.0f, -1.0f, near);
    frustum_planes[5] = (float4)(0.0f, 0.0f, 1.0f, far);
    for(int c = 0; c < 4; c++) //normalize frustum plane normals
    {
      frustum_planes.xyz *= 1.0f / length(frustum_planes.xyz);
    }
  }
  barrier(CLK_LOCAL_MEM_FENCE);
  for(int c = workgroup_index; c < num_of_lights; c += local_size.x * local_size.y) //cull each light per tile, each thread in a tile processes one light
  {
    bool in_frustum = true;
    float attenuation_end = 0.0f;
    if(attenuation_type == 0)
    {
      attenuation_end = (float)(in_lights[c * 8 + 6]) / (float)(cutoff); //radius / cutoff
    }
    else
    {
      attenuation_end = (float)(in_lights[c * 8 + 6]); //radius
    }
    for(int d = 0; d < 6; d++) //cull each light based on the distance where it will shine and the frustum defined by the tile
    {
      float e = dot(frustum_planes, (float4)(in_lights[c * 8 + 0], in_lights[c * 8 + 1], in_lights[c * 8 + 2], 1.0f));
      in_frustum = in_frustum && (e >= -attenuation_end);
    }
    if(in_frustum) //if the light is in the frustum, then store its index (if I comment this out, the kernel runs, but doesn't cull lights)
    {
      int index = atomic_inc(&num_of_tile_lights);
      tile_lights[index] = c;
    }
  }
  barrier(CLK_LOCAL_MEM_FENCE);
  /*
   * Per light calculations
   */
  for(int c = 0; c < num_of_tile_lights; c++) //draw each light per tile
  {
      int index = tile_lights; //get back the light index
      float3 light_pos = (float3)(in_lights[index * 8 + 0], in_lights[index * 8 + 1], in_lights[index * 8 + 2]); //gather light data using the index
      float light_radius = in_lights[index * 8 + 6];
      //calculate blinn-phong lighting with custom attenuation
      float3 light_dir = light_pos - raw_depth.xyz;
      float distance = length(light_dir);
      light_dir /= distance;
      float coeff, attenuation;
      if(attenuation_type == 0)
      {
    coeff = max(distance - light_radius, 0.0f) / light_radius + 1.0f;
    attenuation = max((1.0f / (coeff * coeff) - cutoff) / (1.0f - cutoff), 0.0f);
      }
      else
      {
    attenuation = (light_radius - distance) / (light_radius * 0.01f) * 0.01f;
      }
      if(attenuation > 0.0f)
      {
    float3 light_diffuse_color = (float3)(in_lights[index * 8 + 3], in_lights[index * 8 + 4], in_lights[index * 8 + 5]);
    float light_specular_power = (float)in_lights[index * 8 + 3];
    float3 view_dir = normalize(view_pos - raw_depth.xyz);
    
    float3 half_vector = (light_dir + view_dir) * 0.5f;
    float n_dot_l = max(dot(raw_normal.xyz, light_dir), 0.0f);
    out_color.xyz += raw_albedo.xyz * light_diffuse_color * n_dot_l * attenuation;
    float n_dot_h = pow(max(dot(raw_normal.xyz, half_vector), 0.0f), light_specular_power);
    out_color.xyz += light_diffuse_color * n_dot_h * attenuation;
      }
  }
  write_imagef(result, coords, out_color); //write the calculated light data to the result buffer (texture)
}

Any ideas what going wrong?

Best regards,

Yours3!f

MicahVillmow · ‎02-02-2012

This kernel violates the OpenCL specification.

All work-items in a work-group must hit a barrier. However at lines 61-65, you have a conditional data dependent early exit, but a barrier after the early exit at line 137. This produces undefined behavior on the hardware, including hardware hangs.

View solution in original post

MicahVillmow · ‎02-02-2012

This kernel violates the OpenCL specification.

All work-items in a work-group must hit a barrier. However at lines 61-65, you have a conditional data dependent early exit, but a barrier after the early exit at line 137. This produces undefined behavior on the hardware, including hardware hangs.

yours3lf · ‎02-02-2012

thanks MicahVillmow for the quick reply!

I tried putting a barrier(CLK_LOCAL_MEM_FENCE); after 61-65 but it didn't solve the problem.

MicahVillmow · ‎02-02-2012

yours3lf,

Yeah, because now only the threads that leave early are hitting that barrier and not the other barrier. The only way to fix this is re-organizing your algorithm so that ALL threads hit every barrier.

So instead of doing

if (a) return

....

barrier

...

barrier

...

Do:

if (!a) {

}

barrier

if (!a) {

}

barrier

if (!a) {

}

yours3lf · ‎02-02-2012

ok, so I removed the early rejection just to see if it's working, and it didn't freeze (and I got 70FPS, hurray!)

but I still don't understand how should I implement the barriers and conditionals

so that's clear that doing this is wrong because the early rejected threads still wouldn't reach the barrier:

[...]
if(length(raw_normal.xy) == 0.0f)
  {
    write_imagef(result, coords, raw_albedo);
    return;
  }
  barrier(CLK_LOCAL_MEM_FENCE); //threads entering the if conditional are not going to reach this line, because of the return instruction
[...]

so I tried to reorganize the way you suggested, but it doesn't solve anything:

[...]
if(length(raw_normal.xy) != 0.0f)
{
  //logically here would come the lighting part, but then the rejected threads wouldn't reach the barriers at the lighting part...
}
barrier(CLK_LOCAL_MEM_FENCE); //this way the conditional is useless, isn't it?
[...]

or I must have misunderstood something...

MicahVillmow · ‎02-02-2012

What you need is something like this:

__kernel void main(__read_only image2d_t albedo, //diffuse surface color from the g-buffer
__read_only image2d_t normals, //normals encoded using spheremap encoding
__read_only image2d_t depth, //linear depth
__write_only image2d_t result, //the output buffer that stores lighting data
__global const float* far_plane, //the lower left and upper right corners of the far plane
__global const float* in_view_pos, //view space camera position
__global const float* in_lights, //1024 lights {light pos[3], diffuse_color[3], radius[1], specular intensity[1] }
__global const float* in_num_of_lights, //number of incoming lights (1024)
__global const float* in_projection_matrix) //the projection matrix is used for frustum culling
{
/*
* Per pixel calculations (global)
*/
int2 coords = (int2)(get_global_id(0), get_global_id(1));
float4 raw_albedo = read_imagef(albedo, the_sampler, coords);
float4 raw_normal = read_imagef(normals, the_sampler, coords); //this will store the decoded normals
float4 out_color = (float4)(0.0f); //this will store the resulting color
if(length(raw_normal.xy) == 0.0f)
{
out_color = raw_albedo;
} else {
int2 global_size = (int2)(get_global_size(0), get_global_size(1));
float4 ll, ur;
float4 raw_depth = read_imagef(depth, the_sampler, coords); //this will store the decoded view space position
ll = (float4)(far_plane[0], far_plane[1], far_plane[2], 1.0f);
ur = (float4)(far_plane[3], far_plane[4], far_plane[5], 1.0f);
float2 texel = (float2)((float)(coords.x) / (float)(global_size.x), (float)(coords.y) / (float)(global_size.y)); //texture coordinate [0...1] for input processing
raw_depth.xyz = decode_linear_depth(raw_depth, (float4)(my_mix(ll.x, ur.x, texel.x), my_mix(ll.y, ur.y, texel.y), ll.z, 1.0f));
raw_normal.xyz = decode_normals_spheremap(raw_normal);
out_color.w = 1.0f;
/*
* Per workgroup (tile) calculations (local)
*/
__local float3 view_pos; //this will store the view space position (uniform among the workgroups, but stored as local for speedup)
__local int num_of_lights; //num of lights (same here)
__local int2 local_coords;
local_coords = (int2)(get_local_id(0), get_local_id(1));
__local int2 local_size;
local_size = (int2)(get_local_size(0), get_local_size(1));
__local int workgroup_index;
workgroup_index = local_coords.y * local_size.x + local_coords.x;
__local int tile_lights[1024]; //index of the lights visible per tile
__local int num_of_tile_lights; //number of lights per tile
__local float2 tile_scale; //used for calculating frustum culling, taken from Intel's sample
__local float2 tile_bias;
__local float4 column_1;
__local float4 column_2;
__local float4 column_4;
__local float4 frustum_planes[6];
if(workgroup_index == 0)
{
view_pos = vload3(0, in_view_pos);
num_of_lights = (int)in_num_of_lights[0];
num_of_lights = 32;
num_of_tile_lights = 0;
//I'm not sure if the maths here is correct due to OpenGL and DirectX using different matrices, but this shouldnt matter
tile_scale = (float2)(global_size.x, global_size.y) * (1.0f / (float)(2.0f * local_size.x));
tile_bias = tile_scale - (float2)(local_coords.x, local_coords.y);
column_1 = (float4)(in_projection_matrix[5] * tile_scale.x, 0.0f, tile_bias.x, 0.0f);
column_2 = (float4)(0.0f, -in_projection_matrix[10] * tile_scale.y, tile_bias.y, 0.0f);
column_4 = (float4)(0.0f, 0.0f, 1.0f, 0.0f);
frustum_planes[0] = column_4 - column_1;
frustum_planes[1] = column_4 + column_1;
frustum_planes[2] = column_4 - column_2;
frustum_planes[3] = column_4 + column_2;
frustum_planes[4] = (float4)(0.0f, 0.0f, -1.0f, near);
frustum_planes[5] = (float4)(0.0f, 0.0f, 1.0f, far);
for(int c = 0; c < 4; c++) //normalize frustum plane normals
{
frustum_planes.xyz *= 1.0f / length(frustum_planes.xyz);
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if (length(raw_normal.xy != 0.0f) {
for(int c = workgroup_index; c < num_of_lights; c += local_size.x * local_size.y) //cull each light per tile, each thread in a tile processes one light
{
bool in_frustum = true;
float attenuation_end = 0.0f;
if(attenuation_type == 0)
{
attenuation_end = (float)(in_lights[c * 8 + 6]) / (float)(cutoff); //radius / cutoff
}
else
{
attenuation_end = (float)(in_lights[c * 8 + 6]); //radius
}
for(int d = 0; d < 6; d++) //cull each light based on the distance where it will shine and the frustum defined by the tile
{
float e = dot(frustum_planes, (float4)(in_lights[c * 8 + 0], in_lights[c * 8 + 1], in_lights[c * 8 + 2], 1.0f));
in_frustum = in_frustum && (e >= -attenuation_end);
}
if(in_frustum) //if the light is in the frustum, then store its index (if I comment this out, the kernel runs, but doesn't cull lights)
{
int index = atomic_inc(&num_of_tile_lights);
tile_lights[index] = c;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if (length(raw_normal.xy != 0.0f) {
/*
* Per light calculations
*/
for(int c = 0; c < num_of_tile_lights; c++) //draw each light per tile
{
int index = tile_lights; //get back the light index
float3 light_pos = (float3)(in_lights[index * 8 + 0], in_lights[index * 8 + 1], in_lights[index * 8 + 2]); //gather light data using the index
float light_radius = in_lights[index * 8 + 6];
//calculate blinn-phong lighting with custom attenuation
float3 light_dir = light_pos - raw_depth.xyz;
float distance = length(light_dir);
light_dir /= distance;
float coeff, attenuation;
if(attenuation_type == 0)
{
coeff = max(distance - light_radius, 0.0f) / light_radius + 1.0f;
attenuation = max((1.0f / (coeff * coeff) - cutoff) / (1.0f - cutoff), 0.0f);
}
else
{
attenuation = (light_radius - distance) / (light_radius * 0.01f) * 0.01f;
}
if(attenuation > 0.0f)
{
float3 light_diffuse_color = (float3)(in_lights[index * 8 + 3], in_lights[index * 8 + 4], in_lights[index * 8 + 5]);
float light_specular_power = (float)in_lights[index * 8 + 3];
float3 view_dir = normalize(view_pos - raw_depth.xyz);
float3 half_vector = (light_dir + view_dir) * 0.5f;
float n_dot_l = max(dot(raw_normal.xyz, light_dir), 0.0f);
out_color.xyz += raw_albedo.xyz * light_diffuse_color * n_dot_l * attenuation;
float n_dot_h = pow(max(dot(raw_normal.xyz, half_vector), 0.0f), light_specular_power);
out_color.xyz += light_diffuse_color * n_dot_h * attenuation;
}
}
}
write_imagef(result, coords, out_color); //write the calculated light data to the result buffer (texture)
}

yours3lf · ‎02-02-2012

thanks, now I understand!

I tried it out, and now it works

I shall now proceed to make culling actually work, so that lights are visible xD

Thanks again for the help!!!

Archives Discussions

tile based deferred shading kernel freezes video output