AnsweredAssumed Answered

tile based deferred shading kernel freezes video output

Question asked by yours3lf on Feb 2, 2012
Latest reply on Feb 2, 2012 by yours3lf

Hi,

 

I'm writing a deferred renderer using the tile based deferred shading approach. I've written my kernel to compute the lighting, but it freezes my video output.

I guess the video card is overloaded, because only the video output freezes, if I listen to some music in the background, then it keeps playing.

 

If I remove the atomic operations from the kernel, then it runs fine, but the lights aren't culled this way.

 

so here's the kernel:

const float far = -10000.0f; //far plane distance
const float near = -1.0f; //near plane distance
const sampler_t the_sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
const float cutoff = 0.25f; //0.005f
const int attenuation_type = 0; //linear or full attenuation?

float my_abs(float var) //these floating point operations aren't supported in opencl 1.1
{
  if(var < 0)
  {
    return -var;
  }
  else
  {
    return var;
  }
}

float my_mix(float x, float y, float weigth)
{
  return x * (1.0f - weigth) + y * weigth;
}

float3 my_reflect(float3 incident, float3 normal)
{
  return incident - 2.0 * dot(normal, incident) * normal;
}

float3 decode_normals_spheremap(float4 n) //decode normals from spheremap encoding
{
  float4 nn = n * (float4)(2.0, 2.0, 0.0, 0.0) + (float4)(-1.0, -1.0, 1.0, -1.0);
  float l = dot(nn.xyz, -nn.xyw);
  nn.z = l;
  nn.xy *= sqrt(l);
  return nn.xyz * 2.0 + (float3)(0.0, 0.0, -1.0);
}

float3 decode_linear_depth(float4 linear_depth, float4 position) //decode linear depth into view space position
{
  return (float3)(position.xy * (far / position.z), far) * linear_depth.x;
}

__kernel void main(__read_only image2d_t albedo, //diffuse surface color from the g-buffer
           __read_only image2d_t normals,  //normals encoded using spheremap encoding
           __read_only image2d_t depth, //linear depth
           __write_only image2d_t result, //the output buffer that stores lighting data
           __global const float* far_plane, //the lower left and upper right corners of the far plane
           __global const float* in_view_pos, //view space camera position
           __global const float* in_lights, //1024 lights {light pos[3], diffuse_color[3], radius[1], specular intensity[1] }
           __global const float* in_num_of_lights, //number of incoming lights (1024)
           __global const float* in_projection_matrix) //the projection matrix is used for frustum culling
{
  /*
   * Per pixel calculations (global)
   */

  int2 coords = (int2)(get_global_id(0), get_global_id(1));
  float4 raw_albedo = read_imagef(albedo, the_sampler, coords);
  float4 raw_normal = read_imagef(normals, the_sampler, coords); //this will store the decoded normals

  if(length(raw_normal.xy) == 0.0f)
  {
    write_imagef(result, coords, raw_albedo);
    return;
  }

  int2 global_size = (int2)(get_global_size(0), get_global_size(1));
  float4 ll, ur;

  float4 raw_depth = read_imagef(depth, the_sampler, coords); //this will store the decoded view space position

  ll = (float4)(far_plane[0], far_plane[1], far_plane[2], 1.0f);
  ur = (float4)(far_plane[3], far_plane[4], far_plane[5], 1.0f);

  float2 texel = (float2)((float)(coords.x) / (float)(global_size.x), (float)(coords.y) / (float)(global_size.y)); //texture coordinate [0...1] for input processing

  raw_depth.xyz = decode_linear_depth(raw_depth, (float4)(my_mix(ll.x, ur.x, texel.x), my_mix(ll.y, ur.y, texel.y), ll.z, 1.0f));
  raw_normal.xyz = decode_normals_spheremap(raw_normal);

  float4 out_color = (float4)(0.0f); //this will store the resulting color
  out_color.w = 1.0f;

  /*
   * Per workgroup (tile) calculations (local)
   */

  __local float3 view_pos; //this will store the view space position (uniform among the workgroups, but stored as local for speedup)
  __local int num_of_lights; //num of lights (same here)

  __local int2 local_coords;
  local_coords = (int2)(get_local_id(0), get_local_id(1));
  __local int2 local_size;
  local_size = (int2)(get_local_size(0), get_local_size(1));
  __local int workgroup_index;
  workgroup_index = local_coords.y * local_size.x + local_coords.x;

  __local int tile_lights[1024]; //index of the lights visible per tile
  __local int num_of_tile_lights; //number of lights per tile

  __local float2 tile_scale; //used for calculating frustum culling, taken from Intel's sample
  __local float2 tile_bias;

  __local float4 column_1;
  __local float4 column_2;
  __local float4 column_4;

  __local float4 frustum_planes[6];

  if(workgroup_index == 0)
  {
    view_pos = vload3(0, in_view_pos);
    num_of_lights = (int)in_num_of_lights[0];
    num_of_lights = 32;
    num_of_tile_lights = 0;

    //I'm not sure if the maths here is correct due to OpenGL and DirectX using different matrices, but this shouldnt matter
    tile_scale = (float2)(global_size.x, global_size.y) * (1.0f / (float)(2.0f * local_size.x));
    tile_bias = tile_scale - (float2)(local_coords.x, local_coords.y);

    column_1 = (float4)(in_projection_matrix[5] * tile_scale.x, 0.0f, tile_bias.x, 0.0f);
    column_2 = (float4)(0.0f, -in_projection_matrix[10] * tile_scale.y, tile_bias.y, 0.0f);
    column_4 = (float4)(0.0f, 0.0f, 1.0f, 0.0f);

    frustum_planes[0] = column_4 - column_1;
    frustum_planes[1] = column_4 + column_1;
    frustum_planes[2] = column_4 - column_2;
    frustum_planes[3] = column_4 + column_2;
    frustum_planes[4] = (float4)(0.0f, 0.0f, -1.0f, near);
    frustum_planes[5] = (float4)(0.0f, 0.0f, 1.0f, far);

    for(int c = 0; c < 4; c++) //normalize frustum plane normals
    {
      frustum_planes[c].xyz *= 1.0f / length(frustum_planes[c].xyz);
    }
  }

  barrier(CLK_LOCAL_MEM_FENCE);

  for(int c = workgroup_index; c < num_of_lights; c += local_size.x * local_size.y) //cull each light per tile, each thread in a tile processes one light
  {
    bool in_frustum = true;
    float attenuation_end = 0.0f;

    if(attenuation_type == 0)
    {
      attenuation_end = (float)(in_lights[c * 8 + 6]) / (float)(cutoff); //radius / cutoff
    }
    else
    {
      attenuation_end = (float)(in_lights[c * 8 + 6]); //radius
    }

    for(int d = 0; d < 6; d++) //cull each light based on the distance where it will shine and the frustum defined by the tile
    {
      float e = dot(frustum_planes[d], (float4)(in_lights[c * 8 + 0], in_lights[c * 8 + 1], in_lights[c * 8 + 2], 1.0f));
      in_frustum = in_frustum && (e >= -attenuation_end);
    }

    if(in_frustum) //if the light is in the frustum, then store its index (if I comment this out, the kernel runs, but doesn't cull lights)
    {
      int index = atomic_inc(&num_of_tile_lights);
      tile_lights[index] = c;
    }
  }

  barrier(CLK_LOCAL_MEM_FENCE);

  /*
   * Per light calculations
   */

  for(int c = 0; c < num_of_tile_lights; c++) //draw each light per tile
  {
      int index = tile_lights[c]; //get back the light index
      float3 light_pos = (float3)(in_lights[index * 8 + 0], in_lights[index * 8 + 1], in_lights[index * 8 + 2]); //gather light data using the index
      float light_radius = in_lights[index * 8 + 6];

      //calculate blinn-phong lighting with custom attenuation
      float3 light_dir = light_pos - raw_depth.xyz;
      float distance = length(light_dir);
      light_dir /= distance;

      float coeff, attenuation;

      if(attenuation_type == 0)
      {
    coeff = max(distance - light_radius, 0.0f) / light_radius + 1.0f;
    attenuation = max((1.0f / (coeff * coeff) - cutoff) / (1.0f - cutoff), 0.0f);
      }
      else
      {
    attenuation = (light_radius - distance) / (light_radius * 0.01f) * 0.01f;
      }

      if(attenuation > 0.0f)
      {
    float3 light_diffuse_color = (float3)(in_lights[index * 8 + 3], in_lights[index * 8 + 4], in_lights[index * 8 + 5]);
    float light_specular_power = (float)in_lights[index * 8 + 3];
    float3 view_dir = normalize(view_pos - raw_depth.xyz);
    
    float3 half_vector = (light_dir + view_dir) * 0.5f;
    float n_dot_l = max(dot(raw_normal.xyz, light_dir), 0.0f);

    out_color.xyz += raw_albedo.xyz * light_diffuse_color * n_dot_l * attenuation;
    float n_dot_h = pow(max(dot(raw_normal.xyz, half_vector), 0.0f), light_specular_power);
    out_color.xyz += light_diffuse_color * n_dot_h * attenuation;
      }
  }

  write_imagef(result, coords, out_color); //write the calculated light data to the result buffer (texture)
}

 

Any ideas what going wrong?

 

Best regards,

Yours3!f

Outcomes