cancel
Showing results for 
Search instead for 
Did you mean: 

Archives Discussions

yours3lf
Adept II

tile based deferred shading kernel freezes video output

Jump to solution

Hi,

I'm writing a deferred renderer using the tile based deferred shading approach. I've written my kernel to compute the lighting, but it freezes my video output.

I guess the video card is overloaded, because only the video output freezes, if I listen to some music in the background, then it keeps playing.

If I remove the atomic operations from the kernel, then it runs fine, but the lights aren't culled this way.

so here's the kernel:

const float far = -10000.0f; //far plane distance

const float near = -1.0f; //near plane distance

const sampler_t the_sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

const float cutoff = 0.25f; //0.005f

const int attenuation_type = 0; //linear or full attenuation?

float my_abs(float var) //these floating point operations aren't supported in opencl 1.1

{

  if(var < 0)

  {

    return -var;

  }

  else

  {

    return var;

  }

}

float my_mix(float x, float y, float weigth)

{

  return x * (1.0f - weigth) + y * weigth;

}

float3 my_reflect(float3 incident, float3 normal)

{

  return incident - 2.0 * dot(normal, incident) * normal;

}

float3 decode_normals_spheremap(float4 n) //decode normals from spheremap encoding

{

  float4 nn = n * (float4)(2.0, 2.0, 0.0, 0.0) + (float4)(-1.0, -1.0, 1.0, -1.0);

  float l = dot(nn.xyz, -nn.xyw);

  nn.z = l;

  nn.xy *= sqrt(l);

  return nn.xyz * 2.0 + (float3)(0.0, 0.0, -1.0);

}

float3 decode_linear_depth(float4 linear_depth, float4 position) //decode linear depth into view space position

{

  return (float3)(position.xy * (far / position.z), far) * linear_depth.x;

}

__kernel void main(__read_only image2d_t albedo, //diffuse surface color from the g-buffer

           __read_only image2d_t normals,  //normals encoded using spheremap encoding

           __read_only image2d_t depth, //linear depth

           __write_only image2d_t result, //the output buffer that stores lighting data

           __global const float* far_plane, //the lower left and upper right corners of the far plane

           __global const float* in_view_pos, //view space camera position

           __global const float* in_lights, //1024 lights {light pos[3], diffuse_color[3], radius[1], specular intensity[1] }

           __global const float* in_num_of_lights, //number of incoming lights (1024)

           __global const float* in_projection_matrix) //the projection matrix is used for frustum culling

{

  /*

   * Per pixel calculations (global)

   */

  int2 coords = (int2)(get_global_id(0), get_global_id(1));

  float4 raw_albedo = read_imagef(albedo, the_sampler, coords);

  float4 raw_normal = read_imagef(normals, the_sampler, coords); //this will store the decoded normals

  if(length(raw_normal.xy) == 0.0f)

  {

    write_imagef(result, coords, raw_albedo);

    return;

  }

  int2 global_size = (int2)(get_global_size(0), get_global_size(1));

  float4 ll, ur;

  float4 raw_depth = read_imagef(depth, the_sampler, coords); //this will store the decoded view space position

  ll = (float4)(far_plane[0], far_plane[1], far_plane[2], 1.0f);

  ur = (float4)(far_plane[3], far_plane[4], far_plane[5], 1.0f);

  float2 texel = (float2)((float)(coords.x) / (float)(global_size.x), (float)(coords.y) / (float)(global_size.y)); //texture coordinate [0...1] for input processing

  raw_depth.xyz = decode_linear_depth(raw_depth, (float4)(my_mix(ll.x, ur.x, texel.x), my_mix(ll.y, ur.y, texel.y), ll.z, 1.0f));

  raw_normal.xyz = decode_normals_spheremap(raw_normal);

  float4 out_color = (float4)(0.0f); //this will store the resulting color

  out_color.w = 1.0f;

  /*

   * Per workgroup (tile) calculations (local)

   */

  __local float3 view_pos; //this will store the view space position (uniform among the workgroups, but stored as local for speedup)

  __local int num_of_lights; //num of lights (same here)

  __local int2 local_coords;

  local_coords = (int2)(get_local_id(0), get_local_id(1));

  __local int2 local_size;

  local_size = (int2)(get_local_size(0), get_local_size(1));

  __local int workgroup_index;

  workgroup_index = local_coords.y * local_size.x + local_coords.x;

  __local int tile_lights[1024]; //index of the lights visible per tile

  __local int num_of_tile_lights; //number of lights per tile

  __local float2 tile_scale; //used for calculating frustum culling, taken from Intel's sample

  __local float2 tile_bias;

  __local float4 column_1;

  __local float4 column_2;

  __local float4 column_4;

  __local float4 frustum_planes[6];

  if(workgroup_index == 0)

  {

    view_pos = vload3(0, in_view_pos);

    num_of_lights = (int)in_num_of_lights[0];

    num_of_lights = 32;

    num_of_tile_lights = 0;

    //I'm not sure if the maths here is correct due to OpenGL and DirectX using different matrices, but this shouldnt matter

    tile_scale = (float2)(global_size.x, global_size.y) * (1.0f / (float)(2.0f * local_size.x));

    tile_bias = tile_scale - (float2)(local_coords.x, local_coords.y);

    column_1 = (float4)(in_projection_matrix[5] * tile_scale.x, 0.0f, tile_bias.x, 0.0f);

    column_2 = (float4)(0.0f, -in_projection_matrix[10] * tile_scale.y, tile_bias.y, 0.0f);

    column_4 = (float4)(0.0f, 0.0f, 1.0f, 0.0f);

    frustum_planes[0] = column_4 - column_1;

    frustum_planes[1] = column_4 + column_1;

    frustum_planes[2] = column_4 - column_2;

    frustum_planes[3] = column_4 + column_2;

    frustum_planes[4] = (float4)(0.0f, 0.0f, -1.0f, near);

    frustum_planes[5] = (float4)(0.0f, 0.0f, 1.0f, far);

    for(int c = 0; c < 4; c++) //normalize frustum plane normals

    {

      frustum_planes.xyz *= 1.0f / length(frustum_planes.xyz);

    }

  }

  barrier(CLK_LOCAL_MEM_FENCE);

  for(int c = workgroup_index; c < num_of_lights; c += local_size.x * local_size.y) //cull each light per tile, each thread in a tile processes one light

  {

    bool in_frustum = true;

    float attenuation_end = 0.0f;

    if(attenuation_type == 0)

    {

      attenuation_end = (float)(in_lights[c * 8 + 6]) / (float)(cutoff); //radius / cutoff

    }

    else

    {

      attenuation_end = (float)(in_lights[c * 8 + 6]); //radius

    }

    for(int d = 0; d < 6; d++) //cull each light based on the distance where it will shine and the frustum defined by the tile

    {

      float e = dot(frustum_planes, (float4)(in_lights[c * 8 + 0], in_lights[c * 8 + 1], in_lights[c * 8 + 2], 1.0f));

      in_frustum = in_frustum && (e >= -attenuation_end);

    }

    if(in_frustum) //if the light is in the frustum, then store its index (if I comment this out, the kernel runs, but doesn't cull lights)

    {

      int index = atomic_inc(&num_of_tile_lights);

      tile_lights[index] = c;

    }

  }

  barrier(CLK_LOCAL_MEM_FENCE);

  /*

   * Per light calculations

   */

  for(int c = 0; c < num_of_tile_lights; c++) //draw each light per tile

  {

      int index = tile_lights; //get back the light index

      float3 light_pos = (float3)(in_lights[index * 8 + 0], in_lights[index * 8 + 1], in_lights[index * 8 + 2]); //gather light data using the index

      float light_radius = in_lights[index * 8 + 6];

      //calculate blinn-phong lighting with custom attenuation

      float3 light_dir = light_pos - raw_depth.xyz;

      float distance = length(light_dir);

      light_dir /= distance;

      float coeff, attenuation;

      if(attenuation_type == 0)

      {

    coeff = max(distance - light_radius, 0.0f) / light_radius + 1.0f;

    attenuation = max((1.0f / (coeff * coeff) - cutoff) / (1.0f - cutoff), 0.0f);

      }

      else

      {

    attenuation = (light_radius - distance) / (light_radius * 0.01f) * 0.01f;

      }

      if(attenuation > 0.0f)

      {

    float3 light_diffuse_color = (float3)(in_lights[index * 8 + 3], in_lights[index * 8 + 4], in_lights[index * 8 + 5]);

    float light_specular_power = (float)in_lights[index * 8 + 3];

    float3 view_dir = normalize(view_pos - raw_depth.xyz);

   

    float3 half_vector = (light_dir + view_dir) * 0.5f;

    float n_dot_l = max(dot(raw_normal.xyz, light_dir), 0.0f);

    out_color.xyz += raw_albedo.xyz * light_diffuse_color * n_dot_l * attenuation;

    float n_dot_h = pow(max(dot(raw_normal.xyz, half_vector), 0.0f), light_specular_power);

    out_color.xyz += light_diffuse_color * n_dot_h * attenuation;

      }

  }

  write_imagef(result, coords, out_color); //write the calculated light data to the result buffer (texture)

}

Any ideas what going wrong?

Best regards,

Yours3!f

0 Likes
1 Solution

Accepted Solutions
MicahVillmow
Staff
Staff

Re: tile based deferred shading kernel freezes video output

Jump to solution

This kernel violates the OpenCL specification.

All work-items in a work-group must hit a barrier. However at lines 61-65, you have a conditional data dependent early exit, but a barrier after the early exit at line 137. This produces undefined behavior on the hardware, including hardware hangs.

View solution in original post

6 Replies
MicahVillmow
Staff
Staff

Re: tile based deferred shading kernel freezes video output

Jump to solution

This kernel violates the OpenCL specification.

All work-items in a work-group must hit a barrier. However at lines 61-65, you have a conditional data dependent early exit, but a barrier after the early exit at line 137. This produces undefined behavior on the hardware, including hardware hangs.

View solution in original post

yours3lf
Adept II

Re: tile based deferred shading kernel freezes video output

Jump to solution

thanks MicahVillmow for the quick reply!

I tried putting a barrier(CLK_LOCAL_MEM_FENCE); after 61-65 but it didn't solve the problem.

0 Likes
MicahVillmow
Staff
Staff

Re: tile based deferred shading kernel freezes video output

Jump to solution

yours3lf,

Yeah, because now only the threads that leave early are hitting that barrier and not the other barrier. The only way to fix this is re-organizing your algorithm so that ALL threads hit every barrier.

So instead of doing

if (a) return

....

barrier

...

barrier

...

Do:

if (!a) {

}

barrier

if (!a) {

}

barrier

if (!a) {

}

yours3lf
Adept II

Re: tile based deferred shading kernel freezes video output

Jump to solution

ok, so I removed the early rejection just to see if it's working, and it didn't freeze (and I got 70FPS, hurray!)

but I still don't understand how should I implement the barriers and conditionals

so that's clear that doing this is wrong because the early rejected threads still wouldn't reach the barrier:

[...]

if(length(raw_normal.xy) == 0.0f)

  {

    write_imagef(result, coords, raw_albedo);

    return;

  }

  barrier(CLK_LOCAL_MEM_FENCE); //threads entering the if conditional are not going to reach this line, because of the return instruction

[...]

so I tried to reorganize the way you suggested, but it doesn't solve anything:

[...]

if(length(raw_normal.xy) != 0.0f)

{

  //logically here would come the lighting part, but then the rejected threads wouldn't reach the barriers at the lighting part...

}

barrier(CLK_LOCAL_MEM_FENCE); //this way the conditional is useless, isn't it?

[...]

or I must have misunderstood something...

0 Likes
MicahVillmow
Staff
Staff

Re: tile based deferred shading kernel freezes video output

Jump to solution

What you need is something like this:

  1. __kernel void main(__read_only image2d_t albedo, //diffuse surface color from the g-buffer 
  2.            __read_only image2d_t normals,  //normals encoded using spheremap encoding 
  3.            __read_only image2d_t depth, //linear depth 
  4.            __write_only image2d_t result, //the output buffer that stores lighting data 
  5.            __global const float* far_plane, //the lower left and upper right corners of the far plane 
  6.            __global const float* in_view_pos, //view space camera position 
  7.            __global const float* in_lights, //1024 lights {light pos[3], diffuse_color[3], radius[1], specular intensity[1] } 
  8.            __global const float* in_num_of_lights, //number of incoming lights (1024) 
  9.            __global const float* in_projection_matrix) //the projection matrix is used for frustum culling 
  10.   /*
  11.    * Per pixel calculations (global)
  12.    */ 
  13.  
  14.   int2 coords = (int2)(get_global_id(0), get_global_id(1)); 
  15.   float4 raw_albedo = read_imagef(albedo, the_sampler, coords); 
  16.   float4 raw_normal = read_imagef(normals, the_sampler, coords); //this will store the decoded normals 
  17.  
  18. float4 out_color = (float4)(0.0f); //this will store the resulting color 
  19.   if(length(raw_normal.xy) == 0.0f) 
  20.   { 
  21. out_color = raw_albedo;
  22.   }  else {
  23.  
  24.   int2 global_size = (int2)(get_global_size(0), get_global_size(1)); 
  25.   float4 ll, ur; 
  26.  
  27.   float4 raw_depth = read_imagef(depth, the_sampler, coords); //this will store the decoded view space position 
  28.  
  29.   ll = (float4)(far_plane[0], far_plane[1], far_plane[2], 1.0f); 
  30.   ur = (float4)(far_plane[3], far_plane[4], far_plane[5], 1.0f); 
  31.  
  32.   float2 texel = (float2)((float)(coords.x) / (float)(global_size.x), (float)(coords.y) / (float)(global_size.y)); //texture coordinate [0...1] for input processing 
  33.  
  34.   raw_depth.xyz = decode_linear_depth(raw_depth, (float4)(my_mix(ll.x, ur.x, texel.x), my_mix(ll.y, ur.y, texel.y), ll.z, 1.0f)); 
  35.   raw_normal.xyz = decode_normals_spheremap(raw_normal); 
  36.  
  37. out_color.w = 1.0f; 
  38.  
  39.   /*
  40.    * Per workgroup (tile) calculations (local)
  41.    */ 
  42.  
  43.   __local float3 view_pos; //this will store the view space position (uniform among the workgroups, but stored as local for speedup) 
  44.   __local int num_of_lights; //num of lights (same here) 
  45.  
  46.   __local int2 local_coords; 
  47.   local_coords = (int2)(get_local_id(0), get_local_id(1)); 
  48.   __local int2 local_size; 
  49.   local_size = (int2)(get_local_size(0), get_local_size(1)); 
  50.   __local int workgroup_index; 
  51.   workgroup_index = local_coords.y * local_size.x + local_coords.x; 
  52.  
  53.   __local int tile_lights[1024]; //index of the lights visible per tile 
  54.   __local int num_of_tile_lights; //number of lights per tile 
  55.  
  56.   __local float2 tile_scale; //used for calculating frustum culling, taken from Intel's sample 
  57.   __local float2 tile_bias; 
  58.  
  59.   __local float4 column_1; 
  60.   __local float4 column_2; 
  61.   __local float4 column_4; 
  62.  
  63.   __local float4 frustum_planes[6]; 
  64.  
  65.   if(workgroup_index == 0
  66.   { 
  67.     view_pos = vload3(0, in_view_pos); 
  68.     num_of_lights = (int)in_num_of_lights[0]; 
  69.     num_of_lights = 32
  70.     num_of_tile_lights = 0
  71.  
  72.     //I'm not sure if the maths here is correct due to OpenGL and DirectX using different matrices, but this shouldnt matter 
  73.     tile_scale = (float2)(global_size.x, global_size.y) * (1.0f / (float)(2.0f * local_size.x)); 
  74.     tile_bias = tile_scale - (float2)(local_coords.x, local_coords.y); 
  75.  
  76.     column_1 = (float4)(in_projection_matrix[5] * tile_scale.x, 0.0f, tile_bias.x, 0.0f); 
  77.     column_2 = (float4)(0.0f, -in_projection_matrix[10] * tile_scale.y, tile_bias.y, 0.0f); 
  78.     column_4 = (float4)(0.0f, 0.0f, 1.0f, 0.0f); 
  79.  
  80.     frustum_planes[0] = column_4 - column_1; 
  81.     frustum_planes[1] = column_4 + column_1; 
  82.     frustum_planes[2] = column_4 - column_2; 
  83.     frustum_planes[3] = column_4 + column_2; 
  84.     frustum_planes[4] = (float4)(0.0f, 0.0f, -1.0f, near); 
  85.     frustum_planes[5] = (float4)(0.0f, 0.0f, 1.0f, far); 
  86.  
  87.     for(int c = 0; c < 4; c++) //normalize frustum plane normals 
  88.     { 
  89.       frustum_planes.xyz *= 1.0f / length(frustum_planes.xyz); 
  90.     } 
  91.   } 
  92.   }
  93.   barrier(CLK_LOCAL_MEM_FENCE); 
  94.   if (length(raw_normal.xy != 0.0f) {
  95.   for(int c = workgroup_index; c < num_of_lights; c += local_size.x * local_size.y) //cull each light per tile, each thread in a tile processes one light 
  96.   { 
  97.     bool in_frustum = true
  98.     float attenuation_end = 0.0f; 
  99.  
  100.     if(attenuation_type == 0
  101.     { 
  102.       attenuation_end = (float)(in_lights[c * 8 + 6]) / (float)(cutoff); //radius / cutoff 
  103.     } 
  104.     else 
  105.     { 
  106.       attenuation_end = (float)(in_lights[c * 8 + 6]); //radius 
  107.     } 
  108.  
  109.     for(int d = 0; d < 6; d++) //cull each light based on the distance where it will shine and the frustum defined by the tile 
  110.     { 
  111.       float e = dot(frustum_planes, (float4)(in_lights[c * 8 + 0], in_lights[c * 8 + 1], in_lights[c * 8 + 2], 1.0f)); 
  112.       in_frustum = in_frustum && (e >= -attenuation_end); 
  113.     } 
  114.  
  115.     if(in_frustum) //if the light is in the frustum, then store its index (if I comment this out, the kernel runs, but doesn't cull lights) 
  116.     { 
  117.       int index = atomic_inc(&num_of_tile_lights); 
  118.       tile_lights[index] = c; 
  119.     } 
  120.   } 
  121.   }
  122.   barrier(CLK_LOCAL_MEM_FENCE); 
  123.  
  124.   if (length(raw_normal.xy != 0.0f) {
  125.   /*
  126.    * Per light calculations
  127.    */ 
  128.  
  129.   for(int c = 0; c < num_of_tile_lights; c++) //draw each light per tile 
  130.   { 
  131.       int index = tile_lights; //get back the light index 
  132.       float3 light_pos = (float3)(in_lights[index * 8 + 0], in_lights[index * 8 + 1], in_lights[index * 8 + 2]); //gather light data using the index 
  133.       float light_radius = in_lights[index * 8 + 6]; 
  134.  
  135.       //calculate blinn-phong lighting with custom attenuation 
  136.       float3 light_dir = light_pos - raw_depth.xyz; 
  137.       float distance = length(light_dir); 
  138.       light_dir /= distance; 
  139.  
  140.       float coeff, attenuation; 
  141.  
  142.       if(attenuation_type == 0
  143.       { 
  144.     coeff = max(distance - light_radius, 0.0f) / light_radius + 1.0f; 
  145.     attenuation = max((1.0f / (coeff * coeff) - cutoff) / (1.0f - cutoff), 0.0f); 
  146.       } 
  147.       else 
  148.       { 
  149.     attenuation = (light_radius - distance) / (light_radius * 0.01f) * 0.01f; 
  150.       } 
  151.  
  152.       if(attenuation > 0.0f) 
  153.       { 
  154.     float3 light_diffuse_color = (float3)(in_lights[index * 8 + 3], in_lights[index * 8 + 4], in_lights[index * 8 + 5]); 
  155.     float light_specular_power = (float)in_lights[index * 8 + 3]; 
  156.     float3 view_dir = normalize(view_pos - raw_depth.xyz); 
  157.      
  158.     float3 half_vector = (light_dir + view_dir) * 0.5f; 
  159.     float n_dot_l = max(dot(raw_normal.xyz, light_dir), 0.0f); 
  160.  
  161.     out_color.xyz += raw_albedo.xyz * light_diffuse_color * n_dot_l * attenuation; 
  162.     float n_dot_h = pow(max(dot(raw_normal.xyz, half_vector), 0.0f), light_specular_power); 
  163.     out_color.xyz += light_diffuse_color * n_dot_h * attenuation; 
  164.       } 
  165.   } 
  166. }
  167.  
  168.   write_imagef(result, coords, out_color); //write the calculated light data to the result buffer (texture) 
yours3lf
Adept II

Re: tile based deferred shading kernel freezes video output

Jump to solution

thanks, now I understand!

I tried it out, and now it works

I shall now proceed to make culling actually work, so that lights are visible xD

Thanks again for the help!!!

0 Likes