cancel
Showing results for 
Search instead for 
Did you mean: 

Archives Discussions

yours3lf
Adept II

tile based deferred shading kernel freezes video output

Hi,

I'm writing a deferred renderer using the tile based deferred shading approach. I've written my kernel to compute the lighting, but it freezes my video output.

I guess the video card is overloaded, because only the video output freezes, if I listen to some music in the background, then it keeps playing.

If I remove the atomic operations from the kernel, then it runs fine, but the lights aren't culled this way.

so here's the kernel:

const float far = -10000.0f; //far plane distance

const float near = -1.0f; //near plane distance

const sampler_t the_sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

const float cutoff = 0.25f; //0.005f

const int attenuation_type = 0; //linear or full attenuation?

float my_abs(float var) //these floating point operations aren't supported in opencl 1.1

{

  if(var < 0)

  {

    return -var;

  }

  else

  {

    return var;

  }

}

float my_mix(float x, float y, float weigth)

{

  return x * (1.0f - weigth) + y * weigth;

}

float3 my_reflect(float3 incident, float3 normal)

{

  return incident - 2.0 * dot(normal, incident) * normal;

}

float3 decode_normals_spheremap(float4 n) //decode normals from spheremap encoding

{

  float4 nn = n * (float4)(2.0, 2.0, 0.0, 0.0) + (float4)(-1.0, -1.0, 1.0, -1.0);

  float l = dot(nn.xyz, -nn.xyw);

  nn.z = l;

  nn.xy *= sqrt(l);

  return nn.xyz * 2.0 + (float3)(0.0, 0.0, -1.0);

}

float3 decode_linear_depth(float4 linear_depth, float4 position) //decode linear depth into view space position

{

  return (float3)(position.xy * (far / position.z), far) * linear_depth.x;

}

__kernel void main(__read_only image2d_t albedo, //diffuse surface color from the g-buffer

           __read_only image2d_t normals,  //normals encoded using spheremap encoding

           __read_only image2d_t depth, //linear depth

           __write_only image2d_t result, //the output buffer that stores lighting data

           __global const float* far_plane, //the lower left and upper right corners of the far plane

           __global const float* in_view_pos, //view space camera position

           __global const float* in_lights, //1024 lights {light pos[3], diffuse_color[3], radius[1], specular intensity[1] }

           __global const float* in_num_of_lights, //number of incoming lights (1024)

           __global const float* in_projection_matrix) //the projection matrix is used for frustum culling

{

  /*

   * Per pixel calculations (global)

   */

  int2 coords = (int2)(get_global_id(0), get_global_id(1));

  float4 raw_albedo = read_imagef(albedo, the_sampler, coords);

  float4 raw_normal = read_imagef(normals, the_sampler, coords); //this will store the decoded normals

  if(length(raw_normal.xy) == 0.0f)

  {

    write_imagef(result, coords, raw_albedo);

    return;

  }

  int2 global_size = (int2)(get_global_size(0), get_global_size(1));

  float4 ll, ur;

  float4 raw_depth = read_imagef(depth, the_sampler, coords); //this will store the decoded view space position

  ll = (float4)(far_plane[0], far_plane[1], far_plane[2], 1.0f);

  ur = (float4)(far_plane[3], far_plane[4], far_plane[5], 1.0f);

  float2 texel = (float2)((float)(coords.x) / (float)(global_size.x), (float)(coords.y) / (float)(global_size.y)); //texture coordinate [0...1] for input processing

  raw_depth.xyz = decode_linear_depth(raw_depth, (float4)(my_mix(ll.x, ur.x, texel.x), my_mix(ll.y, ur.y, texel.y), ll.z, 1.0f));

  raw_normal.xyz = decode_normals_spheremap(raw_normal);

  float4 out_color = (float4)(0.0f); //this will store the resulting color

  out_color.w = 1.0f;

  /*

   * Per workgroup (tile) calculations (local)

   */

  __local float3 view_pos; //this will store the view space position (uniform among the workgroups, but stored as local for speedup)

  __local int num_of_lights; //num of lights (same here)

  __local int2 local_coords;

  local_coords = (int2)(get_local_id(0), get_local_id(1));

  __local int2 local_size;

  local_size = (int2)(get_local_size(0), get_local_size(1));

  __local int workgroup_index;

  workgroup_index = local_coords.y * local_size.x + local_coords.x;

  __local int tile_lights[1024]; //index of the lights visible per tile

  __local int num_of_tile_lights; //number of lights per tile

  __local float2 tile_scale; //used for calculating frustum culling, taken from Intel's sample

  __local float2 tile_bias;

  __local float4 column_1;

  __local float4 column_2;

  __local float4 column_4;

  __local float4 frustum_planes[6];

  if(workgroup_index == 0)

  {

    view_pos = vload3(0, in_view_pos);

    num_of_lights = (int)in_num_of_lights[0];

    num_of_lights = 32;

    num_of_tile_lights = 0;

    //I'm not sure if the maths here is correct due to OpenGL and DirectX using different matrices, but this shouldnt matter

    tile_scale = (float2)(global_size.x, global_size.y) * (1.0f / (float)(2.0f * local_size.x));

    tile_bias = tile_scale - (float2)(local_coords.x, local_coords.y);

    column_1 = (float4)(in_projection_matrix[5] * tile_scale.x, 0.0f, tile_bias.x, 0.0f);

    column_2 = (float4)(0.0f, -in_projection_matrix[10] * tile_scale.y, tile_bias.y, 0.0f);

    column_4 = (float4)(0.0f, 0.0f, 1.0f, 0.0f);

    frustum_planes[0] = column_4 - column_1;

    frustum_planes[1] = column_4 + column_1;

    frustum_planes[2] = column_4 - column_2;

    frustum_planes[3] = column_4 + column_2;

    frustum_planes[4] = (float4)(0.0f, 0.0f, -1.0f, near);

    frustum_planes[5] = (float4)(0.0f, 0.0f, 1.0f, far);

    for(int c = 0; c < 4; c++) //normalize frustum plane normals

    {

      frustum_planes.xyz *= 1.0f / length(frustum_planes.xyz);

    }

  }

  barrier(CLK_LOCAL_MEM_FENCE);

  for(int c = workgroup_index; c < num_of_lights; c += local_size.x * local_size.y) //cull each light per tile, each thread in a tile processes one light

  {

    bool in_frustum = true;

    float attenuation_end = 0.0f;

    if(attenuation_type == 0)

    {

      attenuation_end = (float)(in_lights[c * 8 + 6]) / (float)(cutoff); //radius / cutoff

    }

    else

    {

      attenuation_end = (float)(in_lights[c * 8 + 6]); //radius

    }

    for(int d = 0; d < 6; d++) //cull each light based on the distance where it will shine and the frustum defined by the tile

    {

      float e = dot(frustum_planes, (float4)(in_lights[c * 8 + 0], in_lights[c * 8 + 1], in_lights[c * 8 + 2], 1.0f));

      in_frustum = in_frustum && (e >= -attenuation_end);

    }

    if(in_frustum) //if the light is in the frustum, then store its index (if I comment this out, the kernel runs, but doesn't cull lights)

    {

      int index = atomic_inc(&num_of_tile_lights);

      tile_lights[index] = c;

    }

  }

  barrier(CLK_LOCAL_MEM_FENCE);

  /*

   * Per light calculations

   */

  for(int c = 0; c < num_of_tile_lights; c++) //draw each light per tile

  {

      int index = tile_lights; //get back the light index

      float3 light_pos = (float3)(in_lights[index * 8 + 0], in_lights[index * 8 + 1], in_lights[index * 8 + 2]); //gather light data using the index

      float light_radius = in_lights[index * 8 + 6];

      //calculate blinn-phong lighting with custom attenuation

      float3 light_dir = light_pos - raw_depth.xyz;

      float distance = length(light_dir);

      light_dir /= distance;

      float coeff, attenuation;

      if(attenuation_type == 0)

      {

    coeff = max(distance - light_radius, 0.0f) / light_radius + 1.0f;

    attenuation = max((1.0f / (coeff * coeff) - cutoff) / (1.0f - cutoff), 0.0f);

      }

      else

      {

    attenuation = (light_radius - distance) / (light_radius * 0.01f) * 0.01f;

      }

      if(attenuation > 0.0f)

      {

    float3 light_diffuse_color = (float3)(in_lights[index * 8 + 3], in_lights[index * 8 + 4], in_lights[index * 8 + 5]);

    float light_specular_power = (float)in_lights[index * 8 + 3];

    float3 view_dir = normalize(view_pos - raw_depth.xyz);

   

    float3 half_vector = (light_dir + view_dir) * 0.5f;

    float n_dot_l = max(dot(raw_normal.xyz, light_dir), 0.0f);

    out_color.xyz += raw_albedo.xyz * light_diffuse_color * n_dot_l * attenuation;

    float n_dot_h = pow(max(dot(raw_normal.xyz, half_vector), 0.0f), light_specular_power);

    out_color.xyz += light_diffuse_color * n_dot_h * attenuation;

      }

  }

  write_imagef(result, coords, out_color); //write the calculated light data to the result buffer (texture)

}

Any ideas what going wrong?

Best regards,

Yours3!f

0 Likes
1 Solution

This kernel violates the OpenCL specification.

All work-items in a work-group must hit a barrier. However at lines 61-65, you have a conditional data dependent early exit, but a barrier after the early exit at line 137. This produces undefined behavior on the hardware, including hardware hangs.

View solution in original post

6 Replies

This kernel violates the OpenCL specification.

All work-items in a work-group must hit a barrier. However at lines 61-65, you have a conditional data dependent early exit, but a barrier after the early exit at line 137. This produces undefined behavior on the hardware, including hardware hangs.

yours3lf
Adept II

thanks MicahVillmow for the quick reply!

I tried putting a barrier(CLK_LOCAL_MEM_FENCE); after 61-65 but it didn't solve the problem.

0 Likes

yours3lf,

Yeah, because now only the threads that leave early are hitting that barrier and not the other barrier. The only way to fix this is re-organizing your algorithm so that ALL threads hit every barrier.

So instead of doing

if (a) return

....

barrier

...

barrier

...

Do:

if (!a) {

}

barrier

if (!a) {

}

barrier

if (!a) {

}

ok, so I removed the early rejection just to see if it's working, and it didn't freeze (and I got 70FPS, hurray!)

but I still don't understand how should I implement the barriers and conditionals

so that's clear that doing this is wrong because the early rejected threads still wouldn't reach the barrier:

[...]

if(length(raw_normal.xy) == 0.0f)

  {

    write_imagef(result, coords, raw_albedo);

    return;

  }

  barrier(CLK_LOCAL_MEM_FENCE); //threads entering the if conditional are not going to reach this line, because of the return instruction

[...]

so I tried to reorganize the way you suggested, but it doesn't solve anything:

[...]

if(length(raw_normal.xy) != 0.0f)

{

  //logically here would come the lighting part, but then the rejected threads wouldn't reach the barriers at the lighting part...

}

barrier(CLK_LOCAL_MEM_FENCE); //this way the conditional is useless, isn't it?

[...]

or I must have misunderstood something...

0 Likes

What you need is something like this:

  1. __kernel void main(__read_only image2d_t albedo, //diffuse surface color from the g-buffer 
  2.            __read_only image2d_t normals,  //normals encoded using spheremap encoding 
  3.            __read_only image2d_t depth, //linear depth 
  4.            __write_only image2d_t result, //the output buffer that stores lighting data 
  5.            __global const float* far_plane, //the lower left and upper right corners of the far plane 
  6.            __global const float* in_view_pos, //view space camera position 
  7.            __global const float* in_lights, //1024 lights {light pos[3], diffuse_color[3], radius[1], specular intensity[1] } 
  8.            __global const float* in_num_of_lights, //number of incoming lights (1024) 
  9.            __global const float* in_projection_matrix) //the projection matrix is used for frustum culling 
  10.   /*
  11.    * Per pixel calculations (global)
  12.    */ 
  13.  
  14.   int2 coords = (int2)(get_global_id(0), get_global_id(1)); 
  15.   float4 raw_albedo = read_imagef(albedo, the_sampler, coords); 
  16.   float4 raw_normal = read_imagef(normals, the_sampler, coords); //this will store the decoded normals 
  17.  
  18. float4 out_color = (float4)(0.0f); //this will store the resulting color 
  19.   if(length(raw_normal.xy) == 0.0f) 
  20.   { 
  21. out_color = raw_albedo;
  22.   }  else {
  23.  
  24.   int2 global_size = (int2)(get_global_size(0), get_global_size(1)); 
  25.   float4 ll, ur; 
  26.  
  27.   float4 raw_depth = read_imagef(depth, the_sampler, coords); //this will store the decoded view space position 
  28.  
  29.   ll = (float4)(far_plane[0], far_plane[1], far_plane[2], 1.0f); 
  30.   ur = (float4)(far_plane[3], far_plane[4], far_plane[5], 1.0f); 
  31.  
  32.   float2 texel = (float2)((float)(coords.x) / (float)(global_size.x), (float)(coords.y) / (float)(global_size.y)); //texture coordinate [0...1] for input processing 
  33.  
  34.   raw_depth.xyz = decode_linear_depth(raw_depth, (float4)(my_mix(ll.x, ur.x, texel.x), my_mix(ll.y, ur.y, texel.y), ll.z, 1.0f)); 
  35.   raw_normal.xyz = decode_normals_spheremap(raw_normal); 
  36.  
  37. out_color.w = 1.0f; 
  38.  
  39.   /*
  40.    * Per workgroup (tile) calculations (local)
  41.    */ 
  42.  
  43.   __local float3 view_pos; //this will store the view space position (uniform among the workgroups, but stored as local for speedup) 
  44.   __local int num_of_lights; //num of lights (same here) 
  45.  
  46.   __local int2 local_coords; 
  47.   local_coords = (int2)(get_local_id(0), get_local_id(1)); 
  48.   __local int2 local_size; 
  49.   local_size = (int2)(get_local_size(0), get_local_size(1)); 
  50.   __local int workgroup_index; 
  51.   workgroup_index = local_coords.y * local_size.x + local_coords.x; 
  52.  
  53.   __local int tile_lights[1024]; //index of the lights visible per tile 
  54.   __local int num_of_tile_lights; //number of lights per tile 
  55.  
  56.   __local float2 tile_scale; //used for calculating frustum culling, taken from Intel's sample 
  57.   __local float2 tile_bias; 
  58.  
  59.   __local float4 column_1; 
  60.   __local float4 column_2; 
  61.   __local float4 column_4; 
  62.  
  63.   __local float4 frustum_planes[6]; 
  64.  
  65.   if(workgroup_index == 0
  66.   { 
  67.     view_pos = vload3(0, in_view_pos); 
  68.     num_of_lights = (int)in_num_of_lights[0]; 
  69.     num_of_lights = 32
  70.     num_of_tile_lights = 0
  71.  
  72.     //I'm not sure if the maths here is correct due to OpenGL and DirectX using different matrices, but this shouldnt matter 
  73.     tile_scale = (float2)(global_size.x, global_size.y) * (1.0f / (float)(2.0f * local_size.x)); 
  74.     tile_bias = tile_scale - (float2)(local_coords.x, local_coords.y); 
  75.  
  76.     column_1 = (float4)(in_projection_matrix[5] * tile_scale.x, 0.0f, tile_bias.x, 0.0f); 
  77.     column_2 = (float4)(0.0f, -in_projection_matrix[10] * tile_scale.y, tile_bias.y, 0.0f); 
  78.     column_4 = (float4)(0.0f, 0.0f, 1.0f, 0.0f); 
  79.  
  80.     frustum_planes[0] = column_4 - column_1; 
  81.     frustum_planes[1] = column_4 + column_1; 
  82.     frustum_planes[2] = column_4 - column_2; 
  83.     frustum_planes[3] = column_4 + column_2; 
  84.     frustum_planes[4] = (float4)(0.0f, 0.0f, -1.0f, near); 
  85.     frustum_planes[5] = (float4)(0.0f, 0.0f, 1.0f, far); 
  86.  
  87.     for(int c = 0; c < 4; c++) //normalize frustum plane normals 
  88.     { 
  89.       frustum_planes.xyz *= 1.0f / length(frustum_planes.xyz); 
  90.     } 
  91.   } 
  92.   }
  93.   barrier(CLK_LOCAL_MEM_FENCE); 
  94.   if (length(raw_normal.xy != 0.0f) {
  95.   for(int c = workgroup_index; c < num_of_lights; c += local_size.x * local_size.y) //cull each light per tile, each thread in a tile processes one light 
  96.   { 
  97.     bool in_frustum = true
  98.     float attenuation_end = 0.0f; 
  99.  
  100.     if(attenuation_type == 0
  101.     { 
  102.       attenuation_end = (float)(in_lights[c * 8 + 6]) / (float)(cutoff); //radius / cutoff 
  103.     } 
  104.     else 
  105.     { 
  106.       attenuation_end = (float)(in_lights[c * 8 + 6]); //radius 
  107.     } 
  108.  
  109.     for(int d = 0; d < 6; d++) //cull each light based on the distance where it will shine and the frustum defined by the tile 
  110.     { 
  111.       float e = dot(frustum_planes, (float4)(in_lights[c * 8 + 0], in_lights[c * 8 + 1], in_lights[c * 8 + 2], 1.0f)); 
  112.       in_frustum = in_frustum && (e >= -attenuation_end); 
  113.     } 
  114.  
  115.     if(in_frustum) //if the light is in the frustum, then store its index (if I comment this out, the kernel runs, but doesn't cull lights) 
  116.     { 
  117.       int index = atomic_inc(&num_of_tile_lights); 
  118.       tile_lights[index] = c; 
  119.     } 
  120.   } 
  121.   }
  122.   barrier(CLK_LOCAL_MEM_FENCE); 
  123.  
  124.   if (length(raw_normal.xy != 0.0f) {
  125.   /*
  126.    * Per light calculations
  127.    */ 
  128.  
  129.   for(int c = 0; c < num_of_tile_lights; c++) //draw each light per tile 
  130.   { 
  131.       int index = tile_lights; //get back the light index 
  132.       float3 light_pos = (float3)(in_lights[index * 8 + 0], in_lights[index * 8 + 1], in_lights[index * 8 + 2]); //gather light data using the index 
  133.       float light_radius = in_lights[index * 8 + 6]; 
  134.  
  135.       //calculate blinn-phong lighting with custom attenuation 
  136.       float3 light_dir = light_pos - raw_depth.xyz; 
  137.       float distance = length(light_dir); 
  138.       light_dir /= distance; 
  139.  
  140.       float coeff, attenuation; 
  141.  
  142.       if(attenuation_type == 0
  143.       { 
  144.     coeff = max(distance - light_radius, 0.0f) / light_radius + 1.0f; 
  145.     attenuation = max((1.0f / (coeff * coeff) - cutoff) / (1.0f - cutoff), 0.0f); 
  146.       } 
  147.       else 
  148.       { 
  149.     attenuation = (light_radius - distance) / (light_radius * 0.01f) * 0.01f; 
  150.       } 
  151.  
  152.       if(attenuation > 0.0f) 
  153.       { 
  154.     float3 light_diffuse_color = (float3)(in_lights[index * 8 + 3], in_lights[index * 8 + 4], in_lights[index * 8 + 5]); 
  155.     float light_specular_power = (float)in_lights[index * 8 + 3]; 
  156.     float3 view_dir = normalize(view_pos - raw_depth.xyz); 
  157.      
  158.     float3 half_vector = (light_dir + view_dir) * 0.5f; 
  159.     float n_dot_l = max(dot(raw_normal.xyz, light_dir), 0.0f); 
  160.  
  161.     out_color.xyz += raw_albedo.xyz * light_diffuse_color * n_dot_l * attenuation; 
  162.     float n_dot_h = pow(max(dot(raw_normal.xyz, half_vector), 0.0f), light_specular_power); 
  163.     out_color.xyz += light_diffuse_color * n_dot_h * attenuation; 
  164.       } 
  165.   } 
  166. }
  167.  
  168.   write_imagef(result, coords, out_color); //write the calculated light data to the result buffer (texture) 

thanks, now I understand!

I tried it out, and now it works

I shall now proceed to make culling actually work, so that lights are visible xD

Thanks again for the help!!!

0 Likes