6 Replies Latest reply on Feb 2, 2012 1:51 PM by yours3lf

    tile based deferred shading kernel freezes video output

    yours3lf

      Hi,

       

      I'm writing a deferred renderer using the tile based deferred shading approach. I've written my kernel to compute the lighting, but it freezes my video output.

      I guess the video card is overloaded, because only the video output freezes, if I listen to some music in the background, then it keeps playing.

       

      If I remove the atomic operations from the kernel, then it runs fine, but the lights aren't culled this way.

       

      so here's the kernel:

      const float far = -10000.0f; //far plane distance
      const float near = -1.0f; //near plane distance
      const sampler_t the_sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
      const float cutoff = 0.25f; //0.005f
      const int attenuation_type = 0; //linear or full attenuation?
      
      float my_abs(float var) //these floating point operations aren't supported in opencl 1.1
      {
        if(var < 0)
        {
          return -var;
        }
        else
        {
          return var;
        }
      }
      
      float my_mix(float x, float y, float weigth)
      {
        return x * (1.0f - weigth) + y * weigth;
      }
      
      float3 my_reflect(float3 incident, float3 normal)
      {
        return incident - 2.0 * dot(normal, incident) * normal;
      }
      
      float3 decode_normals_spheremap(float4 n) //decode normals from spheremap encoding
      {
        float4 nn = n * (float4)(2.0, 2.0, 0.0, 0.0) + (float4)(-1.0, -1.0, 1.0, -1.0);
        float l = dot(nn.xyz, -nn.xyw);
        nn.z = l;
        nn.xy *= sqrt(l);
        return nn.xyz * 2.0 + (float3)(0.0, 0.0, -1.0);
      }
      
      float3 decode_linear_depth(float4 linear_depth, float4 position) //decode linear depth into view space position
      {
        return (float3)(position.xy * (far / position.z), far) * linear_depth.x;
      }
      
      __kernel void main(__read_only image2d_t albedo, //diffuse surface color from the g-buffer
                 __read_only image2d_t normals,  //normals encoded using spheremap encoding
                 __read_only image2d_t depth, //linear depth
                 __write_only image2d_t result, //the output buffer that stores lighting data
                 __global const float* far_plane, //the lower left and upper right corners of the far plane
                 __global const float* in_view_pos, //view space camera position
                 __global const float* in_lights, //1024 lights {light pos[3], diffuse_color[3], radius[1], specular intensity[1] }
                 __global const float* in_num_of_lights, //number of incoming lights (1024)
                 __global const float* in_projection_matrix) //the projection matrix is used for frustum culling
      {
        /*
         * Per pixel calculations (global)
         */
      
        int2 coords = (int2)(get_global_id(0), get_global_id(1));
        float4 raw_albedo = read_imagef(albedo, the_sampler, coords);
        float4 raw_normal = read_imagef(normals, the_sampler, coords); //this will store the decoded normals
      
        if(length(raw_normal.xy) == 0.0f)
        {
          write_imagef(result, coords, raw_albedo);
          return;
        }
      
        int2 global_size = (int2)(get_global_size(0), get_global_size(1));
        float4 ll, ur;
      
        float4 raw_depth = read_imagef(depth, the_sampler, coords); //this will store the decoded view space position
      
        ll = (float4)(far_plane[0], far_plane[1], far_plane[2], 1.0f);
        ur = (float4)(far_plane[3], far_plane[4], far_plane[5], 1.0f);
      
        float2 texel = (float2)((float)(coords.x) / (float)(global_size.x), (float)(coords.y) / (float)(global_size.y)); //texture coordinate [0...1] for input processing
      
        raw_depth.xyz = decode_linear_depth(raw_depth, (float4)(my_mix(ll.x, ur.x, texel.x), my_mix(ll.y, ur.y, texel.y), ll.z, 1.0f));
        raw_normal.xyz = decode_normals_spheremap(raw_normal);
      
        float4 out_color = (float4)(0.0f); //this will store the resulting color
        out_color.w = 1.0f;
      
        /*
         * Per workgroup (tile) calculations (local)
         */
      
        __local float3 view_pos; //this will store the view space position (uniform among the workgroups, but stored as local for speedup)
        __local int num_of_lights; //num of lights (same here)
      
        __local int2 local_coords;
        local_coords = (int2)(get_local_id(0), get_local_id(1));
        __local int2 local_size;
        local_size = (int2)(get_local_size(0), get_local_size(1));
        __local int workgroup_index;
        workgroup_index = local_coords.y * local_size.x + local_coords.x;
      
        __local int tile_lights[1024]; //index of the lights visible per tile
        __local int num_of_tile_lights; //number of lights per tile
      
        __local float2 tile_scale; //used for calculating frustum culling, taken from Intel's sample
        __local float2 tile_bias;
      
        __local float4 column_1;
        __local float4 column_2;
        __local float4 column_4;
      
        __local float4 frustum_planes[6];
      
        if(workgroup_index == 0)
        {
          view_pos = vload3(0, in_view_pos);
          num_of_lights = (int)in_num_of_lights[0];
          num_of_lights = 32;
          num_of_tile_lights = 0;
      
          //I'm not sure if the maths here is correct due to OpenGL and DirectX using different matrices, but this shouldnt matter
          tile_scale = (float2)(global_size.x, global_size.y) * (1.0f / (float)(2.0f * local_size.x));
          tile_bias = tile_scale - (float2)(local_coords.x, local_coords.y);
      
          column_1 = (float4)(in_projection_matrix[5] * tile_scale.x, 0.0f, tile_bias.x, 0.0f);
          column_2 = (float4)(0.0f, -in_projection_matrix[10] * tile_scale.y, tile_bias.y, 0.0f);
          column_4 = (float4)(0.0f, 0.0f, 1.0f, 0.0f);
      
          frustum_planes[0] = column_4 - column_1;
          frustum_planes[1] = column_4 + column_1;
          frustum_planes[2] = column_4 - column_2;
          frustum_planes[3] = column_4 + column_2;
          frustum_planes[4] = (float4)(0.0f, 0.0f, -1.0f, near);
          frustum_planes[5] = (float4)(0.0f, 0.0f, 1.0f, far);
      
          for(int c = 0; c < 4; c++) //normalize frustum plane normals
          {
            frustum_planes[c].xyz *= 1.0f / length(frustum_planes[c].xyz);
          }
        }
      
        barrier(CLK_LOCAL_MEM_FENCE);
      
        for(int c = workgroup_index; c < num_of_lights; c += local_size.x * local_size.y) //cull each light per tile, each thread in a tile processes one light
        {
          bool in_frustum = true;
          float attenuation_end = 0.0f;
      
          if(attenuation_type == 0)
          {
            attenuation_end = (float)(in_lights[c * 8 + 6]) / (float)(cutoff); //radius / cutoff
          }
          else
          {
            attenuation_end = (float)(in_lights[c * 8 + 6]); //radius
          }
      
          for(int d = 0; d < 6; d++) //cull each light based on the distance where it will shine and the frustum defined by the tile
          {
            float e = dot(frustum_planes[d], (float4)(in_lights[c * 8 + 0], in_lights[c * 8 + 1], in_lights[c * 8 + 2], 1.0f));
            in_frustum = in_frustum && (e >= -attenuation_end);
          }
      
          if(in_frustum) //if the light is in the frustum, then store its index (if I comment this out, the kernel runs, but doesn't cull lights)
          {
            int index = atomic_inc(&num_of_tile_lights);
            tile_lights[index] = c;
          }
        }
      
        barrier(CLK_LOCAL_MEM_FENCE);
      
        /*
         * Per light calculations
         */
      
        for(int c = 0; c < num_of_tile_lights; c++) //draw each light per tile
        {
            int index = tile_lights[c]; //get back the light index
            float3 light_pos = (float3)(in_lights[index * 8 + 0], in_lights[index * 8 + 1], in_lights[index * 8 + 2]); //gather light data using the index
            float light_radius = in_lights[index * 8 + 6];
      
            //calculate blinn-phong lighting with custom attenuation
            float3 light_dir = light_pos - raw_depth.xyz;
            float distance = length(light_dir);
            light_dir /= distance;
      
            float coeff, attenuation;
      
            if(attenuation_type == 0)
            {
          coeff = max(distance - light_radius, 0.0f) / light_radius + 1.0f;
          attenuation = max((1.0f / (coeff * coeff) - cutoff) / (1.0f - cutoff), 0.0f);
            }
            else
            {
          attenuation = (light_radius - distance) / (light_radius * 0.01f) * 0.01f;
            }
      
            if(attenuation > 0.0f)
            {
          float3 light_diffuse_color = (float3)(in_lights[index * 8 + 3], in_lights[index * 8 + 4], in_lights[index * 8 + 5]);
          float light_specular_power = (float)in_lights[index * 8 + 3];
          float3 view_dir = normalize(view_pos - raw_depth.xyz);
          
          float3 half_vector = (light_dir + view_dir) * 0.5f;
          float n_dot_l = max(dot(raw_normal.xyz, light_dir), 0.0f);
      
          out_color.xyz += raw_albedo.xyz * light_diffuse_color * n_dot_l * attenuation;
          float n_dot_h = pow(max(dot(raw_normal.xyz, half_vector), 0.0f), light_specular_power);
          out_color.xyz += light_diffuse_color * n_dot_h * attenuation;
            }
        }
      
        write_imagef(result, coords, out_color); //write the calculated light data to the result buffer (texture)
      }
      

       

      Any ideas what going wrong?

       

      Best regards,

      Yours3!f

        • Re: tile based deferred shading kernel freezes video output
          MicahVillmow

          This kernel violates the OpenCL specification.

          All work-items in a work-group must hit a barrier. However at lines 61-65, you have a conditional data dependent early exit, but a barrier after the early exit at line 137. This produces undefined behavior on the hardware, including hardware hangs.

          • Re: tile based deferred shading kernel freezes video output
            yours3lf

            thanks MicahVillmow for the quick reply!

             

            I tried putting a barrier(CLK_LOCAL_MEM_FENCE); after 61-65 but it didn't solve the problem.

              • Re: tile based deferred shading kernel freezes video output
                MicahVillmow

                yours3lf,

                Yeah, because now only the threads that leave early are hitting that barrier and not the other barrier. The only way to fix this is re-organizing your algorithm so that ALL threads hit every barrier.

                 

                So instead of doing

                if (a) return

                ....

                barrier

                ...

                barrier

                ...

                Do:

                if (!a) {

                }

                barrier

                if (!a) {

                }

                barrier

                if (!a) {

                }

                1 of 1 people found this helpful
                  • Re: tile based deferred shading kernel freezes video output
                    yours3lf

                    ok, so I removed the early rejection just to see if it's working, and it didn't freeze (and I got 70FPS, hurray!)

                     

                    but I still don't understand how should I implement the barriers and conditionals

                     

                    so that's clear that doing this is wrong because the early rejected threads still wouldn't reach the barrier:

                    [...]
                    
                    if(length(raw_normal.xy) == 0.0f)
                      {
                        write_imagef(result, coords, raw_albedo);
                        return;
                      }
                    
                      barrier(CLK_LOCAL_MEM_FENCE); //threads entering the if conditional are not going to reach this line, because of the return instruction
                    
                    [...]
                    

                     

                    so I tried to reorganize the way you suggested, but it doesn't solve anything:

                     

                    [...]
                    
                    if(length(raw_normal.xy) != 0.0f)
                    {
                      //logically here would come the lighting part, but then the rejected threads wouldn't reach the barriers at the lighting part...
                    }
                    barrier(CLK_LOCAL_MEM_FENCE); //this way the conditional is useless, isn't it?
                    
                    [...]
                    

                     

                    or I must have misunderstood something...

                      • Re: tile based deferred shading kernel freezes video output
                        MicahVillmow

                        What you need is something like this:

                        1. __kernel void main(__read_only image2d_t albedo, //diffuse surface color from the g-buffer 
                        2.            __read_only image2d_t normals,  //normals encoded using spheremap encoding 
                        3.            __read_only image2d_t depth, //linear depth 
                        4.            __write_only image2d_t result, //the output buffer that stores lighting data 
                        5.            __global const float* far_plane, //the lower left and upper right corners of the far plane 
                        6.            __global const float* in_view_pos, //view space camera position 
                        7.            __global const float* in_lights, //1024 lights {light pos[3], diffuse_color[3], radius[1], specular intensity[1] } 
                        8.            __global const float* in_num_of_lights, //number of incoming lights (1024) 
                        9.            __global const float* in_projection_matrix) //the projection matrix is used for frustum culling 
                        10.   /*
                        11.    * Per pixel calculations (global)
                        12.    */ 
                        13.  
                        14.   int2 coords = (int2)(get_global_id(0), get_global_id(1)); 
                        15.   float4 raw_albedo = read_imagef(albedo, the_sampler, coords); 
                        16.   float4 raw_normal = read_imagef(normals, the_sampler, coords); //this will store the decoded normals 
                        17.  
                        18. float4 out_color = (float4)(0.0f); //this will store the resulting color 
                        19.   if(length(raw_normal.xy) == 0.0f) 
                        20.   { 
                        21. out_color = raw_albedo;
                        22.   }  else {
                        23.  
                        24.   int2 global_size = (int2)(get_global_size(0), get_global_size(1)); 
                        25.   float4 ll, ur; 
                        26.  
                        27.   float4 raw_depth = read_imagef(depth, the_sampler, coords); //this will store the decoded view space position 
                        28.  
                        29.   ll = (float4)(far_plane[0], far_plane[1], far_plane[2], 1.0f); 
                        30.   ur = (float4)(far_plane[3], far_plane[4], far_plane[5], 1.0f); 
                        31.  
                        32.   float2 texel = (float2)((float)(coords.x) / (float)(global_size.x), (float)(coords.y) / (float)(global_size.y)); //texture coordinate [0...1] for input processing 
                        33.  
                        34.   raw_depth.xyz = decode_linear_depth(raw_depth, (float4)(my_mix(ll.x, ur.x, texel.x), my_mix(ll.y, ur.y, texel.y), ll.z, 1.0f)); 
                        35.   raw_normal.xyz = decode_normals_spheremap(raw_normal); 
                        36.  
                        37. out_color.w = 1.0f; 
                        38.  
                        39.   /*
                        40.    * Per workgroup (tile) calculations (local)
                        41.    */ 
                        42.  
                        43.   __local float3 view_pos; //this will store the view space position (uniform among the workgroups, but stored as local for speedup) 
                        44.   __local int num_of_lights; //num of lights (same here) 
                        45.  
                        46.   __local int2 local_coords; 
                        47.   local_coords = (int2)(get_local_id(0), get_local_id(1)); 
                        48.   __local int2 local_size; 
                        49.   local_size = (int2)(get_local_size(0), get_local_size(1)); 
                        50.   __local int workgroup_index; 
                        51.   workgroup_index = local_coords.y * local_size.x + local_coords.x; 
                        52.  
                        53.   __local int tile_lights[1024]; //index of the lights visible per tile 
                        54.   __local int num_of_tile_lights; //number of lights per tile 
                        55.  
                        56.   __local float2 tile_scale; //used for calculating frustum culling, taken from Intel's sample 
                        57.   __local float2 tile_bias; 
                        58.  
                        59.   __local float4 column_1; 
                        60.   __local float4 column_2; 
                        61.   __local float4 column_4; 
                        62.  
                        63.   __local float4 frustum_planes[6]; 
                        64.  
                        65.   if(workgroup_index == 0
                        66.   { 
                        67.     view_pos = vload3(0, in_view_pos); 
                        68.     num_of_lights = (int)in_num_of_lights[0]; 
                        69.     num_of_lights = 32
                        70.     num_of_tile_lights = 0
                        71.  
                        72.     //I'm not sure if the maths here is correct due to OpenGL and DirectX using different matrices, but this shouldnt matter 
                        73.     tile_scale = (float2)(global_size.x, global_size.y) * (1.0f / (float)(2.0f * local_size.x)); 
                        74.     tile_bias = tile_scale - (float2)(local_coords.x, local_coords.y); 
                        75.  
                        76.     column_1 = (float4)(in_projection_matrix[5] * tile_scale.x, 0.0f, tile_bias.x, 0.0f); 
                        77.     column_2 = (float4)(0.0f, -in_projection_matrix[10] * tile_scale.y, tile_bias.y, 0.0f); 
                        78.     column_4 = (float4)(0.0f, 0.0f, 1.0f, 0.0f); 
                        79.  
                        80.     frustum_planes[0] = column_4 - column_1; 
                        81.     frustum_planes[1] = column_4 + column_1; 
                        82.     frustum_planes[2] = column_4 - column_2; 
                        83.     frustum_planes[3] = column_4 + column_2; 
                        84.     frustum_planes[4] = (float4)(0.0f, 0.0f, -1.0f, near); 
                        85.     frustum_planes[5] = (float4)(0.0f, 0.0f, 1.0f, far); 
                        86.  
                        87.     for(int c = 0; c < 4; c++) //normalize frustum plane normals 
                        88.     { 
                        89.       frustum_planes[c].xyz *= 1.0f / length(frustum_planes[c].xyz); 
                        90.     } 
                        91.   } 
                        92.   }
                        93.   barrier(CLK_LOCAL_MEM_FENCE); 
                        94.   if (length(raw_normal.xy != 0.0f) {
                        95.   for(int c = workgroup_index; c < num_of_lights; c += local_size.x * local_size.y) //cull each light per tile, each thread in a tile processes one light 
                        96.   { 
                        97.     bool in_frustum = true
                        98.     float attenuation_end = 0.0f; 
                        99.  
                        100.     if(attenuation_type == 0
                        101.     { 
                        102.       attenuation_end = (float)(in_lights[c * 8 + 6]) / (float)(cutoff); //radius / cutoff 
                        103.     } 
                        104.     else 
                        105.     { 
                        106.       attenuation_end = (float)(in_lights[c * 8 + 6]); //radius 
                        107.     } 
                        108.  
                        109.     for(int d = 0; d < 6; d++) //cull each light based on the distance where it will shine and the frustum defined by the tile 
                        110.     { 
                        111.       float e = dot(frustum_planes[d], (float4)(in_lights[c * 8 + 0], in_lights[c * 8 + 1], in_lights[c * 8 + 2], 1.0f)); 
                        112.       in_frustum = in_frustum && (e >= -attenuation_end); 
                        113.     } 
                        114.  
                        115.     if(in_frustum) //if the light is in the frustum, then store its index (if I comment this out, the kernel runs, but doesn't cull lights) 
                        116.     { 
                        117.       int index = atomic_inc(&num_of_tile_lights); 
                        118.       tile_lights[index] = c; 
                        119.     } 
                        120.   } 
                        121.   }
                        122.   barrier(CLK_LOCAL_MEM_FENCE); 
                        123.  
                        124.   if (length(raw_normal.xy != 0.0f) {
                        125.   /*
                        126.    * Per light calculations
                        127.    */ 
                        128.  
                        129.   for(int c = 0; c < num_of_tile_lights; c++) //draw each light per tile 
                        130.   { 
                        131.       int index = tile_lights[c]; //get back the light index 
                        132.       float3 light_pos = (float3)(in_lights[index * 8 + 0], in_lights[index * 8 + 1], in_lights[index * 8 + 2]); //gather light data using the index 
                        133.       float light_radius = in_lights[index * 8 + 6]; 
                        134.  
                        135.       //calculate blinn-phong lighting with custom attenuation 
                        136.       float3 light_dir = light_pos - raw_depth.xyz; 
                        137.       float distance = length(light_dir); 
                        138.       light_dir /= distance; 
                        139.  
                        140.       float coeff, attenuation; 
                        141.  
                        142.       if(attenuation_type == 0
                        143.       { 
                        144.     coeff = max(distance - light_radius, 0.0f) / light_radius + 1.0f; 
                        145.     attenuation = max((1.0f / (coeff * coeff) - cutoff) / (1.0f - cutoff), 0.0f); 
                        146.       } 
                        147.       else 
                        148.       { 
                        149.     attenuation = (light_radius - distance) / (light_radius * 0.01f) * 0.01f; 
                        150.       } 
                        151.  
                        152.       if(attenuation > 0.0f) 
                        153.       { 
                        154.     float3 light_diffuse_color = (float3)(in_lights[index * 8 + 3], in_lights[index * 8 + 4], in_lights[index * 8 + 5]); 
                        155.     float light_specular_power = (float)in_lights[index * 8 + 3]; 
                        156.     float3 view_dir = normalize(view_pos - raw_depth.xyz); 
                        157.      
                        158.     float3 half_vector = (light_dir + view_dir) * 0.5f; 
                        159.     float n_dot_l = max(dot(raw_normal.xyz, light_dir), 0.0f); 
                        160.  
                        161.     out_color.xyz += raw_albedo.xyz * light_diffuse_color * n_dot_l * attenuation; 
                        162.     float n_dot_h = pow(max(dot(raw_normal.xyz, half_vector), 0.0f), light_specular_power); 
                        163.     out_color.xyz += light_diffuse_color * n_dot_h * attenuation; 
                        164.       } 
                        165.   } 
                        166. }
                        167.  
                        168.   write_imagef(result, coords, out_color); //write the calculated light data to the result buffer (texture) 
                        1 of 1 people found this helpful