AnsweredAssumed Answered

kernel freezes video output... again

Question asked by yours3lf on Feb 6, 2012
Latest reply on Mar 20, 2012 by yours3lf

Hi,

 

I tried to get the culling working today, but I ran into the same freezing issue, but this time everything seems to be working the scene goes on with ~70fps, but after a few seconds it freezes (as opposed to immediate freezing...)

 

I'm not sure if it's related to the same problem because all the barriers must be hit by every work-item, since I used the method MicahVillmow showed me here: http://devgurus.amd.com/thread/158555

 

So this is how my kernel looks like:

 

__constant float far = -10000.0f; //far plane distance
__constant float near = -1.0f; //near plane distance
__constant sampler_t the_sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
__constant float cutoff = 0.25f; //0.005f
__constant int attenuation_type = 0; //linear or full attenuation?

float my_abs( float var ) //these floating point operations aren't supported in opencl 1.1
{
  if ( var < 0 )
  {
    return -var;
  }
  else
  {
    return var;
  }
}

float my_mix( float x, float y, float weigth )
{
  return x * ( 1.0f - weigth ) + y * weigth;
}

float3 my_reflect( float3 incident, float3 normal )
{
  return incident - 2.0f * dot( normal, incident ) * normal;
}

float3 decode_normals_spheremap( float4 n ) //decode normals from spheremap encoding
{
  float4 nn = n * ( float4 )( 2.0f, 2.0f, 0.0f, 0.0f ) + ( float4 )( -1.0f, -1.0f, 1.0f, -1.0f );
  float l = dot( nn.xyz, -nn.xyw );
  nn.z = l;
  nn.xy *= sqrt( l );
  return nn.xyz * 2.0f + ( float3 )( 0.0f, 0.0f, -1.0f );
}

float3 decode_linear_depth( float4 linear_depth, float4 position ) //decode linear depth into view space position
{
  return ( float3 )( position.xy * ( far / position.z ), far ) * linear_depth.x;
}

__kernel void main( __read_only image2d_t albedo, //diffuse surface color from the g-buffer
                    __read_only image2d_t normals, //normals encoded using spheremap encoding
                    __read_only image2d_t depth, //linear depth
                    __write_only image2d_t result, //the output buffer that stores lighting data
                    __global const float* far_plane, //the lower left and upper right corners of the far plane
                    __global const float* in_view_pos, //view space camera position
                    __global const float* in_lights, //1024 lights {light pos[3], diffuse_color[3], radius[1], specular intensity[1] }
                    __global const float* in_num_of_lights, //number of incoming lights (1024)
                    __global const float* in_projection_matrix ) //the projection matrix is used for frustum culling
{
  /*
   * Per pixel calculations (global)
   */

  int2 coords = ( int2 )( get_global_id( 0 ), get_global_id( 1 ) );
  float4 raw_albedo = read_imagef( albedo, the_sampler, coords );
  float4 raw_normal = read_imagef( normals, the_sampler, coords ); //this will store the decoded normals

  float4 out_color = ( float4 )( 0.0f ); //this will store the resulting color
  out_color.w = 1.0f;

  float4 raw_depth; //this will store the decoded view space position
  int2 global_size = ( int2 )( get_global_size( 0 ), get_global_size( 1 ) );

  /*
   * Per tile data
   */

  float3 view_pos; //this will store the view space position (uniform among the workgroups, but stored as local for speedup)
  int num_of_lights; //num of lights (same here)

  int2 local_coords;
  int2 local_size;
  int workgroup_index;

  __local int tile_lights[1024]; //index of the lights visible per tile
  __local int num_of_tile_lights; //number of lights per tile

  float2 tile_scale; //used for calculating frustum culling, taken from Intel's sample
  float2 tile_bias;

  float4 column_1;
  float4 column_2;
  float4 column_4;

  float4 frustum_planes[6];

  /*
   * Check for skybox
   */

  bool early_rejection = ( length( raw_normal.xy ) == 0.0f );

  if ( early_rejection )
  {
    out_color = raw_albedo;
  }
  else
  {
    local_coords = ( int2 )( get_local_id( 0 ), get_local_id( 1 ) );
    local_size = ( int2 )( get_local_size( 0 ), get_local_size( 1 ) );

    workgroup_index = local_coords.y * local_size.x + local_coords.x;

    float4 ll, ur;

    raw_depth = read_imagef( depth, the_sampler, coords );

    ll = ( float4 )( far_plane[0], far_plane[1], far_plane[2], 1.0f );
    ur = ( float4 )( far_plane[3], far_plane[4], far_plane[5], 1.0f );

    //texture coordinate [0...1] for input processing
    float2 texel = ( float2 )(( float )( coords.x ) / ( float )( global_size.x ), ( float )( coords.y ) / ( float )( global_size.y ) );

    raw_depth.xyz = decode_linear_depth( raw_depth, ( float4 )( my_mix( ll.x, ur.x, texel.x ), my_mix( ll.y, ur.y, texel.y ), ll.z, 1.0f ) );
    raw_normal.xyz = decode_normals_spheremap( raw_normal );

    view_pos = vload3( 0, in_view_pos );
    num_of_lights = ( int )in_num_of_lights[0];

    //I'm not sure if the maths here is correct due to OpenGL and DirectX using different matrices, but this shouldnt matter
    tile_scale = ( float2 )( global_size.x, global_size.y ) * ( 1.0f / ( float )( 2.0f * local_size.x ) );
    tile_bias = tile_scale - ( float2 )( local_coords.x, local_coords.y );

    column_1 = ( float4 )( in_projection_matrix[5] * tile_scale.x, 0.0f, tile_bias.x, 0.0f );
    column_2 = ( float4 )( 0.0f, -in_projection_matrix[10] * tile_scale.y, tile_bias.y, 0.0f );
    column_4 = ( float4 )( 0.0f, 0.0f, 1.0f, 0.0f );

    frustum_planes[0] = column_4 - column_1;
    frustum_planes[1] = column_4 + column_1;
    frustum_planes[2] = column_4 - column_2;
    frustum_planes[3] = column_4 + column_2;
    frustum_planes[4] = ( float4 )( 0.0f, 0.0f, -1.0f, near );
    frustum_planes[5] = ( float4 )( 0.0f, 0.0f, 1.0f, far );

    for ( int c = 0; c < 4; c++ ) //normalize frustum plane normals
    {
      frustum_planes[c].xyz *= 1.0f / length( frustum_planes[c].xyz );
    }

    /*
    * Per workgroup (tile) calculations (local)
    */

    if ( workgroup_index == 0 )
    {
      num_of_tile_lights = 0;
    }
  }

  barrier( CLK_LOCAL_MEM_FENCE );

  if ( !early_rejection )
  {
    for ( int c = workgroup_index; c < num_of_lights; c += local_size.x * local_size.y ) //cull each light per tile, each thread in a tile processes one light
    {
      if ( c < num_of_lights )
      {
        bool in_frustum = true;
        float attenuation_end = 0.0f;

        if ( attenuation_type == 0 )
        {
          attenuation_end = ( float )( in_lights[c * 8 + 6] ) / ( float )( cutoff ); //radius / cutoff
        }
        else
        {
          attenuation_end = ( float )( in_lights[c * 8 + 6] ); //radius
        }

        for ( int d = 0; d < 6; d++ ) //cull each light based on the distance where it will shine and the frustum defined by the tile
        {
          float e = dot( frustum_planes[d], ( float4 )( in_lights[c * 8 + 0], in_lights[c * 8 + 1], in_lights[c * 8 + 2], 1.0f ) );
          in_frustum = in_frustum && ( e >= -attenuation_end );
        }

        if ( in_frustum ) //if the light is in the frustum, then store its index (if I comment this out, the kernel runs, but doesn't cull lights)
        {
          int index = atomic_inc( &num_of_tile_lights );
          tile_lights[index] = c;
          if ( num_of_tile_lights != 0 )
          {
            out_color = ( float4 )( 0.0f, 1.0f, 0.0f, 1.0f );
          }
        }
      }
    }
  }

  barrier( CLK_LOCAL_MEM_FENCE );

  /*
   * Per light calculations
   */

  if ( !early_rejection )
  {
    for ( int c = 0; c < num_of_tile_lights; c++ ) //draw each light per tile
    {
      int index = tile_lights[c]; //get back the light index
      float3 light_pos = ( float3 )( in_lights[index * 8 + 0], in_lights[index * 8 + 1], in_lights[index * 8 + 2] ); //gather light data using the index
      float light_radius = in_lights[index * 8 + 6];

      //calculate blinn-phong lighting with custom attenuation
      float3 light_dir = light_pos - raw_depth.xyz;
      float distance = length( light_dir );
      light_dir /= distance;

      float coeff, attenuation;

      if ( attenuation_type == 0 )
      {
        coeff = max( distance - light_radius, 0.0f ) / light_radius + 1.0f;
        attenuation = max(( 1.0f / ( coeff * coeff ) - cutoff ) / ( 1.0f - cutoff ), 0.0f );
      }
      else
      {
        attenuation = ( light_radius - distance ) / ( light_radius * 0.01f ) * 0.01f;
      }

      if ( attenuation > 0.0f )
      {
        float3 light_diffuse_color = ( float3 )( in_lights[index * 8 + 3], in_lights[index * 8 + 4], in_lights[index * 8 + 5] );
        float light_specular_power = ( float )in_lights[index * 8 + 3];
        float3 view_dir = normalize( view_pos - raw_depth.xyz );

        float3 half_vector = ( light_dir + view_dir ) * 0.5f;
        float n_dot_l = max( dot( raw_normal.xyz, light_dir ), 0.0f );

        out_color.xyz += raw_albedo.xyz * light_diffuse_color * n_dot_l * attenuation;
        float n_dot_h = pow( max( dot( raw_normal.xyz, half_vector ), 0.0f ), light_specular_power );
        out_color.xyz += light_diffuse_color * n_dot_h * attenuation;
      } 
    }
  }

  if ( coords.x < global_size.x && coords.y < global_size.y )
  {
    write_imagef( result, coords, out_color ); //write the calculated light data to the result buffer (texture)
  }
} 

 

Best regards,

Yours3lf

Outcomes