dikobraz

Driver crashes when trying to compile a kernel

Discussion created by dikobraz on Mar 6, 2012

I encountered a problem with an AMD card when trying to compile the following kernel:

 

// Enable the byte-addressable-store extension: the kernel below writes single
// char / uchar elements through its __global voxels and voxelsWeights pointers.
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable

 

// Sampler used for the depth-map lookup in updateDistanceFieldProj:
// normalized coordinates, clamp-to-edge addressing, nearest-texel filtering.
__constant sampler_t proj_sampler = CLK_NORMALIZED_COORDS_TRUE  | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

 

// common

// Multiply a 4x4 matrix by a 4-component vector.
// The matrix is packed into a float16 row by row: components s0..s3 form the
// first row, s4..s7 the second, s8..sB the third and sC..sF the fourth.
// Each output component is the dot product of one row with vec.
float4 transform(float16 mat, float4 vec)
{
    const float4 row0 = mat.s0123;
    const float4 row1 = mat.s4567;
    const float4 row2 = mat.s89AB;
    const float4 row3 = mat.sCDEF;

    return (float4)(dot(row0, vec),
                    dot(row1, vec),
                    dot(row2, vec),
                    dot(row3, vec));
}

 

/// Description of the voxel block being processed; passed by value to the
/// updateDistanceFieldProj kernel.
///
/// NOTE(review): in OpenCL C, 3-component vector types are sized and aligned
/// like their 4-component counterparts, so a host-side mirror of this struct
/// presumably has to pad each member to 16 bytes -- confirm against the host code.

struct block_desc

{

    int3   blockCoord;        /// active block (not read by the kernel code in this listing)

    int3   blockCount;        /// number of blocks in each direction (not read by the kernel code in this listing)

    float3 offset;            /// offset of the block in world coordinates; added to the scaled voxel position

    int3   blockSize;        /// size of the voxel grid inside the block (not read by the kernel code in this listing)

    int3   blockSizeShift;    /// per-axis right shift applied to the linear voxel index by calcGridPos (only .y and .z are used there)

    int3   blockSizeMask;    /// per-axis bit mask applied to the (shifted) linear voxel index by calcGridPos

    float3 voxelSize;        /// size of one voxel of the grid; scales grid coordinates, and .x normalizes the distance

};

 

// compute position in 3d grid from 1d index

// only works for power of 2 sizes

// Decompose the linear index i into a 3D grid coordinate.
// x is taken from the low bits of i using gridSizeMask.x alone (no shift is
// applied, so gridSizeShift.x is not read); y and z are obtained by shifting
// i right by the matching gridSizeShift component and then masking.
// As noted above, this only works for power-of-two grid sizes.
int3 calcGridPos(uint i, int3 gridSizeShift, int3 gridSizeMask)
{
    const uint xBits = i & gridSizeMask.x;
    const uint yBits = (i >> gridSizeShift.y) & gridSizeMask.y;
    const uint zBits = (i >> gridSizeShift.z) & gridSizeMask.z;

    return (int3)((int)xBits, (int)yBits, (int)zBits);
}

 

/**
 * Merge one depth-map observation into the distance field of a voxel block.
 *
 * Each work-item handles one voxel: it computes the voxel centre, transforms it
 * with modelViewMatrix, projects it with projMatrix, samples the depth map at
 * the projected position and, if the sample holds a surface (depth < 1.0),
 * blends the distance between voxel centre and surface into the stored value.
 *
 * @param normalDepthMap     image whose .w channel is read as depth
 * @param numOccupiedVoxels  number of voxels to process; work-items with a
 *                           global id at or beyond this count do nothing
 * @param blockOffset        added to the voxel index to address voxels and
 *                           voxelsWeights
 * @param modelViewMatrix    4x4 matrix, row by row, applied to the voxel centre
 * @param projMatrix         4x4 matrix, row by row, applied after modelViewMatrix
 * @param invProjMatrix      4x4 matrix, row by row, used to bring the sampled
 *                           surface point back from projected space
 * @param block              layout of the voxel block (offset, voxel size,
 *                           index shift/mask)
 * @param voxels             distance values, stored as char scaled by 64
 * @param voxelsWeights      weights, stored as uchar scaled by 255
 */
__kernel
void updateDistanceFieldProj(__read_only image2d_t  normalDepthMap,
                             uint                   numOccupiedVoxels,
                             uint                   blockOffset,
                             float16                modelViewMatrix,
                             float16                projMatrix,
                             float16                invProjMatrix,
                             struct block_desc      block,
                             __global char*         voxels,
                             __global uchar*        voxelsWeights)
{
    // Surplus work-items leave right away. Clamping them onto the last voxel
    // instead would make several work-items read-modify-write the same element
    // concurrently, and `numOccupiedVoxels - 1` wraps around when the count is 0.
    // get_global_id() returns size_t, so it is compared first and narrowed
    // explicitly afterwards.
    const size_t gid = get_global_id(0);
    if (gid >= numOccupiedVoxels)
    {
        return;
    }
    uint voxel = (uint)gid;

    // compute position in 3d grid (centre of the voxel, hence the +0.5)
    int3   gridPos    = calcGridPos(voxel, block.blockSizeShift, block.blockSizeMask);
    float4 cellCenter = (float4)(block.offset + (convert_float3(gridPos) + 0.5f) * block.voxelSize, 1.0f);
    cellCenter        = transform(modelViewMatrix, cellCenter);

    // project into the texture: perspective divide, then map [-1, 1] to [0, 1]
    float4 projected  = transform(projMatrix, cellCenter);
    float2 tcoord     = 0.5f * projected.xy / projected.w + 0.5f;
    float  depth      = read_imagef(normalDepthMap, proj_sampler, tcoord).w;

    // if something is in the depth map, update the distance field
    if (depth < 1.0f)
    {
        // reconstruct the surface position in view space
        projected.xy /= projected.w;
        projected.z   = depth;
        projected.w   = 1.0f;
        float4 pointOnSurface = transform(invProjMatrix, projected);
        pointOnSurface.xyz /= pointOnSurface.w;

        // simple distance metric along z; a point-to-plane metric might be a
        // better replacement
        float dist = (cellCenter.z - pointOnSurface.z) * 0.3f / block.voxelSize.x;

        // merge with the stored value as a weighted average
        voxel += blockOffset;
        float weightOld = convert_float(voxelsWeights[voxel]) / 255.0f;
        float weightNew = 0.1f;
        float weightSum = weightOld + weightNew;
        // weightNew is currently a constant, so this guard always passes
        if (weightNew > 0.001f)
        {
            float distOld = convert_float(voxels[voxel]) / 64.0f;
            float distNew = (distOld * weightOld + dist * weightNew) / weightSum;

            voxels[voxel]        = convert_char_sat_rte(distNew * 64.0f);
            // Single-precision literal: a bare 255.0 is a double constant, and
            // double support is optional in OpenCL C.
            // NOTE(review): the stored weight is weightNew, not weightSum, so
            // the weight never accumulates -- confirm this is intended.
            voxelsWeights[voxel] = convert_uchar_sat_rte(weightNew * 255.0f);
        }
    }
}

 

The application crashes somewhere in the OpenCL internals. It even crashes in AMD APP KernelAnalyzer. Commenting out the following lines solves the problem:

           

voxels[voxel]        = convert_char_sat_rte(distNew * 64.0f);

voxelsWeights[voxel] = convert_uchar_sat_rte(weightNew * 255.0);

 

This kernel compiles and works on NVIDIA hardware. Thanks in advance.

Outcomes