I encountered a problem with an AMD card when trying to compile the following kernel:
// Enable byte-granular stores through pointers; the kernel in this file
// writes individual char / uchar elements of __global buffers.
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
// Sampler used for the depth lookup in updateDistanceFieldProj: normalized
// texture coordinates, coordinates clamped to the image edge, nearest-texel
// filtering (no interpolation).
__constant sampler_t proj_sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
// common
// Multiply a 4x4 matrix by a 4-component vector.
// The float16 is read as four consecutive groups of four components
// (s0123, s4567, s89AB, sCDEF); each output component is the dot product of
// one group with vec, in that order (x, y, z, w).
float4 transform(float16 mat, float4 vec)
{
    return (float4)(dot(mat.s0123, vec),
                    dot(mat.s4567, vec),
                    dot(mat.s89AB, vec),
                    dot(mat.sCDEF, vec));
}
// Description of one block of the voxel grid, passed by value to
// updateDistanceFieldProj. That kernel reads only offset, voxelSize,
// blockSizeShift and blockSizeMask; the remaining fields are not used by the
// code visible in this file.
// NOTE(review): in OpenCL C a 3-component vector has the size and alignment
// of the corresponding 4-component vector, so the host-side copy of this
// struct needs matching padding -- confirm against the host code.
struct block_desc
{
int3 blockCoord; /// active block
int3 blockCount; /// num blocks in each direction
float3 offset; /// offset of the block in world coordinates
int3 blockSize; /// size of the voxel grid inside block
int3 blockSizeShift; /// helper to get voxel coordinate by voxel index (right-shift amounts; see calcGridPos)
int3 blockSizeMask; /// helper to get voxel coordinate by voxel index (bit masks; see calcGridPos)
float3 voxelSize; /// size of the voxel of the grid
};
// Decode a linear (1d) voxel index into its (x, y, z) position in the 3d grid.
// x is taken from the bits of i selected by gridSizeMask.x; y and z are taken
// by shifting i right by gridSizeShift.y / gridSizeShift.z and then applying
// the matching mask component. gridSizeShift.x is not used.
// Because the decoding is pure shift-and-mask, it only works for power of 2
// grid sizes.
int3 calcGridPos(uint i,
                 int3 gridSizeShift,
                 int3 gridSizeMask)
{
    const int x = i & gridSizeMask.x;
    const int y = (i >> gridSizeShift.y) & gridSizeMask.y;
    const int z = (i >> gridSizeShift.z) & gridSizeMask.z;
    return (int3)(x, y, z);
}
// Merge one depth map into the stored distance values of a block of voxels.
//
// Each work-item handles one voxel: it computes the voxel centre, transforms
// it by modelViewMatrix and projMatrix, samples the depth map at the projected
// position and, if a depth below 1.0 is found there, blends a new distance
// into voxels[] and writes the weight into voxelsWeights[].
//
// Parameters:
//   normalDepthMap    - image sampled with proj_sampler; only its .w channel
//                       is read here and it is treated as depth
//   numOccupiedVoxels - number of voxels to process; work-items whose global
//                       id is at or beyond this value do nothing
//   blockOffset       - added to the voxel index before addressing voxels[]
//                       and voxelsWeights[]
//   modelViewMatrix   - applied to the voxel centre first
//   projMatrix        - applied to the model-view result to get the
//                       projected position
//   invProjMatrix     - applied to (projected.xy / w, depth, 1) to
//                       reconstruct the surface point
//   block             - grid description; offset, voxelSize, blockSizeShift
//                       and blockSizeMask are read
//   voxels            - distances stored as char, scaled by 64
//   voxelsWeights     - weights stored as uchar, scaled by 255
__kernel
void updateDistanceFieldProj(__read_only image2d_t normalDepthMap,
                             uint numOccupiedVoxels,
                             uint blockOffset,
                             float16 modelViewMatrix,
                             float16 projMatrix,
                             float16 invProjMatrix,
                             struct block_desc block,
                             __global char* voxels,
                             __global uchar* voxelsWeights)
{
    // get_global_id() returns size_t; narrow it explicitly so the comparison
    // below is done on matching uint operands.
    uint voxel = (uint)get_global_id(0);

    // Surplus work-items exit instead of being clamped onto the last voxel:
    // clamping made several work-items read-modify-write the same element
    // concurrently, and underflowed when numOccupiedVoxels was 0.
    // There is no barrier in this kernel, so an early return is safe.
    if (voxel >= numOccupiedVoxels)
    {
        return;
    }

    // compute position in 3d grid and the centre of that voxel
    int3 gridPos = calcGridPos(voxel, block.blockSizeShift, block.blockSizeMask);
    float4 cellCenter = (float4)(block.offset + (convert_float3(gridPos) + 0.5f) * block.voxelSize, 1.0f);
    cellCenter = transform(modelViewMatrix, cellCenter);

    // project in the texture: map projected.xy / w from [-1, 1] to [0, 1]
    float4 projected = transform(projMatrix, cellCenter);
    float2 tcoord = 0.5f * projected.xy / projected.w + 0.5f;
    float depth = read_imagef(normalDepthMap, proj_sampler, tcoord).w;

    // if something is in depth map update distance field
    if (depth < 1.0f)
    {
        // reconstruct position in view space
        projected.xy /= projected.w;
        projected.z = depth;
        projected.w = 1.0f;
        float4 pointOnSurface = transform(invProjMatrix, projected);
        pointOnSurface.xyz /= pointOnSurface.w;

        // use simple distance metric, may be it is better to replace it to point-to-plane
        float dist = (cellCenter.z - pointOnSurface.z) * 0.3f / block.voxelSize.x;

        // merge
        voxel += blockOffset;
        float weightOld = convert_float(voxelsWeights[voxel]) / 255.0f;
        float weightNew = 0.1f;
        float weightSum = weightOld + weightNew;
        if (weightNew > 0.001f)
        {
            // weighted average of the stored and the new distance
            float distOld = convert_float(voxels[voxel]) / 64.0f;
            float distNew = (distOld * weightOld + dist * weightNew) / weightSum;
            voxels[voxel] = convert_char_sat_rte(distNew * 64.0f);
            // Single-precision literal: a bare 255.0 is a double constant and
            // would need cl_khr_fp64, which this file does not enable.
            // NOTE(review): the stored weight is weightNew, not weightSum, so
            // the per-voxel weight never accumulates -- confirm this is intended.
            voxelsWeights[voxel] = convert_uchar_sat_rte(weightNew * 255.0f);
        }
    }
}
The application crashes somewhere in the OpenCL internals. It even crashes in AMD APP KernelAnalyzer. Commenting out the following lines solves the problem:
voxels[voxel] = convert_char_sat_rte(distNew * 64.0f);
voxelsWeights[voxel] = convert_uchar_sat_rte(weightNew * 255.0);
This kernel compiles and works on NVIDIA hardware. Thanks in advance.