Hi
I'm trying to squeeze more out of my kernel by performing synchronisation and using atomic instructions on shared memory. I'm queuing some 32768 work units but only want the results of 32767 of them. The problem I'm experiencing is a "calclCompile failedError: Creating kernel KERNELNAME failed!" error, which brings me to the current state of my code:
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#define DATA_SIZE 588
<snip>
inline void atomicStat(__local ulong *source, ulong8 data, ulong data2)
{
// Uncommenting any of the below causes the compile to fail with the above message
// atom_add(source, data.s0);
// atom_add(source + 1, data.s1);
// atom_add(source + 2, data.s2);
// atom_add(source + 3, data.s3);
// atom_add(source + 4, data.s4);
// atom_add(source + 5, data.s5);
// atom_add(source + 6, data.s6);
// atom_add(source + 7, data.s7);
// atom_inc(source + 8 + data.s0);
// atom_inc(source + 14 + data.s1);
// atom_inc(source + 20 + data.s2);
// atom_inc(source + 26 + data.s3);
// atom_inc(source + 32 + data.s4);
// atom_inc(source + 38 + data.s5);
// atom_inc(source + 44 + data.s6);
// atom_inc(source + 50 + data.s7);
// atom_inc(source + 56 + data2);
}
__kernel void gnmntAvN8p5(__global int *inVec,
__local ulong *outBuffer,
__global ulong *outBuffer2)
{
ulong8 diff;
ulong sumBits;
int lid = get_local_id(0);
int gid = get_group_id(0);
int gbl = get_global_id(0);
int x;
// Initialise if new work group
if(lid == 0)
{
sumBits = 0;
// This could be done using one loop at the cost of readability
for(x = 0; x < 4; x++)
{
vstore2((ulong2)(sumBits, sumBits), x, outBuffer);
vstore2((ulong2)(sumBits, sumBits), x, outBuffer + DATA_SIZE);
vstore2((ulong2)(sumBits, sumBits), x, outBuffer + (2 * DATA_SIZE));
vstore2((ulong2)(sumBits, sumBits), x, outBuffer + (3 * DATA_SIZE));
vstore2((ulong2)(sumBits, sumBits), x, outBuffer + (4 * DATA_SIZE));
vstore2((ulong2)(sumBits, sumBits), x, outBuffer + (5 * DATA_SIZE));
}
for(x = 0; x < 24; x++)
{
vstore2((ulong2)(sumBits, sumBits), x, outBuffer + 8);
vstore2((ulong2)(sumBits, sumBits), x, outBuffer + 8 + DATA_SIZE);
vstore2((ulong2)(sumBits, sumBits), x, outBuffer + 8 + (2 * DATA_SIZE));
vstore2((ulong2)(sumBits, sumBits), x, outBuffer + 8 + (3 * DATA_SIZE));
vstore2((ulong2)(sumBits, sumBits), x, outBuffer + 8 + (4 * DATA_SIZE));
vstore2((ulong2)(sumBits, sumBits), x, outBuffer + 8 + (5 * DATA_SIZE));
}
for(x = 0; x < 21; x++)
{
vstore2((ulong2)(sumBits, sumBits), x, outBuffer + 56);
vstore2((ulong2)(sumBits, sumBits), x, outBuffer + 56 + DATA_SIZE);
vstore2((ulong2)(sumBits, sumBits), x, outBuffer + 56 + (2 * DATA_SIZE));
vstore2((ulong2)(sumBits, sumBits), x, outBuffer + 56 + (3 * DATA_SIZE));
vstore2((ulong2)(sumBits, sumBits), x, outBuffer + 56 + (4 * DATA_SIZE));
vstore2((ulong2)(sumBits, sumBits), x, outBuffer + 56 + (5 * DATA_SIZE));
}
}
barrier(CLK_LOCAL_MEM_FENCE); // Synchronise initialisation
<snip>
barrier(CLK_LOCAL_MEM_FENCE);
switch(scenario)
{
case 0:
if((gid < 127) || (lid < 127)) // don't update the 32768th instance
atomicStat(outBuffer + (0 * DATA_SIZE), diff, sumBits);
break;
case 1:
if((gid < 127) || (lid < 127)) // don't update the 32768th instance
atomicStat(outBuffer + (1 * DATA_SIZE), diff, sumBits);
break;
default:
if((gid < 127) || (lid < 127)) // don't update the 32768th instance
atomicStat(outBuffer + (2 * DATA_SIZE), diff, sumBits);
break;
}
<snip>
// I would then do local reduction here before storing to global memory
barrier(CLK_GLOBAL_MEM_FENCE);
// etc
}
Any help how to resolve this problem would be greatly appreciated.
OS: Win7 x64
IDE: Visual Studio 2010
GPU: 7970
SDK: AMD SDK 2.7
CCC: 12.8
Solved! Go to Solution.
I managed to resolve the issue by updating CCC to 12.10 and I also installed CodeXL, which contains version 2 of Kernel Analyzer. Now I just have to deal with all the mangled code I'm left with at the all the attempts to track and resolve this problem.
While cl_khr_int64_base_atomics is not an extension reported by the device, it can be invoked by sending -Dcl_khr_int64_base_atomics as a prameter when calling clBuildProgram and used within your code after declaring #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
Hope that helps you too.
I too have a 7970 and my device does not show cl_khr_int64_base_atomics extension as supported. From OpenCL spec, it seems it is a optional extension, which is currently not supported by AMD cards.
Maybe you should try 32-bit atomics. Does your code work on CPU? CPU shows up the extension as supported.
MY CLINFO OUTPUT:
Name: Tahiti
Vendor: Advanced Micro Devices, Inc.
Device OpenCL C version: OpenCL C 1.2
Driver version: CAL 1.4.1741 (VM)
Profile: FULL_PROFILE
Version: OpenCL 1.2 AMD-APP (938.2)
Extensions: cl_khr_fp64 cl_amd_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_gl_sharing cl_ext_atomic_counters_32 cl_amd_device_attribute_query cl_amd_vec3 cl_amd_printf cl_amd_media_ops cl_amd_popcnt
I managed to resolve the issue by updating CCC to 12.10 and I also installed CodeXL, which contains version 2 of Kernel Analyzer. Now I just have to deal with all the mangled code I'm left with at the all the attempts to track and resolve this problem.
While cl_khr_int64_base_atomics is not an extension reported by the device, it can be invoked by sending -Dcl_khr_int64_base_atomics as a prameter when calling clBuildProgram and used within your code after declaring #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
Hope that helps you too.