AnsweredAssumed Answered

Problem compiling with atom_add and atom_inc (64 bit)

Question asked by ankhster on Nov 11, 2012
Latest reply on Nov 12, 2012 by ankhster

Hi

 

I'm trying to squeeze more out of my kernel by performing synchronisation and using atomic instructions on shared memory. I'm queuing some 32768 work units but only want the results of 32767 of them. The problem I'm experiencing is a "calclCompile failedError: Creating kernel KERNELNAME failed!" error, which brings me to the current state of my code:

 

#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#define DATA_SIZE 588

<snip>

inline void    atomicStat(__local ulong *source, ulong8 data, ulong data2)
{
    // Uncommenting any of the below causes the compile to fail with the above message
//    atom_add(source, data.s0);
//    atom_add(source + 1, data.s1);
//    atom_add(source + 2, data.s2);
//    atom_add(source + 3, data.s3);
//    atom_add(source + 4, data.s4);
//    atom_add(source + 5, data.s5);
//    atom_add(source + 6, data.s6);
//    atom_add(source + 7, data.s7);
//    atom_inc(source + 8 + data.s0);
//    atom_inc(source + 14 + data.s1);
//    atom_inc(source + 20 + data.s2);
//    atom_inc(source + 26 + data.s3);
//    atom_inc(source + 32 + data.s4);
//    atom_inc(source + 38 + data.s5);
//    atom_inc(source + 44 + data.s6);
//    atom_inc(source + 50 + data.s7);
//    atom_inc(source + 56 + data2);
}

__kernel void gnmntAvN8p5(__global int *inVec,
                                                     __local ulong *outBuffer,
                                                     __global ulong *outBuffer2)
{

    ulong8    diff;
    ulong    sumBits;

    int    lid = get_local_id(0);
    int    gid = get_group_id(0);
    int    gbl = get_global_id(0);

    int    x;

    // Initialise if new work group
    if(lid == 0)
    {
        sumBits = 0;
        // This could be done using one loop at the cost of readability
        for(x = 0; x < 4; x++)
        {
            vstore2((ulong2)(sumBits, sumBits), x, outBuffer);
            vstore2((ulong2)(sumBits, sumBits), x, outBuffer + DATA_SIZE);
            vstore2((ulong2)(sumBits, sumBits), x, outBuffer + (2 * DATA_SIZE));
            vstore2((ulong2)(sumBits, sumBits), x, outBuffer + (3 * DATA_SIZE));
            vstore2((ulong2)(sumBits, sumBits), x, outBuffer + (4 * DATA_SIZE));
            vstore2((ulong2)(sumBits, sumBits), x, outBuffer + (5 * DATA_SIZE));
        }
        for(x = 0; x < 24; x++)
        {
            vstore2((ulong2)(sumBits, sumBits), x, outBuffer + 8);
            vstore2((ulong2)(sumBits, sumBits), x, outBuffer + 8 + DATA_SIZE);
            vstore2((ulong2)(sumBits, sumBits), x, outBuffer + 8 + (2 * DATA_SIZE));
            vstore2((ulong2)(sumBits, sumBits), x, outBuffer + 8 + (3 * DATA_SIZE));
            vstore2((ulong2)(sumBits, sumBits), x, outBuffer + 8 + (4 * DATA_SIZE));
            vstore2((ulong2)(sumBits, sumBits), x, outBuffer + 8 + (5 * DATA_SIZE));
        }
        for(x = 0; x < 21; x++)
        {
            vstore2((ulong2)(sumBits, sumBits), x, outBuffer + 56);
            vstore2((ulong2)(sumBits, sumBits), x, outBuffer + 56 + DATA_SIZE);
            vstore2((ulong2)(sumBits, sumBits), x, outBuffer + 56 + (2 * DATA_SIZE));
            vstore2((ulong2)(sumBits, sumBits), x, outBuffer + 56 + (3 * DATA_SIZE));
            vstore2((ulong2)(sumBits, sumBits), x, outBuffer + 56 + (4 * DATA_SIZE));
            vstore2((ulong2)(sumBits, sumBits), x, outBuffer + 56 + (5 * DATA_SIZE));
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE); // Synchronise initialisation

<snip>

    barrier(CLK_LOCAL_MEM_FENCE);
    switch(scenario)
    {
        case 0:
            if((gid < 127) || (lid < 127)) // don't update the 32768th instance
                atomicStat(outBuffer + (0 * DATA_SIZE), diff, sumBits);
            break;
        case 1:
            if((gid < 127) || (lid < 127)) // don't update the 32768th instance
                atomicStat(outBuffer + (1 * DATA_SIZE), diff, sumBits);
            break;
        default:
            if((gid < 127) || (lid < 127)) // don't update the 32768th instance
                atomicStat(outBuffer + (2 * DATA_SIZE), diff, sumBits);
            break;
    }

<snip>

    // I would then do local reduction here before storing to global memory
    barrier(CLK_GLOBAL_MEM_FENCE);
    // etc
}

 

Any help how to resolve this problem would be greatly appreciated.

 

OS:    Win7 x64

IDE:    Visual Studio 2010

GPU:    7970

SDK:    AMD SDK 2.7

CCC:    12.8

Outcomes