Test machine:
OS: 64-bit Windows 7
CPU: Intel Core 2 Duo E8400
GPU 1: AMD Radeon 6990 (dual Cayman) with AMD_Catalyst_11.5a_Hotfix_8.85.6RC2_Win7_May13
GPU 2: NVIDIA GeForce GTX 580 with driver 275.33
// BUG 1: AMD FAIL, NVIDIA OK
// Strangely, adding up some zeroes in a loop causes the entire function
// to become a NOP and nothing is written to *output.
// Tested on Cayman GPU (Radeon HD 6990): FAIL (output buffer not written)
// Tested on Geforce GTX 580: OK (0x600D written to output buffer)
// All-zero table in the __constant address space; bug1 adds its elements to x.
__constant uint zeroes[16] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };

// Reproducer for BUG 1.
//
// Writes 0x600D to *output, then sums sixteen zeroes taken pairwise from a
// private array and the __constant table. Since every term is zero, x stays 0
// and the final 0xBAD1 store must never execute: the expected result is
// *output == 0x600D.
//
// The array subscripts ([i]) were swallowed by the forum software in the
// original post; they are restored here. Without them the expression adds two
// pointers, which is not valid and is not what the test was meant to do.
//
// output: single uint result slot in global memory.
__kernel void bug1(__global uint *output)
{
    uint empty[16] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };

    *output = 0x600D; // this never happens on AMD

    uint x = 0;
    uint i = 0;
    // Deliberately unrolled by hand, eight steps per pass (two passes for
    // i = 0..15). The shape of this loop is part of the reproducer; do not
    // collapse it into a plain for loop.
    while (i < 16) {
        x += empty[i] + zeroes[i]; i+=1;
        x += empty[i] + zeroes[i]; i+=1;
        x += empty[i] + zeroes[i]; i+=1;
        x += empty[i] + zeroes[i]; i+=1;
        x += empty[i] + zeroes[i]; i+=1;
        x += empty[i] + zeroes[i]; i+=1;
        x += empty[i] + zeroes[i]; i+=1;
        x += empty[i] + zeroes[i]; i+=1;
    }

    if (x == 42) // keep optimizer from removing everything
        *output = 0xBAD1;
}
// BUG 2A: AMD FAIL, NVIDIA FAIL
// Comparison with an uninitialized variable is always true.
// Please make "value used before set" an error, not a warning.
// You don't notice warnings when calling clBuildProgram()
// and what happens here makes no sense.
// Reproducer for BUG 2A: compares a deliberately uninitialized local with a
// literal. Stores 0x600D to *output first, then overwrites it with 0xBAD2 if
// the comparison succeeds.
// output: single uint result slot in global memory.
__kernel void bug2a(__global uint *output)
{
// Intentionally left without an initializer: reading it below is the whole
// point of the test. Do not "fix" this.
uint undefined; // not initialized
*output = 0x600D;
// Reads the indeterminate value; the report says this branch is taken
// regardless of which literal is used.
if (undefined == 42) // always true using any literal value
*output = 0xBAD2;
}
// BUG 2B: AMD FAIL, NVIDIA FAIL
// Assignments based on uninitialized variables turn
// the entire function into a NOP - it does nothing at all.
// Please make "value used before set" an error, not a warning.
// You don't notice warnings when calling clBuildProgram()
// and what happens here makes no sense.
// Reproducer for BUG 2B: accumulates into `test` from a loop whose counter is
// deliberately uninitialized, then stores the sum to *output.
// output: single uint result slot in global memory.
__kernel void bug2b(__global uint *output)
{
uint test = 0;
// Marker store that precedes any use of the uninitialized counter.
*output = 0xBAD3; // strangely, even this assignment disappears
// The missing initializer on x is intentional; it is what this test
// exercises. Do not add "= 0".
for (uint x; x < 1000; x++) // x not initialized - oops!
test += x;
// Final store of the accumulated value.
*output = test; // this assignment disappears
}
// BUG 3: AMD FAIL, NVIDIA OK
// Trying to unroll a loop where the loop variable is modified
// inside the loop crashes the compiler.
// Reproducer for BUG 3: an unroll request on a loop whose counter is also
// modified in the loop body. Takes no arguments and writes no output; only
// the compile step is of interest.
__kernel void crash1()
{
// Unroll request applied to the loop that follows.
#pragma unroll // AMD: comment out line to avoid compiler crash
// i is incremented both by the for-statement and by the body, so it advances
// by 2 per iteration (0, 2, 4, 6, 8).
for (uint i = 0; i < 10; i++)
i++;
}
// BUG 4: AMD FAIL, NVIDIA OK
// AMD: Reading far outside array bounds causes the compiler to crash
// if also a comparison is done on the value that was read.
// Reproducer for BUG 4.
//
// Stores 0x600D to *output, then sums tiny[0..24] even though tiny has only
// one element. The out-of-bounds reads are INTENTIONAL: together with the
// comparison on x below, they are what the report says crashes the AMD
// compiler. Do not clamp the loop bound or enlarge the array.
//
// The [i] subscript was swallowed by the forum software in the original post
// and is restored here; without it the statement adds an array to a uint
// instead of reading array elements.
//
// output: single uint result slot in global memory.
__kernel void crash2(__global uint *output)
{
    uint tiny[1] = { 0 };
    uint x = 0;

    *output = 0x600D;

    for (uint i = 0; i < 25; i++) // AMD: 24 = OK, 25 = CRASH
        x += tiny[i]; // deliberately reads past the end of tiny for i >= 1

    if (x == 42) // AMD: comment out line to avoid compiler crash
        *output = x; // this line by itself won't cause a crash
}
// BUG 5: AMD FAIL, NVIDIA FAIL
// Putting too much code inside a loop crashes the compiler.
// Also it can get the compiler into an infinite loop or cause
// a situation where the compiler returns, but every call to OpenCL
// functions after that results in CL_INVALID_COMMAND_QUEUE or
// CL_OUT_OF_RESOURCES. I was not able to reproduce the 2nd and 3rd
// cases when writing example code, but this code causes
// the 1st case (crash). Behavior seems the same on AMD and NVIDIA.
// Reproducer template for BUG 5: a single-iteration loop whose body is meant
// to be inflated by pasting the marked line about 2000 times before
// compiling. As written here (one copy of the line) it is only the short
// form of the test.
// output: single uint slot in global memory that receives the final sum.
__kernel void compilercrash(__global uint *output)
{
// i1 = -1 converts to the largest uint value (all bits set); n starts at
// get_global_id(0).
uint i1 = -1, i2 = 0, n = get_global_id(0), sum = 0;
// The loop runs exactly once; it exists so the bulky body sits inside a loop.
for (uint loop = 0; loop < 1; loop++) {
// repeat the "i1 = .." line 2000 times
// not done here to keep example short
// Each copy depends on the previous one through i1, i2 and n.
i1 = i1 ^ i2; i2 += i1; n++; // repeat me
sum += i1 + n;
}
// Storing sum makes the results of the loop body observable.
*output = sum;
}
Oops - I didn't know the forum would mangle the code. But you are right, it was [ i ] that disappeared in the first example. If you already got all the examples running the way they should(n't) then I won't repeat all that code.
I tested the code on the two GPUs listed. I haven't tried it on CPUs.
Thanks - I'm looking forward to the next release.