Hello dear ATI
Short version:
I think i have found a bug in your OpenCL compiler. My kernel contains no errors, compiles and works perfectly on GTX 480. But when using HD 5970, it hangs my application at clBuildProgram call. Stream KernelAnalyzer 1.6 also hangs when trying to analyze it.
Long version:
The symptoms:
To achieve the problem you should uncomment both commented code sections in the lower part of the code.
The kernel will work if you uncomment only one section (no difference first or second)
My configuration
(Don't know whether it is required) additional info about the kernel:
#define SS (2) #define IR (2) #define IC (2) #define OR (2) #define OC (2) #define Spicies (128) #define Rectangles (1) #define RectangleSize (2) #define DataRecords (128) #define local_work_size (Rectangles << 4) #define global_work_size (Spicies << 4) #define NoF(X) (X) #define RegularF(X) (1.0F / (1.0F + exp(-(X)))) #define NoOffset(X, B) (X) #define RegularOffset(X, B) ((X) + (B)) #define Cast(Object, Type) ((Type)(Object)) typedef struct { int _00, _01, _02, _03, _10, _11, _12, _13, _20, _21, _22, _23, _30, _31, _32, _33; } int4x4; typedef struct { float _00, _01, _02, _03, _10, _11, _12, _13, _20, _21, _22, _23, _30, _31, _32, _33; } float4x4; typedef int4x4 PL[IR][IC][Spicies << 4]; typedef float4x4 PW[OR][OC][Spicies << 4]; typedef struct { float o[DataRecords]; float h[DataRecords]; float l[DataRecords]; float c[DataRecords]; } Data; typedef struct { float4x4 l [SS][IR][Spicies << 2]; float4x4 r [OR][SS][Spicies << 2]; float4x4 o [SS][SS][Spicies << 2]; } GI; typedef struct { float4x4 l [SS][SS][Spicies << 2]; float4x4 r [SS][SS][Spicies << 2]; float4x4 o [SS][SS][Spicies << 2]; } GS; typedef struct { float4x4 l [IR][SS][Spicies << 2]; float4x4 r [SS][OR][Spicies << 2]; float4x4 o [IR][OR][Spicies << 2]; } GN; typedef struct { float4x4 l [OR][SS][Spicies << 2]; float4x4 r [SS][OC][Spicies << 2]; float4x4 o [OR][OC][Spicies << 2]; } GH; typedef struct { float4x4 l [SS][IR][Spicies << 2]; float4x4 i [IR][OR][Spicies << 2]; float4x4 t [SS][OR][Spicies << 2]; float4x4 r [OR][SS][Spicies << 2]; float4x4 o [SS][SS][Spicies << 2]; } PI; typedef struct { float4x4 l [SS][SS][Spicies << 2]; float4x4 i [SS][SS][Spicies << 2]; float4x4 t [SS][SS][Spicies << 2]; float4x4 r [SS][SS][Spicies << 2]; float4x4 o [SS][SS][Spicies << 2]; } PS; typedef struct { float4x4 l [IR][SS][Spicies << 2]; float4x4 i [SS][SS][Spicies << 2]; float4x4 t [IR][SS][Spicies << 2]; float4x4 r [SS][OR][Spicies << 2]; float4x4 o [IR][OR][Spicies << 2]; } PN; typedef struct { float4x4 l [OR][SS][Spicies << 2]; float4x4 i [SS][SS][Spicies << 2]; float4x4 t [OR][SS][Spicies << 2]; float4x4 r [SS][OC][Spicies << 2]; float4x4 o [OR][OC][Spicies << 2]; } PH; typedef struct { PI io; PI ih; PI il; PI ic; PS st; PN nr; PH hb; PH hs; PL lc; PW wb; PW ws; PW wc; } P; __kernel void Kernel(__global P *p) { unsigned int local_id = get_local_id(0); unsigned int global_id = get_global_id(0); unsigned int rectangleLocal = local_id / (RectangleSize * RectangleSize); unsigned int rectangleGlobal = local_id - rectangleLocal * (RectangleSize * RectangleSize); unsigned int rectangleRow = rectangleGlobal / RectangleSize; unsigned int rectangleCol = rectangleGlobal - RectangleSize * rectangleRow; rectangleGlobal = global_id / (RectangleSize * RectangleSize); __local float4x4 U[Rectangles][RectangleSize][RectangleSize]; __local float4x4 V[Rectangles][RectangleSize][RectangleSize]; __local float4x4 *U_flat = Cast(U, __local float4x4*); __local float4x4 *V_flat = Cast(V, __local float4x4*); barrier(CLK_LOCAL_MEM_FENCE); for (int cycle = 0; cycle < 10000; cycle++) { for (int row = 0; row < SS; row++) { for (int col = 0; col < IC; col++) { float4x4 sum = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; for (int component = 0; component < IR; component++) { U_flat[local_id] = p->io.l[row][component][global_id]; V_flat[local_id] = p->io.i[component][col][global_id]; barrier(CLK_LOCAL_MEM_FENCE); for (unsigned int i = 0; i < RectangleSize; i++) { float4x4 u = U[rectangleLocal][rectangleRow]; float4x4 v = V[rectangleLocal][rectangleCol]; sum._00 += u._00 * v._00 + u._01 * v._10 + u._02 * v._20 + u._03 * v._30; sum._01 += u._00 * v._01 + u._01 * v._11 + u._02 * v._21 + u._03 * v._31; sum._02 += u._00 * v._02 + u._01 * v._12 + u._02 * v._22 + u._03 * v._32; sum._03 += u._00 * v._03 + u._01 * v._13 + u._02 * v._23 + u._03 * v._33; sum._10 += u._10 * v._00 + u._11 * v._10 + u._12 * v._20 + u._13 * v._30; sum._11 += u._10 * v._01 + u._11 * v._11 + u._12 * v._21 + u._13 * v._31; sum._12 += u._10 * v._02 + u._11 * v._12 + u._12 * v._22 + u._13 * v._32; sum._13 += u._10 * v._03 + u._11 * v._13 + u._12 * v._23 + u._13 * v._33; sum._20 += u._20 * v._00 + u._21 * v._10 + u._22 * v._20 + u._23 * v._30; sum._21 += u._20 * v._01 + u._21 * v._11 + u._22 * v._21 + u._23 * v._31; sum._22 += u._20 * v._02 + u._21 * v._12 + u._22 * v._22 + u._23 * v._32; sum._23 += u._20 * v._03 + u._21 * v._13 + u._22 * v._23 + u._23 * v._33; sum._30 += u._30 * v._00 + u._31 * v._10 + u._32 * v._20 + u._33 * v._30; sum._31 += u._30 * v._01 + u._31 * v._11 + u._32 * v._21 + u._33 * v._31; sum._32 += u._30 * v._02 + u._31 * v._12 + u._32 * v._22 + u._33 * v._32; sum._33 += u._30 * v._03 + u._31 * v._13 + u._32 * v._23 + u._33 * v._33; } barrier(CLK_LOCAL_MEM_FENCE); } //p->io.t[row][col][global_id]._00 = sum._00; //p->io.t[row][col][global_id]._01 = sum._01; //p->io.t[row][col][global_id]._02 = sum._02; //p->io.t[row][col][global_id]._03 = sum._03; // //p->io.t[row][col][global_id]._10 = sum._10; //p->io.t[row][col][global_id]._11 = sum._11; //p->io.t[row][col][global_id]._12 = sum._12; //p->io.t[row][col][global_id]._13 = sum._13; // //p->io.t[row][col][global_id]._20 = sum._20; //p->io.t[row][col][global_id]._21 = sum._21; //p->io.t[row][col][global_id]._22 = sum._22; //p->io.t[row][col][global_id]._23 = sum._23; // --------------------------------------------------- //p->io.t[row][col][global_id]._30 = sum._30; //p->io.t[row][col][global_id]._31 = sum._31; //p->io.t[row][col][global_id]._32 = sum._32; //p->io.t[row][col][global_id]._33 = sum._33; } } } } ?
What is the official way to post a bug to ATI?
Ok, thanks
Great thanks!
By the way, regarding the problem, I haven't understood whether it was the problem with the compiler which was fixed in the upcoming release, or whether you didn't confirm that the problem existed. Have you tried uncommenting both commented sections in the bottom of the code?