cancel
Showing results for 
Search instead for 
Did you mean: 

Archives Discussions

lu4
Journeyman III

Seems like an OpenCL compiler bug.

Hello dear ATI  

Short version:

I think i have found a bug in your OpenCL compiler. My kernel contains no errors, compiles and works perfectly on GTX 480. But when using HD 5970, it hangs my application at clBuildProgram call. Stream KernelAnalyzer 1.6 also hangs when trying to analyze it.

 

Long version:

The symptoms:

 

  1. Kernel works perfectly on NVidia GTX 480.
  2. Using HD 5970 my program hangs over the clBuildProgram call.
  3. Kernel argument p is of size 7340032 bytes
  4. Kernel is launched with global work size = 512, local work size = 4
  5. It also hangs the Stream KernelAnalyzer 1.6 when trying to analyze it.

 

To achieve the problem you should uncomment both commented code sections in the lower part of the code.

The kernel will work if you uncomment only one section (no difference first or second)

My configuration

 

  1. Windows 7 32 bit.
  2. Asus HD 5970,
  3. ati-stream-sdk-v2.2-vista-win7-32.exe
  4. 10-9_vista32_win7_32_dd_ccc_enu.exe

 

(Don't know whether it is required) additional info about the kernel:

 

  1. This is ordinary matrix multiplication in the middle stage of development (the kernel makes many small matrices to get multiplied) p->io.l gets multiplied over the p->io.i and the result is put to p->io.t
  2. Matrices consist of 4 rectangles (1 matrix = 2x2 rectangles).
  3. Each rectangle consists of 4 tiles (1 rectangle = 2x2 tiles).
  4. Each tile consists of 16 floats (1 tile = 4x4 floats)
  5. Each tile is processed by the work item
  6. Each rectangle is put in to a local memory

 

 

#define SS (2) #define IR (2) #define IC (2) #define OR (2) #define OC (2) #define Spicies (128) #define Rectangles (1) #define RectangleSize (2) #define DataRecords (128) #define local_work_size (Rectangles << 4) #define global_work_size (Spicies << 4) #define NoF(X) (X) #define RegularF(X) (1.0F / (1.0F + exp(-(X)))) #define NoOffset(X, B) (X) #define RegularOffset(X, B) ((X) + (B)) #define Cast(Object, Type) ((Type)(Object)) typedef struct { int _00, _01, _02, _03, _10, _11, _12, _13, _20, _21, _22, _23, _30, _31, _32, _33; } int4x4; typedef struct { float _00, _01, _02, _03, _10, _11, _12, _13, _20, _21, _22, _23, _30, _31, _32, _33; } float4x4; typedef int4x4 PL[IR][IC][Spicies << 4]; typedef float4x4 PW[OR][OC][Spicies << 4]; typedef struct { float o[DataRecords]; float h[DataRecords]; float l[DataRecords]; float c[DataRecords]; } Data; typedef struct { float4x4 l [SS][IR][Spicies << 2]; float4x4 r [OR][SS][Spicies << 2]; float4x4 o [SS][SS][Spicies << 2]; } GI; typedef struct { float4x4 l [SS][SS][Spicies << 2]; float4x4 r [SS][SS][Spicies << 2]; float4x4 o [SS][SS][Spicies << 2]; } GS; typedef struct { float4x4 l [IR][SS][Spicies << 2]; float4x4 r [SS][OR][Spicies << 2]; float4x4 o [IR][OR][Spicies << 2]; } GN; typedef struct { float4x4 l [OR][SS][Spicies << 2]; float4x4 r [SS][OC][Spicies << 2]; float4x4 o [OR][OC][Spicies << 2]; } GH; typedef struct { float4x4 l [SS][IR][Spicies << 2]; float4x4 i [IR][OR][Spicies << 2]; float4x4 t [SS][OR][Spicies << 2]; float4x4 r [OR][SS][Spicies << 2]; float4x4 o [SS][SS][Spicies << 2]; } PI; typedef struct { float4x4 l [SS][SS][Spicies << 2]; float4x4 i [SS][SS][Spicies << 2]; float4x4 t [SS][SS][Spicies << 2]; float4x4 r [SS][SS][Spicies << 2]; float4x4 o [SS][SS][Spicies << 2]; } PS; typedef struct { float4x4 l [IR][SS][Spicies << 2]; float4x4 i [SS][SS][Spicies << 2]; float4x4 t [IR][SS][Spicies << 2]; float4x4 r [SS][OR][Spicies << 2]; float4x4 o [IR][OR][Spicies << 2]; } PN; typedef struct { float4x4 l [OR][SS][Spicies << 2]; float4x4 i [SS][SS][Spicies << 2]; float4x4 t [OR][SS][Spicies << 2]; float4x4 r [SS][OC][Spicies << 2]; float4x4 o [OR][OC][Spicies << 2]; } PH; typedef struct { PI io; PI ih; PI il; PI ic; PS st; PN nr; PH hb; PH hs; PL lc; PW wb; PW ws; PW wc; } P; __kernel void Kernel(__global P *p) { unsigned int local_id = get_local_id(0); unsigned int global_id = get_global_id(0); unsigned int rectangleLocal = local_id / (RectangleSize * RectangleSize); unsigned int rectangleGlobal = local_id - rectangleLocal * (RectangleSize * RectangleSize); unsigned int rectangleRow = rectangleGlobal / RectangleSize; unsigned int rectangleCol = rectangleGlobal - RectangleSize * rectangleRow; rectangleGlobal = global_id / (RectangleSize * RectangleSize); __local float4x4 U[Rectangles][RectangleSize][RectangleSize]; __local float4x4 V[Rectangles][RectangleSize][RectangleSize]; __local float4x4 *U_flat = Cast(U, __local float4x4*); __local float4x4 *V_flat = Cast(V, __local float4x4*); barrier(CLK_LOCAL_MEM_FENCE); for (int cycle = 0; cycle < 10000; cycle++) { for (int row = 0; row < SS; row++) { for (int col = 0; col < IC; col++) { float4x4 sum = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; for (int component = 0; component < IR; component++) { U_flat[local_id] = p->io.l[row][component][global_id]; V_flat[local_id] = p->io.i[component][col][global_id]; barrier(CLK_LOCAL_MEM_FENCE); for (unsigned int i = 0; i < RectangleSize; i++) { float4x4 u = U[rectangleLocal][rectangleRow]; float4x4 v = V[rectangleLocal][rectangleCol]; sum._00 += u._00 * v._00 + u._01 * v._10 + u._02 * v._20 + u._03 * v._30; sum._01 += u._00 * v._01 + u._01 * v._11 + u._02 * v._21 + u._03 * v._31; sum._02 += u._00 * v._02 + u._01 * v._12 + u._02 * v._22 + u._03 * v._32; sum._03 += u._00 * v._03 + u._01 * v._13 + u._02 * v._23 + u._03 * v._33; sum._10 += u._10 * v._00 + u._11 * v._10 + u._12 * v._20 + u._13 * v._30; sum._11 += u._10 * v._01 + u._11 * v._11 + u._12 * v._21 + u._13 * v._31; sum._12 += u._10 * v._02 + u._11 * v._12 + u._12 * v._22 + u._13 * v._32; sum._13 += u._10 * v._03 + u._11 * v._13 + u._12 * v._23 + u._13 * v._33; sum._20 += u._20 * v._00 + u._21 * v._10 + u._22 * v._20 + u._23 * v._30; sum._21 += u._20 * v._01 + u._21 * v._11 + u._22 * v._21 + u._23 * v._31; sum._22 += u._20 * v._02 + u._21 * v._12 + u._22 * v._22 + u._23 * v._32; sum._23 += u._20 * v._03 + u._21 * v._13 + u._22 * v._23 + u._23 * v._33; sum._30 += u._30 * v._00 + u._31 * v._10 + u._32 * v._20 + u._33 * v._30; sum._31 += u._30 * v._01 + u._31 * v._11 + u._32 * v._21 + u._33 * v._31; sum._32 += u._30 * v._02 + u._31 * v._12 + u._32 * v._22 + u._33 * v._32; sum._33 += u._30 * v._03 + u._31 * v._13 + u._32 * v._23 + u._33 * v._33; } barrier(CLK_LOCAL_MEM_FENCE); } //p->io.t[row][col][global_id]._00 = sum._00; //p->io.t[row][col][global_id]._01 = sum._01; //p->io.t[row][col][global_id]._02 = sum._02; //p->io.t[row][col][global_id]._03 = sum._03; // //p->io.t[row][col][global_id]._10 = sum._10; //p->io.t[row][col][global_id]._11 = sum._11; //p->io.t[row][col][global_id]._12 = sum._12; //p->io.t[row][col][global_id]._13 = sum._13; // //p->io.t[row][col][global_id]._20 = sum._20; //p->io.t[row][col][global_id]._21 = sum._21; //p->io.t[row][col][global_id]._22 = sum._22; //p->io.t[row][col][global_id]._23 = sum._23; // --------------------------------------------------- //p->io.t[row][col][global_id]._30 = sum._30; //p->io.t[row][col][global_id]._31 = sum._31; //p->io.t[row][col][global_id]._32 = sum._32; //p->io.t[row][col][global_id]._33 = sum._33; } } } } ?

0 Likes
5 Replies
lu4
Journeyman III

What is the official way to post a bug to ATI?

0 Likes

Send a test case to streamdeveloper@amd.com showing your issue is the easiest way. Myself and other devs watch the forum and file bugs when we notice them for situations like this.
0 Likes

Ok, thanks

0 Likes

lu4,
I do not have any trouble with this kernel in the upcoming release.
0 Likes

Great thanks!

 

By the way, regarding the problem, I haven't understood whether it was the problem with the compiler which was fixed in the upcoming release, or whether you didn't confirm that the problem existed. Have you tried uncommenting both commented sections in the bottom of the code?

0 Likes