5 Replies Latest reply on Oct 31, 2010 7:47 AM by lu4

    Seems like an OpenCL compiler bug.

    lu4

      Hello dear ATI  

      Short version:

      I think i have found a bug in your OpenCL compiler. My kernel contains no errors, compiles and works perfectly on GTX 480. But when using HD 5970, it hangs my application at clBuildProgram call. Stream KernelAnalyzer 1.6 also hangs when trying to analyze it.

       

      Long version:

      The symptoms:

       

      1. Kernel works perfectly on NVidia GTX 480.
      2. Using HD 5970 my program hangs over the clBuildProgram call.
      3. Kernel argument p is of size 7340032 bytes
      4. Kernel is launched with global work size = 512, local work size = 4
      5. It also hangs the Stream KernelAnalyzer 1.6 when trying to analyze it.

       

      To achieve the problem you should uncomment both commented code sections in the lower part of the code.

      The kernel will work if you uncomment only one section (no difference first or second)

      My configuration

       

      1. Windows 7 32 bit.
      2. Asus HD 5970,
      3. ati-stream-sdk-v2.2-vista-win7-32.exe
      4. 10-9_vista32_win7_32_dd_ccc_enu.exe

       

      (Don't know whether it is required) additional info about the kernel:

       

      1. This is ordinary matrix multiplication in the middle stage of development (the kernel makes many small matrices to get multiplied) p->io.l gets multiplied over the p->io.i and the result is put to p->io.t
      2. Matrices consist of 4 rectangles (1 matrix = 2x2 rectangles).
      3. Each rectangle consists of 4 tiles (1 rectangle = 2x2 tiles).
      4. Each tile consists of 16 floats (1 tile = 4x4 floats)
      5. Each tile is processed by the work item
      6. Each rectangle is put in to a local memory

       

       

      #define SS (2) #define IR (2) #define IC (2) #define OR (2) #define OC (2) #define Spicies (128) #define Rectangles (1) #define RectangleSize (2) #define DataRecords (128) #define local_work_size (Rectangles << 4) #define global_work_size (Spicies << 4) #define NoF(X) (X) #define RegularF(X) (1.0F / (1.0F + exp(-(X)))) #define NoOffset(X, B) (X) #define RegularOffset(X, B) ((X) + (B)) #define Cast(Object, Type) ((Type)(Object)) typedef struct { int _00, _01, _02, _03, _10, _11, _12, _13, _20, _21, _22, _23, _30, _31, _32, _33; } int4x4; typedef struct { float _00, _01, _02, _03, _10, _11, _12, _13, _20, _21, _22, _23, _30, _31, _32, _33; } float4x4; typedef int4x4 PL[IR][IC][Spicies << 4]; typedef float4x4 PW[OR][OC][Spicies << 4]; typedef struct { float o[DataRecords]; float h[DataRecords]; float l[DataRecords]; float c[DataRecords]; } Data; typedef struct { float4x4 l [SS][IR][Spicies << 2]; float4x4 r [OR][SS][Spicies << 2]; float4x4 o [SS][SS][Spicies << 2]; } GI; typedef struct { float4x4 l [SS][SS][Spicies << 2]; float4x4 r [SS][SS][Spicies << 2]; float4x4 o [SS][SS][Spicies << 2]; } GS; typedef struct { float4x4 l [IR][SS][Spicies << 2]; float4x4 r [SS][OR][Spicies << 2]; float4x4 o [IR][OR][Spicies << 2]; } GN; typedef struct { float4x4 l [OR][SS][Spicies << 2]; float4x4 r [SS][OC][Spicies << 2]; float4x4 o [OR][OC][Spicies << 2]; } GH; typedef struct { float4x4 l [SS][IR][Spicies << 2]; float4x4 i [IR][OR][Spicies << 2]; float4x4 t [SS][OR][Spicies << 2]; float4x4 r [OR][SS][Spicies << 2]; float4x4 o [SS][SS][Spicies << 2]; } PI; typedef struct { float4x4 l [SS][SS][Spicies << 2]; float4x4 i [SS][SS][Spicies << 2]; float4x4 t [SS][SS][Spicies << 2]; float4x4 r [SS][SS][Spicies << 2]; float4x4 o [SS][SS][Spicies << 2]; } PS; typedef struct { float4x4 l [IR][SS][Spicies << 2]; float4x4 i [SS][SS][Spicies << 2]; float4x4 t [IR][SS][Spicies << 2]; float4x4 r [SS][OR][Spicies << 2]; float4x4 o [IR][OR][Spicies << 2]; } PN; typedef struct { float4x4 l [OR][SS][Spicies << 2]; float4x4 i [SS][SS][Spicies << 2]; float4x4 t [OR][SS][Spicies << 2]; float4x4 r [SS][OC][Spicies << 2]; float4x4 o [OR][OC][Spicies << 2]; } PH; typedef struct { PI io; PI ih; PI il; PI ic; PS st; PN nr; PH hb; PH hs; PL lc; PW wb; PW ws; PW wc; } P; __kernel void Kernel(__global P *p) { unsigned int local_id = get_local_id(0); unsigned int global_id = get_global_id(0); unsigned int rectangleLocal = local_id / (RectangleSize * RectangleSize); unsigned int rectangleGlobal = local_id - rectangleLocal * (RectangleSize * RectangleSize); unsigned int rectangleRow = rectangleGlobal / RectangleSize; unsigned int rectangleCol = rectangleGlobal - RectangleSize * rectangleRow; rectangleGlobal = global_id / (RectangleSize * RectangleSize); __local float4x4 U[Rectangles][RectangleSize][RectangleSize]; __local float4x4 V[Rectangles][RectangleSize][RectangleSize]; __local float4x4 *U_flat = Cast(U, __local float4x4*); __local float4x4 *V_flat = Cast(V, __local float4x4*); barrier(CLK_LOCAL_MEM_FENCE); for (int cycle = 0; cycle < 10000; cycle++) { for (int row = 0; row < SS; row++) { for (int col = 0; col < IC; col++) { float4x4 sum = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; for (int component = 0; component < IR; component++) { U_flat[local_id] = p->io.l[row][component][global_id]; V_flat[local_id] = p->io.i[component][col][global_id]; barrier(CLK_LOCAL_MEM_FENCE); for (unsigned int i = 0; i < RectangleSize; i++) { float4x4 u = U[rectangleLocal][rectangleRow][i]; float4x4 v = V[rectangleLocal][i][rectangleCol]; sum._00 += u._00 * v._00 + u._01 * v._10 + u._02 * v._20 + u._03 * v._30; sum._01 += u._00 * v._01 + u._01 * v._11 + u._02 * v._21 + u._03 * v._31; sum._02 += u._00 * v._02 + u._01 * v._12 + u._02 * v._22 + u._03 * v._32; sum._03 += u._00 * v._03 + u._01 * v._13 + u._02 * v._23 + u._03 * v._33; sum._10 += u._10 * v._00 + u._11 * v._10 + u._12 * v._20 + u._13 * v._30; sum._11 += u._10 * v._01 + u._11 * v._11 + u._12 * v._21 + u._13 * v._31; sum._12 += u._10 * v._02 + u._11 * v._12 + u._12 * v._22 + u._13 * v._32; sum._13 += u._10 * v._03 + u._11 * v._13 + u._12 * v._23 + u._13 * v._33; sum._20 += u._20 * v._00 + u._21 * v._10 + u._22 * v._20 + u._23 * v._30; sum._21 += u._20 * v._01 + u._21 * v._11 + u._22 * v._21 + u._23 * v._31; sum._22 += u._20 * v._02 + u._21 * v._12 + u._22 * v._22 + u._23 * v._32; sum._23 += u._20 * v._03 + u._21 * v._13 + u._22 * v._23 + u._23 * v._33; sum._30 += u._30 * v._00 + u._31 * v._10 + u._32 * v._20 + u._33 * v._30; sum._31 += u._30 * v._01 + u._31 * v._11 + u._32 * v._21 + u._33 * v._31; sum._32 += u._30 * v._02 + u._31 * v._12 + u._32 * v._22 + u._33 * v._32; sum._33 += u._30 * v._03 + u._31 * v._13 + u._32 * v._23 + u._33 * v._33; } barrier(CLK_LOCAL_MEM_FENCE); } //p->io.t[row][col][global_id]._00 = sum._00; //p->io.t[row][col][global_id]._01 = sum._01; //p->io.t[row][col][global_id]._02 = sum._02; //p->io.t[row][col][global_id]._03 = sum._03; // //p->io.t[row][col][global_id]._10 = sum._10; //p->io.t[row][col][global_id]._11 = sum._11; //p->io.t[row][col][global_id]._12 = sum._12; //p->io.t[row][col][global_id]._13 = sum._13; // //p->io.t[row][col][global_id]._20 = sum._20; //p->io.t[row][col][global_id]._21 = sum._21; //p->io.t[row][col][global_id]._22 = sum._22; //p->io.t[row][col][global_id]._23 = sum._23; // --------------------------------------------------- //p->io.t[row][col][global_id]._30 = sum._30; //p->io.t[row][col][global_id]._31 = sum._31; //p->io.t[row][col][global_id]._32 = sum._32; //p->io.t[row][col][global_id]._33 = sum._33; } } } } ?