7 Replies Latest reply on May 18, 2010 6:57 PM by omkaranathan

    OpenCL compiler is stuck

    jwbos

      Currently I am working on a project porting some pieces of existing code to OpenCL. I found some strange behaviour and am wondering if anyone else encountered this before.

      When compiling (on the fly) and building my GPU program using clBuildProgram the compiler is stuck in some infinite loop and never returns. When using the clc compiler directly there is no problem.

      The problem seems to be with a while loop in my (large) code. When I simply remove the while loop everything compiles instantly without troubles. The code is quite large, I don't know if this can cause any (additional) problems (binary size restrictions?).

      Any help appriciated! I can attach the code if needed.

        • OpenCL compiler is stuck
          omkaranathan

          Please post your code.

            • OpenCL compiler is stuck
              jwbos

              Thanks for the fast reply. See the code attached.

              If the while-loop at line 490 in the function mul_point is commented out the code compiles fine. Otherwise it is stuck.

              typedef uint u32; typedef struct { u32 x[7]; u32 y[7]; u32 z[7]; } Point; u32 genc (u32 a, u32 b) { return (hadd (a, b) >> 31); } u32 genb (u32 a, u32 b) { return (b > a); } void set (__private u32 *a, __private u32 *b) { a[0] = b[0]; a[1] = b[1]; a[2] = b[2]; a[3] = b[3]; a[4] = b[4]; a[5] = b[5]; a[6] = b[6]; } void mul (__private u32 *c, __private u32 *a, __private u32 *b) { u32 a0,a1,a2,a3,a4,a5,a6; u32 b0,b1,b2,b3,b4,b5,b6; u32 c0,c1,c2,c3,c4,c5,c6; u32 e1,e2,e3,e4,e5,e6,d110,d010; u32 d01,d02,d03,d04,d05,d06; u32 d11,d12,d13,d14,d15,d16,zero=0; a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3]; a4 = a[4]; a5 = a[5]; a6 = a[6]; b0 = b[0]; b1 = b[1]; b2 = b[2]; b3 = b[3]; b4 = b[4]; b5 = b[5]; b6 = b[6]; #define mulADD(d1,d0,a,b,c,extra) \ do { \ u32 _c1; \ d0 = a * b; \ _c1 = d0 + c; \ d1 = mul_hi (a, b) + genc (d0, c) + genc (_c1, extra); \ d0 = _c1 + extra; \ } while (0) #define mulADD2(d1,d0,a,b,c) \ do { \ d0 = a * b; \ d1 = mul_hi (a, b) + genc (d0, c); \ d0 = d0 + c; \ } while (0) #define R(i,j) \ mulADD2 (e1, c##i, a0, b##j, d11); \ mulADD (e2, d11, a1, b##j, e1, d12); \ mulADD (e3, d12, a2, b##j, e2, d13); \ mulADD (e4, d13, a3, b##j, e3, d14); \ mulADD (e5, d14, a4, b##j, e4, d15); \ mulADD (e6, d15, a5, b##j, e5, d16); \ mulADD (d110, d16, a6, b##j, e6, d110); mulADD2 (e1, c0, a0, b0, zero); mulADD2 (e2, d01, a1, b0, e1); mulADD2 (e3, d02, a2, b0, e2); mulADD2 (e4, d03, a3, b0, e3); mulADD2 (e5, d04, a4, b0, e4); mulADD2 (e6, d05, a5, b0, e5); mulADD2 (d010, d06, a6, b0, e6); mulADD2 (e1, c1, a0, b1, d01); mulADD (e2, d11, a1, b1, e1, d02); mulADD (e3, d12, a2, b1, e2, d03); mulADD (e4, d13, a3, b1, e3, d04); mulADD (e5, d14, a4, b1, e4, d05); mulADD (e6, d15, a5, b1, e5, d06); mulADD (d110, d16, a6, b1, e6, d010); R(2,2); R(3,3); R(4,4); R(5,5); R(6,6); c[0] = c0; c[1] = c1; c[2] = c2; c[3] = c3; c[4] = c4; c[5] = c5; c[6] = c6; c[7] = d11; c[8] = d12; c[9] = d13; c[10] = d14; c[11] = d15; c[12] = d16; c[13] = d110; } void reduce (__private u32 *d, __private u32 *c) { u32 a0,a1,a2,a3,a4,a5,a6,a7; u32 b0,b1,b2,b3,b4,b5,b6,b7; u32 c0,c1,c2,c3,c4,c5,c6,c7; u32 carry0,carry1,carry2,carry3,carry4,carry5,carry6,carry7,carry8,carry9; u32 borrow0,borrow1,borrow2,borrow3,borrow4,borrow5,borrow6; u32 mask; a0 = c[0]; a1 = c[1]; a2 = c[2]; a3 = c[3] + c[7] + c[11]; carry1 = genc (c[3], c[7]) + genc (c[3]+c[7], c[11]); a4 = c[4] + c[8] + c[12] + carry1; carry2 = genc (c[4], c[8]) + genc (c[4]+c[8], c[12]) + genc (c[4]+c[8]+c[12], carry1); a5 = c[5] + c[9] + c[13] + carry2; carry3 = genc (c[5], c[9]) + genc (c[5]+c[9], c[13]) + genc (c[5]+c[9]+c[13], carry2); a6 = c[6] + c[10] + carry3; a7 = genc (c[6], c[10]) + genc (c[6]+c[10], carry3); /* Compute s4 + s5 */ b0 = c[7] + c[11]; carry4 = genc (c[7], c[11]); b1 = c[8] + c[12] + carry4; carry5 = genc (c[8], c[12]) + genc (c[8] + c[12], carry4); b2 = c[9] + c[13] + carry5; carry6 = genc (c[9], c[13]) + genc (c[9]+c[13], carry5); b3 = c[10] + carry6; carry7 = genc (c[10], carry6); b4 = c[11] + carry7; carry8 = genc (c[11], carry7); b5 = c[12] + carry8; carry9 = genc (c[12], carry8); b6 = c[13] + carry9; b7 = genc (c[13], carry9); /* Compute 2*P224 - (s4 + s5) */ borrow0 = genb (2, b0); b0 = 2 - b0; borrow1 = genb (0, b1) + genb (0-b1, borrow0); b1 = -b1 - borrow0; borrow2 = genb (0, b2) + genb (0-b2, borrow1); b2 = -b2 - borrow1; borrow3 = genb (0xfffffffe, borrow2) + genb (0xfffffffe - borrow2, b3); b3 = 0xfffffffe - borrow2 - b3; carry0 = 0xffffffff - b4; borrow4 = genb (carry0, borrow3); b4 = carry0 - borrow3; carry1 = 0xffffffff - b5; borrow5 = genb (carry1, borrow4); b5 = carry1 - borrow4; carry2 = 0xffffffff - b6; borrow6 = genb (carry2, borrow5); b6 = carry2 - borrow5; b7 = 1 - b7 - borrow6; /* Compute (s1 + s2 + s3) + (2*P224 - (s4 + s5)) */ c0 = a0 + b0; carry0 = genc (a0, b0); c1 = a1 + b1 + carry0; carry1 = genc (a1, b1) + genc (a1+b1, carry0); c2 = a2 + b2 + carry1; carry2 = genc (a2, b2) + genc (a2+b2, carry1); c3 = a3 + b3 + carry2; carry3 = genc (a3, b3) + genc (a3+b3, carry2); c4 = a4 + b4 + carry3; carry4 = genc (a4, b4) + genc (a4+b4, carry3); c5 = a5 + b5 + carry4; carry5 = genc (a5, b5) + genc (a5+b5, carry4); c6 = a6 + b6 + carry5; carry6 = genc (a6, b6) + genc (a6+b6, carry5); c7 = a7 + b7 + carry6; mask = (c7 == 0) - 1; a0 = c0 - c7; borrow0 = genb (c0, c7); a1 = c1 - borrow0; borrow1 = genb (c1, borrow0); a2 = c2 - borrow1; borrow2 = genb (c2, borrow1); b3 = (0-c7) & mask; a3 = c3 - b3 - borrow2; borrow3 = genb (c3, b3) + genb (c3-b3, borrow2); b4 = 0xFFFFFFFF & mask; a4 = c4 - b4 - borrow3; borrow4 = genb (c4, b4) + genb (c4-b4, borrow3); a5 = c5 - b4 - borrow4; borrow5 = genb (c5, b4) + genb (c5-b4, borrow4); a6 = c6 - b4 - borrow5; borrow6 = genb (c6, b4) + genb (c6-b4, borrow5); b7 = (c7-1) & mask; a7 = c7 - b7 - borrow6; mask = (a7 == 0) - 1; d[0] = a0 - a7; borrow0 = genb (a0, a7); d[1] = a1 - borrow0; borrow1 = genb (a1, borrow0); d[2] = a2 - borrow1; borrow2 = genb (a2, borrow1); b3 = (0-a7) & mask; d[3] = a3 - b3 - borrow2; borrow3 = genb (a3, b3) + genb (a3-b3, borrow2); b4 = 0xFFFFFFFF & mask; d[4] = a4 - b4 - borrow3; borrow4 = genb (a4, b4) + genb (a4-b4, borrow3); d[5] = a5 - b4 - borrow4; borrow5 = genb (a5, b4) + genb (a5-b4, borrow4); d[6] = a6 - b4 - borrow5; } void multiply (__private u32 *c, __private u32 *a, __private u32 *b) { __private u32 d[14]; mul (d, a, b); reduce (c, d); } #define square(c, a) multiply (c, a, a) void add (__private u32 *c, __private u32 *a, __private u32 *b) { u32 carry1, carry2, mask, borrow1, borrow2, t0, t1; carry1 = genc (a[0], b[0]); c[0] = a[0] + b[0]; carry2 = genc (a[1], b[1]) + genc (a[1] + b[1], carry1); c[1] = a[1] + b[1] + carry1; carry1 = genc (a[2], b[2]) + genc (a[2] + b[2], carry2); c[2] = a[2] + b[2] + carry2; carry2 = genc (a[3], b[3]) + genc (a[3] + b[3], carry1); c[3] = a[3] + b[3] + carry1; carry1 = genc (a[4], b[4]) + genc (a[4] + b[4], carry2); c[4] = a[4] + b[4] + carry2; carry2 = genc (a[5], b[5]) + genc (a[5] + b[5], carry1); c[5] = a[5] + b[5] + carry1; carry1 = genc (a[6], b[6]) + genc (a[6] + b[6], carry2); c[6] = a[6] + b[6] + carry2; mask = (~carry1)+1; t1 = 1 & mask; borrow1 = genb (c[0], t1); c[0] = c[0] - t1; borrow2 = genb (c[1], borrow1); c[1] = c[1] - borrow1; borrow1 = genb (c[2], borrow2); c[2] = c[2] - borrow2; t0 = 0xFFFFFFFF & mask; borrow2 = genb (c[3], t0) + genb (c[3] - t0, borrow1); c[3] = c[3] - t0 - borrow1; borrow1 = genb (c[4], t0) + genb (c[4] - t0, borrow2); c[4] = c[4] - t0 - borrow1; borrow2 = genb (c[5], t0) + genb (c[5] - t0, borrow1); c[5] = c[5] - t0 - borrow1; borrow1 = genb (c[6], t0) + genb (c[6] - t0, borrow2); c[6] = c[6] - t0 - borrow1; } void sub (__private u32 *c, __private u32 *b, __private u32 *a) { u32 borrow1, borrow2, mask, t0, t1; borrow1 = genb (b[0], a[0]); c[0] = b[0] - a[0]; borrow2 = genb (b[1], a[1]) + genb (b[1] - a[1], borrow1); c[1] = b[1] - a[1] - borrow1; borrow1 = genb (b[2], a[2]) + genb (b[2] - a[2], borrow2); c[2] = b[2] - a[2] - borrow2; borrow2 = genb (b[3], a[3]) + genb (b[3] - a[3], borrow1); c[3] = b[3] - a[3] - borrow1; borrow1 = genb (b[4], a[4]) + genb (b[4] - a[4], borrow2); c[4] = b[4] - a[4] - borrow2; borrow2 = genb (b[5], a[5]) + genb (b[5] - a[5], borrow1); c[5] = b[5] - a[5] - borrow1; borrow1 = genb (b[6], a[6]) + genb (b[6] - a[6], borrow2); c[6] = b[6] - a[6] - borrow2; mask = (~borrow1)+1; t0 = 1 & mask; borrow1 = genc (c[0], t0); c[0] = c[0] + t0; borrow2 = genc (c[1], borrow1); c[1] = c[1] + borrow1; borrow1 = genc (c[2], borrow2); c[2] = c[2] + borrow2; t1 = 0xFFFFFFFF & mask; borrow2 = genc (c[3], t1) + genc (c[3]+t1, borrow1); c[3] = c[3] + t1 + borrow1; borrow1 = genc (c[4], t1) + genc (c[4]+t1, borrow2); c[4] = c[4] + t1 + borrow2; borrow2 = genc (c[5], t1) + genc (c[5]+t1, borrow1); c[5] = c[5] + t1 + borrow1; borrow1 = genc (c[6], t1) + genc (c[6]+t1, borrow2); c[6] = c[6] + t1 + borrow2; } void div2_8 (__private u32 *b, __private u32 *a) { b[0] = (a[0] >> 1) | (a[1] << 31); b[1] = (a[1] >> 1) | (a[2] << 31); b[2] = (a[2] >> 1) | (a[3] << 31); b[3] = (a[3] >> 1) | (a[4] << 31); b[4] = (a[4] >> 1) | (a[5] << 31); b[5] = (a[5] >> 1) | (a[6] << 31); b[6] = (a[6] >> 1) | (a[7] << 31); } void div2_7 (__private u32 *b, __private u32 *a) { b[0] = (a[0] >> 1) | (a[1] << 31); b[1] = (a[1] >> 1) | (a[2] << 31); b[2] = (a[2] >> 1) | (a[3] << 31); b[3] = (a[3] >> 1) | (a[4] << 31); b[4] = (a[4] >> 1) | (a[5] << 31); b[5] = (a[5] >> 1) | (a[6] << 31); b[6] = (a[6] >> 1); } /*****************************************************************************/ /* Curve arithmetic */ /*****************************************************************************/ void add_point (__private Point *P3, __private Point *P1, __private Point *P2, __private u32 *p) { __private u32 t0[7], t1[7], Z1Z1[7], Z2Z2[7], U1[7], U2[7], S1[7], S2[7], H[7], I[7], r[7], J[7], V[7]; square (Z1Z1, P1->z); square (Z2Z2, P2->z); multiply (U1, P1->x, Z2Z2); multiply (U2, P2->x, Z1Z1); multiply (S1, P1->y, P2->z); multiply (S1, S1, Z2Z2); multiply (S2, P2->y, P1->z); multiply (S2, S2, Z1Z1); sub (H, U2, U1); add (I, H, H); // I:=(2*H)^2; square (I, I); multiply (J, H, I); sub (r, S2, S1); add (r, r, r); // r:=2*(S2-S1); multiply (V, U1, I); // x3:=r^2-J-2*V; square (t0, r); add (t1, V, V); sub (P3->x, t0, J); sub (P3->x, P3->x, t1); // Y3:=r*(V-X3)-2*S1*J; sub (t0, V, P3->x); multiply (r, r, t0); multiply (t1, S1, J); add (t1, t1, t1); sub (P3->y, r, t1); // Z3:=((Z1+Z2)^2-Z1Z1-Z2Z2)*H; add (t0, P1->z, P2->z); square (t0, t0); sub (t0, t0, Z1Z1); sub (t0, t0, Z2Z2); multiply (P3->z, t0, H); } #define mul8(b,a) \ do { \ u32 T[7]; \ add (T, a, a); \ add (T, T, T); \ add (b, T, T); \ } while (0) #define mul4(b,a) \ do { \ u32 T[7]; \ add (T, a, a); \ add (b, T, T); \ } while (0) void double_point (__private Point *P3, __private Point *P1, __private u32 * p) { __private u32 alpha[7], beta[7], gamma[7], delta[7], t0[7], t1[7]; square (delta, P1->z); square (gamma, P1->y); multiply (beta, P1->x, gamma); // alpha:=3*(X1-delta)*(X1+delta); sub (t0, P1->x, delta); add (t1, P1->x, delta); multiply (t0, t0, t1); add (alpha, t0, t0); add (alpha, alpha, t0); // X3:=alpha^2-8*beta; square (t0, alpha); mul8 (t1, beta); sub (P3->x, t0, t1); // Z3:=(Y1+Z1)^2-gamma-delta; add (P3->z, P1->y, P1->z); square (P3->z, P3->z); sub (P3->z, P3->z, gamma); sub (P3->z, P3->z, delta); // Y3:=alpha*(4*beta-X3)-8*gamma^2; mul4 (t0, beta); sub (t0, t0, P3->x); multiply (t0, t0, alpha); square (t1, gamma); mul8 (t1, t1); sub (P3->y, t0, t1); } void mul_point (__private Point *P3, __private Point *P, __private u32 *c, __private u32 *p) { u32 first = 1; __private u32 k[7], k2[7]; __private Point P1; set (P1.x, P->x); set (P1.y, P->y); set (k, c); set (k2, c); /* If this while loop is removed the code compiles. */ while ((k[0]|k[1]|k[2]|k[3]|k[4]|k[5]|k[6]) != 0) { if ((k[0] & 0x1) == 1) { if (first) { first = 0; set (P3->x, P1.x); set (P3->y, P1.y); } else { add_point (P3, &P1, P3, p); } } double_point (&P1, &P1, p); div2_7 (k, k); //if (((k[0]-k2[0])|(k[1]-k2[1])|(k[2]-k2[2])|(k[3]-k2[3])|(k[4]-k2[4])|(k[5]-k2[5])|(k[6]-k2[6])) == 0) { // break; //} set (k2, k); } } __kernel void templateKernel(__global unsigned int * output) { __private u32 p[7], scalar[7]; __private Point G, P; p[0] = 0x00000001; p[1] = 0x00000000; p[2] = 0x00000000; p[3] = 0xFFFFFFFF; p[4] = 0xFFFFFFFF; p[5] = 0xFFFFFFFF; p[6] = 0xFFFFFFFF; G.x[0] = 0x115c1d21; G.x[1] = 0x343280d6; G.x[2] = 0x56c21122; G.x[3] = 0x4a03c1d3; G.x[4] = 0x321390b9; G.x[5] = 0x6bb4bf7f; G.x[6] = 0xb70e0cbd; G.y[0] = 0x85007e34; G.y[1] = 0x44d58199; G.y[2] = 0x5a074764; G.y[3] = 0xcd4375a0; G.y[4] = 0x4c22dfe6; G.y[5] = 0xb5f723fb; G.y[6] = 0xbd376388; scalar[0] = 0; //get_global_id (0); scalar[1] = 0; scalar[2] = 0; scalar[3] = 0; scalar[4] = 0; scalar[5] = 0; scalar[6] = 0; G.z[0] = 1; G.z[1] = 0; G.z[2] = 0; G.z[3] = 0; G.z[4] = 0; G.z[5] = 0; G.z[6] = 0; mul_point (&P, &G, scalar, p); output[get_global_id(0)] = P.x[0]; }