cancel
Showing results for 
Search instead for 
Did you mean: 

Archives Discussions

gale6k
Journeyman III

Opencl develop: Different results between AMD and Nvidea devices.

My kernel works well on Nvidea GTX 560 GPU device but works error on AMD A7970 device. The source code list bellow.

A7970's outputs is 0xd7 eb 6a d7 d7 eb 6a d7, but GTX560's is 0xd7 eb 6a d7 05 b5 30 ad.

Where the problem appear. :confused:

// kernel source code -------------------------------------------

typedef union USHA1_type

{

unsigned int sha1uint[5];

unsigned char sha1uchar[20];

}USHA1_t;

inline uint SWAP32(uint x)

{

x = rotate(x, 16U);

return ((x & 0x00FF00FF) << 😎 + ((x >> 😎 & 0x00FF00FF);

}

//sha1 ==================

#define K1 0x5A827999

#define K2 0x6ED9EBA1

#define K3 0x8F1BBCDC

#define K4 0xCA62C1D6

#define H1 0x67452301

#define H2 0xEFCDAB89

#define H3 0x98BADCFE

#define H4 0x10325476

#define H5 0xC3D2E1F0

#define F1(x,y,z) (z ^ (x & (y ^ z)))

#define F2(x,y,z) (x ^ y ^ z)

#define F3(x,y,z) ((x & y) | (z & (x | y)))

#define F4(x,y,z) (x ^ y ^ z)

#define R(t) (temp = W[(t - 3) & 0x0F] ^ W[(t - 😎 & 0x0F] ^ W[(t - 14) & 0x0F] ^ W[t & 0x0F], ( W[t & 0x0F] = rotate((int)temp,1) ) )

#define P1(a,b,c,d,e,x) \

{ \

e += rotate((int)a,5) + F1(b,c,d) + K1 + x; b = rotate((int)b,30);\

}

#define P2(a,b,c,d,e,x) \

{ \

e += rotate((int)a,5) + F2(b,c,d) + K2 + x; b = rotate((int)b,30);\

}

#define P3(a,b,c,d,e,x) \

{ \

e += rotate((int)a,5) + F3(b,c,d) + K3 + x; b = rotate((int)b,30);\

}

#define P4(a,b,c,d,e,x) \

{ \

e += rotate((int)a,5) + F4(b,c,d) + K4 + x; b = rotate((int)b,30);\

}

//1-63 BYTES sha1

inline void sha1_crypt(__private unsigned char *plain, unsigned int plainlen, __private unsigned int *digest)

{

int t;

int stop, mmod;

unsigned int i, ulen;

unsigned int W[16] = {0};

unsigned int temp, A,B,C,D,E;

A = H1;

B = H2;

C = H3;

D = H4;

E = H5;

for (t = 1; t < 15; t++)

{

W = 0x00000000;

}

i = plainlen;

stop = i / 4 ;

for (t = 0 ; t < stop ; t++){

W = ((unsigned char) plain[t * 4]) << 24;

W |= ((unsigned char) plain[t * 4 + 1]) << 16;

W |= ((unsigned char) plain[t * 4 + 2]) << 8;

W |= (unsigned char) plain[t * 4 + 3];

}

mmod = i % 4;

if ( mmod == 3){

W = ((unsigned char) plain[t * 4]) << 24;

W |= ((unsigned char) plain[t * 4 + 1]) << 16;

W |= ((unsigned char) plain[t * 4 + 2]) << 8;

W |= ((unsigned char) 0x80) ;

} else if (mmod == 2) {

W = ((unsigned char) plain[t * 4]) << 24;

W |= ((unsigned char) plain[t * 4 + 1]) << 16;

W |= 0x8000 ;

} else if (mmod == 1) {

W = ((unsigned char) plain[t * 4]) << 24;

W |= 0x800000 ;

} else /*if (mmod == 0)*/ {

W = 0x80000000 ;

}

ulen = (i * 😎 & 0xFFFFFFFF;

W[15] = ulen ;

P1( A, B, C, D, E, W[0] );

P1( E, A, B, C, D, W[1] );

P1( D, E, A, B, C, W[2] );

P1( C, D, E, A, B, W[3] );

P1( B, C, D, E, A, W[4] );

P1( A, B, C, D, E, W[5] );

P1( E, A, B, C, D, W[6] );

P1( D, E, A, B, C, W[7] );

P1( C, D, E, A, B, W[8] );

P1( B, C, D, E, A, W[9] );

P1( A, B, C, D, E, W[10] );

P1( E, A, B, C, D, W[11] );

P1( D, E, A, B, C, W[12] );

P1( C, D, E, A, B, W[13] );

P1( B, C, D, E, A, W[14] );

P1( A, B, C, D, E, W[15] );

P1( E, A, B, C, D, R(16) );

P1( D, E, A, B, C, R(17) );

P1( C, D, E, A, B, R(18) );

P1( B, C, D, E, A, R(19) );

P2( A, B, C, D, E, R(20) );

P2( E, A, B, C, D, R(21) );

P2( D, E, A, B, C, R(22) );

P2( C, D, E, A, B, R(23) );

P2( B, C, D, E, A, R(24) );

P2( A, B, C, D, E, R(25) );

P2( E, A, B, C, D, R(26) );

P2( D, E, A, B, C, R(27) );

P2( C, D, E, A, B, R(28) );

P2( B, C, D, E, A, R(29) );

P2( A, B, C, D, E, R(30) );

P2( E, A, B, C, D, R(31) );

P2( D, E, A, B, C, R(32) );

P2( C, D, E, A, B, R(33) );

P2( B, C, D, E, A, R(34) );

P2( A, B, C, D, E, R(35) );

P2( E, A, B, C, D, R(36) );

P2( D, E, A, B, C, R(37) );

P2( C, D, E, A, B, R(38) );

P2( B, C, D, E, A, R(39) );

P3( A, B, C, D, E, R(40) );

P3( E, A, B, C, D, R(41) );

P3( D, E, A, B, C, R(42) );

P3( C, D, E, A, B, R(43) );

P3( B, C, D, E, A, R(44) );

P3( A, B, C, D, E, R(45) );

P3( E, A, B, C, D, R(46) );

P3( D, E, A, B, C, R(47) );

P3( C, D, E, A, B, R(48) );

P3( B, C, D, E, A, R(49) );

P3( A, B, C, D, E, R(50) );

P3( E, A, B, C, D, R(51) );

P3( D, E, A, B, C, R(52) );

P3( C, D, E, A, B, R(53) );

P3( B, C, D, E, A, R(54) );

P3( A, B, C, D, E, R(55) );

P3( E, A, B, C, D, R(56) );

P3( D, E, A, B, C, R(57) );

P3( C, D, E, A, B, R(58) );

P3( B, C, D, E, A, R(59) );

P4( A, B, C, D, E, R(60) );

P4( E, A, B, C, D, R(61) );

P4( D, E, A, B, C, R(62) );

P4( C, D, E, A, B, R(63) );

P4( B, C, D, E, A, R(64) );

P4( A, B, C, D, E, R(65) );

P4( E, A, B, C, D, R(66) );

P4( D, E, A, B, C, R(67) );

P4( C, D, E, A, B, R(68) );

P4( B, C, D, E, A, R(69) );

P4( A, B, C, D, E, R(70) );

P4( E, A, B, C, D, R(71) );

P4( D, E, A, B, C, R(72) );

P4( C, D, E, A, B, R(73) );

P4( B, C, D, E, A, R(74) );

P4( A, B, C, D, E, R(75) );

P4( E, A, B, C, D, R(76) );

P4( D, E, A, B, C, R(77) );

P4( C, D, E, A, B, R(78) );

P4( B, C, D, E, A, R(79) );

digest[0] = SWAP32(A + H1);

digest[1] = SWAP32(B + H2);

digest[2] = SWAP32(C + H3);

digest[3] = SWAP32(D + H4);

digest[4] = SWAP32(E + H5);

}

__kernel void test_sha1_kernel(__global unsigned int* gout)

{

unsigned int id = get_global_id(0);

unsigned int i = 0;

//two input data

unsigned char InData1[16] = {0};

unsigned char InData2[16] = {0};

//two calout data

USHA1_t sha1out1;

USHA1_t sha1out2;

//init data

for(i = 0; i < 5; i ++)

{

sha1out1.sha1uint = 0;

sha1out2.sha1uint = 0;

}

for(i = 0; i < 16; i ++)

{

InData1 = 0x03;

InData2 = 0x38;

}

//two out temp

unsigned char out1[4] = {0};

unsigned char out2[4] = {0};

for(i = 0; i < 4; i++)

{

out1 = 0;

out2 = 0;

}

//cal 1

unsigned int *psha1out1 = (unsigned int *)(sha1out1.sha1uint);

sha1_crypt(InData1, 8, psha1out1);

sha1_crypt(InData1, 8, psha1out1);

//save output1

for(i = 0; i < 4; i++)

{

out1 = sha1out1.sha1uchar;

}

//cal 2

unsigned int *psha1out2 = (unsigned int *)(sha1out2.sha1uint);

sha1_crypt(InData2, 8, psha1out2);

//save output2

for(i = 0; i < 4; i++)

{

out2 = sha1out2.sha1uchar;

}

//out to cpu

if(id == 0)

{

gout[0] = (unsigned int)out1[0];

gout[1] = (unsigned int)out1[1];

gout[2] = (unsigned int)out1[2];

gout[3] = (unsigned int)out1[3];

gout[4] = (unsigned int)out2[0];

gout[5] = (unsigned int)out2[1];

gout[6] = (unsigned int)out2[2];

gout[7] = (unsigned int)out2[3];

}

}

0 Likes
0 Replies