Here is the OpenCL (I've marked the statements that seem to cause the issue - lines 8 and 21):
(If I were to change tempint on those lines to any literal uint the kernel compiles fine - madness)
// Adds two 128-bit integers stored as 4 x 32-bit limbs: res = a + b.
// Limb 0 is the least significant (the carry ripples from index 0 upwards).
//   res : output, 4 limbs
//   a,b : inputs, 4 limbs each
// Returns the carry out of the most significant limb (0 or 1).
uint wide_add_vector(uint* res, const uint* a, const uint* b)
{
    ulong carry=0;
    #pragma unroll
    for(uint i=0;i<4;i++){
        // Widen to 64 bits so the per-limb sum (plus carry) cannot overflow.
        // The limbs must be indexed with [i]; adding the bare pointers is not
        // limb arithmetic.
        ulong tmp=(ulong)(a[i])+b[i]+carry;
        uint tempint = (uint)(tmp&0xFFFFFFFF);
        res[i] = tempint; // <---- Problem statement
        carry=tmp>>32;
    }
    return (uint)carry;
}
// Adds a 32-bit scalar to a 128-bit integer stored as 4 x 32-bit limbs:
// res = a + b. Limb 0 is the least significant.
//   res : output, 4 limbs
//   a   : input, 4 limbs
//   b   : scalar addend, injected as the initial carry into limb 0
// Returns the carry out of the most significant limb (0 or 1).
uint wide_add_scalar(uint* res, const uint* a, uint b)
{
    ulong carry=b;
    #pragma unroll
    for(uint i=0;i<4;i++){
        // 64-bit sum of one limb plus the incoming carry; the limb must be
        // read with a[i], not via the bare pointer.
        ulong tmp=a[i]+carry;
        uint tempint = (uint)(tmp&0xFFFFFFFF);
        res[i] = tempint; // <---- Problem statement
        carry=tmp>>32;
    }
    return (uint)carry;
}
// Schoolbook multiplication of two 128-bit integers (4 x 32-bit limbs each,
// limb 0 least significant) into a 256-bit product.
//   res_hi : output, upper 4 limbs of the product
//   res_lo : output, lower 4 limbs of the product
//   a, b   : inputs, 4 limbs each
// Outputs are written while the inputs are still being read, so res_hi and
// res_lo must not overlap a or b.
void wide_mul(uint* res_hi, uint* res_lo, const uint* a, const uint* b)
{
    // acc holds the running column sum; carry counts 64-bit overflows of acc.
    ulong carry=0, acc=0;

    // Columns 0..3: all partial products a[j]*b[i-j] with j+(i-j)==i.
    #pragma unroll
    for(uint i=0; i<4; i++){
        #pragma unroll
        for(uint j=0; j<=i; j++){
            ulong tmp=(ulong)(a[j])*b[i-j];
            acc+=tmp;
            carry+=(acc < tmp);   // unsigned wrap-around => one overflow
        }
        res_lo[i]=(uint)(acc&0xFFFFFFFF);
        // Shift the accumulator down one limb, folding the overflow count in.
        acc= (carry<<32) | (acc>>32);
        carry=carry>>32;
    }

    // Columns 4..6: partial products a[j]*b[(i+3)-j] for j=i..3.
    #pragma unroll
    for(uint i=1; i<4; i++){
        #pragma unroll
        for(uint j=i; j<4; j++){
            ulong tmp=(ulong)(a[j])*b[4-j+i-1];
            acc+=tmp;
            carry+=(acc < tmp);
        }
        res_hi[i-1]=(uint)(acc&0xFFFFFFFF);
        acc= (carry<<32) | (acc>>32);
        carry=carry>>32;
    }
    // Whatever remains in the accumulator is the top limb.
    res_hi[3]=(uint)acc;
}
// Copies an 8-limb (8 x 32-bit) value from a into the __global buffer res.
//   res : destination, at least 8 uints
//   a   : source, 8 uints
void wide_copy_global(__global uint *res, const uint *a)
{
    #pragma unroll
    for(uint i=0;i<8;i++){
        // Element-wise copy; assigning the bare pointers would only rebind the
        // local parameter and write nothing to the destination buffer.
        res[i]=a[i];
    }
}
// Kernel entry point. Each work-item seeds an 8-limb (8 x 32-bit) state from
// its global id and the round parameters, applies hashSteps rounds of a
// multiply-and-fold step, and stores the resulting 8 limbs at
// proofBuffer[8*id .. 8*id+7].
__kernel void bitecoin_miner(ulong roundId,ulong roundSalt,ulong chainHash, uint4 c, uint hashSteps, __global uint* proofBuffer)
{
    uint gid = get_global_id(0);

    // Unpack the vector multiplier into a plain limb array for the wide_* helpers.
    uint multiplier[4] = {c.x, c.y, c.z, c.w};

    // NOTE(review): each 64-bit argument is truncated with (uint) and stored in
    // two adjacent limbs, so both limbs hold the same low 32 bits. Confirm
    // whether the second limb of each pair was meant to be the high word.
    uint state[8] = {
        gid,             0,
        (uint)roundId,   (uint)roundId,
        (uint)roundSalt, (uint)roundSalt,
        (uint)chainHash, (uint)chainHash
    };

    for (uint step = 0; step < hashSteps; step++) {
        uint product[8];
        // product = lo(state) * multiplier   (low limbs in [0..3], high in [4..7])
        wide_mul(product + 4, product, state, multiplier);
        // lo(state) = lo(product) + hi(state), remembering the carry out
        uint carryOut = wide_add_vector(state, product, state + 4);
        // hi(state) = hi(product) + carry
        wide_add_scalar(state + 4, product + 4, carryOut);
    }

    wide_copy_global(proofBuffer + 8*gid, state);
}
When run I get:
LogLevel = 2 -> 2
[MyClient], 1395075385.62, 2, Created log.
Will try to connect to address Minty at port 4000
Found 1 platforms
Platform 0 : Advanced Micro Devices, Inc.
Choosing platform 0
Found 2 devices
Device 0 : Tahiti
Device 1 : Intel(R) Core(TM) i7-4770K CPU @ 3.50GHz
Choosing device 0
LLVM ERROR: Cannot select: 0x855acbc3a0: i32 = setcc 0x855acbcca0, 0x855ac3a080, 0x855ac3a480 [ORD=52] [ID=30]
0x855acbcca0: i64 = add 0x855ac3a080, 0x855ac3aa80 [ORD=49] [ID=28]
0x855ac3a080: i64,ch = CopyFromReg 0x855ac2b1d0, 0x855ac3a680 [ORD=49] [ID=19]
0x855ac3a680: i64 = Register %vreg33 [ORD=49] [ID=7]
0x855ac3aa80: i64 = mul 0x855acbcda0, 0x855ac37450 [ORD=48] [ID=27]
0x855acbcda0: i64,ch = load 0x855ac2b1d0, 0x855ac37250, 0x855ac3a380<LD4[%scevgep106], zext from i32> [ORD=47] [ID=26]
0x855ac37250: i32 = add 0x855ac36640, 0x855ac38960 [ORD=45] [ID=25]
0x855ac36640: i32 = sub 0x855ac37850, 0x855ac37050 [ORD=44] [ID=24]
0x855ac37850: i32 = FrameIndex<0> [ORD=41] [ID=1]
0x855ac37050: i32 = shl 0x855acbbc90, 0x855ac3a980 [ORD=44] [ID=23]
0x855acbbc90: i32,ch = CopyFromReg 0x855ac2b1d0, 0x855ac36940 [ORD=43] [ID=18]
0x855ac36940: i32 = Register %vreg30 [ORD=43] [ID=3]
0x855ac3a980: i32 = Constant<2> [ORD=44] [ID=4]
0x855ac38960: i32 = Constant<8> [ORD=45] [ID=5]
0x855ac3a380: i32 = undef [ORD=46] [ID=6]
0x855ac37450: i64 = zero_extend 0x855acbbd90 [ORD=42] [ID=21]
0x855acbbd90: i32,ch = CopyFromReg 0x855ac2b1d0, 0x855acbba90 [ORD=42] [ID=17]
0x855acbba90: i32 = Register %vreg31 [ORD=42] [ID=2]
0x855ac3a080: i64,ch = CopyFromReg 0x855ac2b1d0, 0x855ac3a680 [ORD=49] [ID=19]
0x855ac3a680: i64 = Register %vreg33 [ORD=49] [ID=7]
In function: __OpenCL_bitecoin_miner_kernel
Press any key to continue . . .
If I put it into Kernel Analyzer it just freezes.
Any ideas?
The system is:
Windows 8.1 64-bit, Visual Studio 2013
HD7970 Driver Version 13.350.1005.0
Catalyst 14.2
AMD APP SDK 2.9
Many Thanks
Henry
Solved! Go to Solution.
Hi henry931,
Your issue is not clear from your post. Could you please share your host code with us, so that we can reproduce the issue here and keep you updated?
Thanks,
AMD_Support
Hi henry931,
Your issue is not clear from your post. Could you please share your host code with us, so that we can reproduce the issue here and keep you updated?
Thanks,
AMD_Support
Sorry for the late reply. Anyway, I got it to compile (and work) by disabling optimisations.
This was for a piece of coursework which we have since completed so it doesn't matter to me anymore...
but if you want the whole source it is on this branch: