Dear,
I am trying to porting the openCL source code to AMD old gpu card: Rx570 (4G)。The source code can work correctly on Nvidia cards, but it failed on Rx570 card.
The clinfo is attached.
The kernel source code is as follows:
__kernel void G1_bellman_multiexp(
__global G1_affine *bases,
__global G1_projective *buckets,
__global G1_projective *results,
__global Exp *exps,
uint n,
uint num_groups,
uint num_windows,
uint window_size) {
// We have `num_windows` * `num_groups` threads per multiexp.
uint gid = get_global_id(0);
if(gid >= num_windows * num_groups) return;
// We have (2^window_size - 1) buckets.
uint bucket_len = ((1 << window_size) - 1);
// Each thread has its own set of buckets in global memory.
buckets += bucket_len * gid;
for(uint i = 0; i < bucket_len; i++) buckets[i] = G1_ZERO;
uint len = (uint)ceil(n / (float)num_groups); // Num of elements in each group
// This thread runs the multiexp algorithm on elements from `nstart` to `nened`
// on the window [`bits`, `bits` + `w`)
uint nstart = len * (gid / num_windows);
uint nend = min(nstart + len, n);
uint bits = (gid % num_windows) * window_size;
ushort w = min((ushort)window_size, (ushort)(Exp_BITS - bits));
//results[gid] = G1_ZERO;
G1_projective res = G1_ZERO;
for(uint i = nstart; i < nend; i++) {
uint ind = Exp_get_bits(exps[i], bits, w);
// Special case where it is faster to add the base into `res` instead of
// `bucket[0]`.
if(ind == 1) {
res = G1_add_mixed(res, bases[i]);
} else if(ind--) buckets[ind] = G1_add_mixed(buckets[ind], bases[i]);
}
// Summation by parts
// e.g. 3a + 2b + 1c = a +
// (a) + b +
// ((a) + b) + c
G1_projective acc = G1_ZERO;
for(int j = bucket_len - 1; j >= 0; j--) {
acc = G1_add(acc, buckets[j]);
res = G1_add(res, acc);
}
results[gid] = res;
}
G1_projective G1_add_mixed(G1_projective a, G1_affine b) {
if(b.inf) return a;
if(Fq_eq(a.z, Fq_ZERO)) {
a.x = b.x;
a.y = b.y;
a.z = Fq_ONE;
return a;
}
Fq z1z1 = Fq_sqr(a.z);
Fq u2 = Fq_mul(b.x, z1z1);
Fq s2 = Fq_mul(Fq_mul(b.y, a.z), z1z1);
if(Fq_eq(a.x, u2) && Fq_eq(a.y, s2))
return G1_double(a);
else {
Fq h = Fq_sub(u2, a.x); // H = U2-X1
Fq hh = Fq_sqr(h); // HH = H^2
Fq i = Fq_double(hh); i = Fq_double(i); // I = 4*HH
Fq j = Fq_mul(h, i); // J = H*I
Fq r = Fq_sub(s2, a.y); r = Fq_double(r); // r = 2*(S2-Y1)
Fq v = Fq_mul(a.x, i);
G1_projective ret;
// X3 = r^2 - J - 2*V
ret.x = Fq_sub(Fq_sub(Fq_sqr(r), j), Fq_double(v));
// Y3 = r*(V-X3)-2*Y1*J
j = Fq_mul(a.y, j); j = Fq_double(j);
ret.y = Fq_sub(Fq_mul(Fq_sub(v, ret.x), r), j);
// Z3 = (Z1+H)^2-Z1Z1-HH
ret.z = Fq_add(a.z, h); ret.z = Fq_xsub(Fq_sub(Fq_sqr(ret.z), z1z1), hh);
return ret;
}
}
The strange thing is that the “hh” variable of the “ret.z = Fq_xsub(Fq_sub(Fq_sqr(ret.z), z1z1), hh);” is NOT correct.
But if I removed some logic ( for example, remove ret.y calculation), the “ret.z” can calculated correctly.
Could you please give me some suggestion about this kind of error? Thanks a lot :)