Hi guys,
I've been trying to write a kernel that multiplies a row vector by a matrix stored in CSR format, but it gives a different result each time it runs. The GPU is an AMD Radeon™ HD 7970.
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
/*
 * MatMulTF: accumulate (row vector) x (CSR matrix) into As.
 *
 * Each work-item handles one entry `gid` of the row structure: for every
 * stored element j in [row[gid], row[gid+1]) it adds A[j] * Rs[gid] to
 * As[col[j]].
 *
 *   A   - stored matrix values, indexed by the same j as `col`
 *   row - row offsets; row[gid] and row[gid+1] are both read
 *   col - column index of each stored value
 *   Rs  - input vector, one element per work-item
 *   As  - output vector; this kernel only ever adds to it
 *
 * NOTE(review): there is no bounds check, so the global work size is assumed
 * to equal the number of matrix rows and `row` to hold one more entry than
 * that - confirm against the host code.
 * NOTE(review): As is accumulated into, never cleared, so the host presumably
 * zeroes it before each launch - confirm.
 *
 * Several work-items may add into the same As element. The compare-and-swap
 * loop makes each individual addition atomic, but the order in which the
 * additions land is not fixed, and floating-point addition is not
 * associative, so the low-order bits of the result can differ between runs.
 */
__kernel void MatMulTF(__global float *A, __global int *row, __global int *col, __global float *Rs, volatile __global float *As) {
    int gid = get_global_id(0);
    int j = row[gid];
    int a1 = row[gid + 1];
    float Rsgid = Rs[gid];
    volatile __global unsigned int *pAs;
    /* Reinterprets a float's bits as a 32-bit integer for atomic_cmpxchg. */
    union {
        unsigned int intVal;
        float floatVal;
    } newVal, prevVal;

    for (; j < a1; j++) {
        /* Index the value and column arrays with j; using the bare pointers
         * here is a type error. */
        float rp = A[j] * Rsgid;
        int colj = col[j];

        pAs = (volatile __global unsigned int *)(&As[colj]);
        do {
            /* Read the current bits through the same pointer that is handed
             * to atomic_cmpxchg, so the expected value and the compared
             * value come from one access path. */
            prevVal.intVal = *pAs;
            newVal.floatVal = prevVal.floatVal + rp;
            /* Retry if another work-item changed As[colj] in the meantime. */
        } while (atomic_cmpxchg(pAs, prevVal.intVal, newVal.intVal) != prevVal.intVal);
    }
}
Can anybody help me? Thanks.