Comrades, I'm getting frustrated about math accuracy in OpenCL.
Here is the problem, simple multiplication:
0.647059 * -1.148511 = -1.040416
WHY IS IT SO?
I suppose it is some compiler setting, but I don't know which one o_O
You need to post some code that demonstrates this problem.
Here it is: part of a pattern-recognition kernel that performs matrix-vector multiplication. The printf outputs the results of multiplying the vector by the last row of the matrix.
/*Kernelzzzz*/
/*
 * NOTE(review): this listing looks damaged as posted. The bare `fin` and `A1`
 * statements, the use of `fin` as a scalar although it is declared as an
 * array, and the missing closing brace of the kernel suggest that array
 * subscripts and possibly whole lines were lost. Confirm against the
 * original source before compiling.
 */
/* Compile-time sizes. lL1 sizes the local arrays and bounds the loop below;
   lL2 and lL3 are not referenced in the visible part of the kernel. */
#define lL1 130
#define lL2 100
#define lL3 36
/* Enable the cl_amd_printf extension; printf is called inside the kernel. */
#pragma OPENCL EXTENSION cl_amd_printf:enable
/*
 * recKernel: described by its author as part of a pattern-recognition kernel
 * that performs matrix-vector multiplication. In the visible part only IW
 * is read; output, input, LW1, LW2, B1, B2, B3, L1, L2 and L3 are not
 * referenced here.
 */
__kernel void recKernel(__global float * output, __global uchar * input,
__global float * IW, __global float * LW1,
__global float * LW2, __global float * B1,
__global float * B2, __global float * B3, uint L1,
uint L2, uint L3)
{
/* Index of this work-item in dimension 0. */
uint x = get_global_id(0);
/* Two __local arrays of lL1 floats each. */
__local float A1[lL1];
__local float fin[lL1];
uint i;
float A;
/* NOTE(review): incomplete statement; presumably the initialisation of fin
   was lost here -- TODO confirm. */
fin
/* Only work-items with an index below lL1 take part. */
if(x < lL1){
/* Offset this work-item's private copy of the IW pointer by x elements. */
IW += x;
A = 0;//B1
#pragma unroll 2
/* lL1 iterations; IW advances by lL1 elements per iteration (see below). */
for(i = 0; i < lL1; i++){
/* As written this overwrites A on every iteration; the accumulating form
   is the commented-out line that follows. */
A = fin * *IW;
//A += fin * *IW;
/* Debug output from work-item 129 only: both factors and their product. */
if(x==129)
printf("%f * %f = %f\n", fin, *IW, fin * *IW);
/* Step to the next element, lL1 floats further on. */
IW += lL1;
}
/* NOTE(review): incomplete statement; presumably the result A was stored
   into A1 here -- TODO confirm. */
A1
}
UPD: I have triple-checked my source and found a couple of errors there that affected accuracy. Now everything is OK. Thanks for your attention!
Originally posted by: player999 Comrads, I'm geting frustrated about math accuracy in OpenCL
Here is the problem, simple multiplication:
0.647059 * -1.148511 = -1.040416
WHY IT IS SO?
That ISN'T a precision issue unless something has gone horribly, horribly wrong. Plus, unless you have fast relaxed math enabled, you shouldn't have concerns with precision, at least not at these levels. To be absolutely sure, use the flag "-cl-opt-disable" to disable all optimizations.
Some initial thoughts: you seem to be trying to unroll the loop by a factor of 2 with a pragma, for a loop with 130 iterations. Why? Also, are you sure you are not overwriting some of your local variables with calculations from the surrounding calls?
OK, I've understood it already, and I have already disabled the math optimizations.
Why am I trying to unroll? I don't know -- the profiler showed that unrolling adds a few nanoseconds 🙂 Actually, all three sequential matrix-vector multiplications take 92 microseconds. About overwriting -- no, I do not. Now my calculations are precise enough and the code is not buggy.
Nevertheless, I have one more issue. I've sequentially enqueued a write to GPU memory (Loveland, AMD Fusion E-350) and a kernel execution. When I measure the execution times of the copy and of the kernel separately, I get 35 us and 92 us respectively, but when I measure the overall execution time (from the start of the copy operation to the end of the kernel execution), I get 1.4 ms. As mentioned before, all measurements are carried out by profiler function calls. Please help me understand why.
UPD: Oh, I almost forgot. The source!
/*
 * Upload the input vector, run recKernel on it, wait for the kernel to finish
 * and, when PROFILING is defined, print the device timings of both commands.
 * Error checks are compiled in only when DEBUG is defined.
 */

/* Blocking write (CL_TRUE) of LAYER1 bytes from hostInput into devInput;
 * events[0] tracks the transfer. */
status = clEnqueueWriteBuffer(
    NN->commandQueue,
    NN->devInput,
    CL_TRUE,
    0,
    LAYER1 * sizeof(char),
    NN->hostInput,
    0,
    NULL,
    &events[0]);
#ifdef DEBUG
if (status != CL_SUCCESS)
{
    /* The message used to name clEnqueueReadBuffer, which is not the call
     * made above. */
    printf(
        "Error: clEnqueueWriteBuffer failed. "
        "(clEnqueueWriteBuffer)\n");
    return;
}
#endif

/* One-dimensional launch of 130 work-items; the work-group size is left to
 * the runtime (local size NULL). The kernel waits on the write, events[0]. */
globalThreads[0] = 130;
status = clEnqueueNDRangeKernel(
    NN->commandQueue,
    NN->recKernel,
    1,
    NULL,
    globalThreads,
    NULL,
    1,
    events,
    &events[1]);
#ifdef DEBUG
if (status != CL_SUCCESS)
{
    printf(
        "Error: Enqueueing kernel recKernel onto command queue. "
        "(clEnqueueNDRangeKernel)\n");
    printf("Error code: %d\n", status);
    return;
}
#endif

/* Block the host until the kernel has completed. */
status = clWaitForEvents(1, &events[1]);
#ifdef DEBUG
if (status != CL_SUCCESS)
{
    printf(
        "Error: Waiting for kernel run to finish. "
        "(clWaitForEvents)\n");
    return;
}
#endif

#ifdef PROFILING
/* Zero-initialised so that a failed query cannot print indeterminate values. */
cl_ulong startTime = 0;
cl_ulong endTime = 0;

/*
 * cl_ulong is a 64-bit unsigned type, so the differences are printed with
 * %llu through an explicit cast; the previous %d did not match the argument
 * type.
 */

/* Duration of the host-to-device copy. */
clGetEventProfilingInfo(events[0], CL_PROFILING_COMMAND_START,
                        sizeof(cl_ulong), &startTime, NULL);
clGetEventProfilingInfo(events[0], CL_PROFILING_COMMAND_END,
                        sizeof(cl_ulong), &endTime, NULL);
printf("Copy input: %llu ns\n", (unsigned long long)(endTime - startTime));

/* Duration of the kernel execution. */
clGetEventProfilingInfo(events[1], CL_PROFILING_COMMAND_START,
                        sizeof(cl_ulong), &startTime, NULL);
clGetEventProfilingInfo(events[1], CL_PROFILING_COMMAND_END,
                        sizeof(cl_ulong), &endTime, NULL);
printf("Kernel: %llu ns\n", (unsigned long long)(endTime - startTime));

/* From the start of the copy to the end of the kernel. This span also covers
 * whatever time passes between the two commands, so it can exceed the sum of
 * the two durations printed above. */
clGetEventProfilingInfo(events[0], CL_PROFILING_COMMAND_START,
                        sizeof(cl_ulong), &startTime, NULL);
clGetEventProfilingInfo(events[1], CL_PROFILING_COMMAND_END,
                        sizeof(cl_ulong), &endTime, NULL);
printf("Summary: %llu ns\n", (unsigned long long)(endTime - startTime));
#endif