mmarc

Incorrect result from AMD OpenCL BLAS trmm

Discussion created by mmarc on Jan 8, 2011
Latest reply on Jan 12, 2011 by mmarc
trmm is the one function that works, but the result is incorrect

Hi,

 

I tried some tests with clAmdBlasStrmm and confused with result. It turns that it gives incorrect result.

 

The clAmdBlasStrmm is invoked in the following way:

        clAmdBlasOrder order = clAmdBlasColumnMajor;
        clAmdBlasSide side = clAmdBlasLeft;
        clAmdBlasUplo uploA = clAmdBlasUpper;
        clAmdBlasTranspose transA = clAmdBlasNoTrans;
        clAmdBlasDiag diagA = clAmdBlasNonUnit;
        size_t lda = M, ldb = N;
        result = clAmdBlasStrmm(
            order, side, uploA, transA, diagA, M, N,
            alpha, bufA, lda, bufB, ldb, 1, &queue,
            0, NULL, &event);

After that, using copies of original A1 and B1 I compute result using trmm_:

        char side = 'L';
        char uploA = 'U';
        char transA = 'N';
        char diagA = 'N';
        int m = (int)M, n = (int)N;
        int lda = m, ldb = n;
        strmm_(&side, &uploA, &transA, &diagA, &m, &n,
            &alpha, A2, &lda, B2, &ldb);

 

Here's a test case. Input A and B matrices:

0.840188 0.798440 0.335223     0.553970 0.364784 0.916195
0.394383 0.911647 0.768230     0.477397 0.513401 0.635712
0.783099 0.197551 0.277775     0.628871 0.952230 0.717297

clAmdBlasStrmm(A, B) output from AMD OpenCL BLAS

0.840188 0.798440 0.335223     1.057424 1.035616 1.517808
0.394383 0.911647 0.768230     1.136811 1.343436 1.491925
0.783099 0.197551 0.277775     0.702808 0.651591 1.042304

strmm(A, B) output from CPU Netlib BLAS

0.840188 0.798440 0.335223     1.057424 1.035616 1.517808
0.394383 0.911647 0.768230     0.918335 1.199572 1.130594
0.783099 0.197551 0.277775     0.174684 0.264505 0.199247

Extra check with Octave

octave:6> A = [0.840188, 0.798440, 0.335223; 0, 0.911647, 0.768230; 0, 0, 0.277775]
A =

   0.84019   0.79844   0.33522
   0.00000   0.91165   0.76823
   0.00000   0.00000   0.27777

octave:7> B = [0.553970, 0.364784, 0.916195; 0.477397, 0.513401, 0.635712; 0.628871, 0.952230, 0.717297]
B =

   0.55397   0.36478   0.91619
   0.47740   0.51340   0.63571
   0.62887   0.95223   0.71730

octave:8> C = A * B
C =

   1.05742   1.03562   1.51781
   0.91834   1.19957   1.13059
   0.17468   0.26451   0.19925

So, presuming trmm and octave give equal and very likely correct result, there is something wrong with AMD's implementation or I used wrong params for invocation. Could you please check?

 

Attaching complete source code for my test.

 

Thanks.

#include <assert.h> #include <sys/types.h> #include <stdio.h> #include <string.h> #include <clAmdBlas.h> #include "timing.h" // Maximum difference for comparison. float eps = 1e-6; // The CPU BLAS function. void strmm_( char* side, char* uploA, char* transA, char* diagA, int* M, int* N, float* alpha, float* A2, int* lda, float* B2, int* ldb); cl_float* gendata(size_t size) { cl_float* data = (cl_float*)malloc(size * sizeof(cl_float)); double drandmax = (double)RAND_MAX; for (int i = 0; i < size; i++) data[i] = (cl_float)(rand() / drandmax); return data; } int main(int argc, const char *argv[]) { if (argc != 4) { printf("Usage: %s <M> <N> <alpha>\n", argv[0]); return 0; } size_t M = atoi(argv[1]); assert(M >= 0); size_t N = atoi(argv[2]); assert(N >= 0); cl_float alpha = atof(argv[3]); // Setup OpenCL environment. cl_platform_id platform; cl_int err = clGetPlatformIDs(1, &platform, NULL); cl_device_id device; err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; props[1] = (cl_context_properties)platform; cl_context ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); cl_command_queue queue = clCreateCommandQueue(ctx, device, 0, &err); // Setup clAmdBlas. err = clAmdBlasSetup(); if (err != CL_SUCCESS) { printf("clAmdBlasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return err; } // Generate A and B matrices. printf("Generating test data ... "); struct time_t begin; get_time(&begin); cl_float* A1 = gendata(M * M); cl_float* A2 = (cl_float*)malloc(M * M * sizeof(cl_float)); memcpy(A2, A1, M * M * sizeof(cl_float)); cl_float* B1 = gendata(M * N); cl_float* B2 = (cl_float*)malloc(M * M * sizeof(cl_float)); memcpy(B2, B1, M * N * sizeof(cl_float)); struct time_t end; get_time(&end); printf("done in %f seconds\n", get_time_diff(&begin, &end)); // Output matrixes if they are small. if ((M <= 3) && (N <= 3)) { for (int i = 0; i < M; i++) { for (int j = 0; j < M; j++) printf("%f ", A1[i + j * M]); printf("\t"); for (int j = 0; j < N; j++) printf("%f ", B1[i + j * M]); printf("\n"); } } // Prepare OpenCL memory objects and place matrices inside them. cl_mem bufA = clCreateBuffer( ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, M * M * sizeof(*A1), (void*)A1, &err); cl_mem bufB = clCreateBuffer( ctx, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, M * N * sizeof(*B1), (void*)B1, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, M * M * sizeof(*A1), A1, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, M * N * sizeof(*B1), B1, 0, NULL, NULL); // Call clAmdBlas function. printf("Executing AMD BLAS OpenCL kernel ... "); cl_event event = NULL; cl_int result = CL_SUCCESS; get_time(&begin); { clAmdBlasOrder order = clAmdBlasColumnMajor; clAmdBlasSide side = clAmdBlasLeft; clAmdBlasUplo uploA = clAmdBlasUpper; clAmdBlasTranspose transA = clAmdBlasNoTrans; clAmdBlasDiag diagA = clAmdBlasNonUnit; size_t lda = M, ldb = N; result = clAmdBlasStrmm( order, side, uploA, transA, diagA, M, N, alpha, bufA, lda, bufB, ldb, 1, &queue, 0, NULL, &event); } if (result == CL_SUCCESS) { // Wait for calculations to be finished. err = clWaitForEvents(1, &event); get_time(&end); printf("done in %f seconds\n", get_time_diff(&begin, &end)); // Fetch results of calculations from GPU memory. err = clEnqueueReadBuffer( queue, bufB, CL_TRUE, 0, M * N * sizeof(*B1), B1, 0, NULL, NULL); } // Release OpenCL memory objects. clReleaseMemObject(bufB); clReleaseMemObject(bufA); // Finalize work with clAmdBlas. clAmdBlasTeardown(); // Release OpenCL working objects. clReleaseCommandQueue(queue); clReleaseContext(ctx); if (result != CL_SUCCESS) { printf("\nclAmdBlasStrmm() failed with %d\n", err); return result; } // Compute control result using the regular BLAS strmm. printf("Computing control result using CPU BLAS ... "); { char side = 'L'; char uploA = 'U'; char transA = 'N'; char diagA = 'N'; int m = (int)M, n = (int)N; int lda = m, ldb = n; strmm_(&side, &uploA, &transA, &diagA, &m, &n, &alpha, A2, &lda, B2, &ldb); } printf("done\n"); #define ABS(a) (((a) > 0) ? (a) : -(a)) // Compare AMD BLAS result with control result. printf("Comparing AMD BLAS result and control result ... "); for (int i = 0; i < M * M; i++) { if (ABS(A1[i] - A2[i]) > eps) { printf("A results differ @ %d : %f != %f\n", A1[i], A2[i]); break; } } for (int i = 0; i < M * N; i++) { if (ABS(B1[i] - B2[i]) > eps) { printf("B results differ @ %d : %f != %f\n", B1[i], B2[i]); break; } } printf("done\n"); // Output matrixes if they are small. if ((M <= 3) && (N <= 3)) { for (int i = 0; i < M; i++) { for (int j = 0; j < M; j++) printf("%f ", A1[i + j * M]); printf("\t"); for (int j = 0; j < N; j++) printf("%f ", B1[i + j * M]); printf("\n"); } } printf("\n"); if ((M <= 3) && (N <= 3)) { for (int i = 0; i < M; i++) { for (int j = 0; j < M; j++) printf("%f ", A2[i + j * M]); printf("\t"); for (int j = 0; j < N; j++) printf("%f ", B2[i + j * M]); printf("\n"); } } // Release A and B matrixes. free(A1); free(B1); free(A2); free(B2); return result; }

Outcomes