7 Replies Latest reply on May 27, 2010 6:55 AM by Fuxianjun

    Help Me Understand The Matrix Multiplication Sample Kernel!

    Fuxianjun

      I have many problems about the matrix mulplication sample.I appreciate you to help me answer the following questions.

      1,Please help me understand the matrix multiplication sample kernel! It's better to explain in detail.

      2.When use this function clGetKernelWorkGroupInfo() to query CL_KERNEL_WORK_GROUP_SIZE in the matrix multiplication sample kernel, it return 256, dose it means that if I query CL_KERNEL_WORK_GROUP_SIZE in anoter kernel with the parameter of float type(in the sample kernel,the parameter type is float4) , the return will be 1024 because of float4 consist of 4 floats?

      3. Dose  CL_KERNEL_WORK_GROUP_SIZE means how many work-items a work-group contains ? In the case of 2D calculation, if CL_KERNEL_WORK_GROUP_SIZE is 256, dose it means the blocksize must be 16, and there are 16*16 work-items in the work-group?

      4.When I change the size of the two multiplied matrix to be not multiples of blocksize ,it returns a zero matrix, why ?  How does OPENCL allocate input matrix into work-group or work-item?

        • Help Me Understand The Matrix Multiplication Sample Kernel!
          nou

          2. no. CL_KERNEL_WORK_GROUP_SIZE depend only on resource usage such as registry.

          3. yes. it means maximum of work items in workgroup

          4. global work size must be multiple of block size. if not it is error and clEnqueueNDRande() should return CL_INVALID_WORK_GROUP_SIZE

            • Help Me Understand The Matrix Multiplication Sample Kernel!
              Fuxianjun

               

              Originally posted by: nou 2. no. CL_KERNEL_WORK_GROUP_SIZE depend only on resource usage such as registry.

              3. yes. it means maximum of work items in workgroup

              4. global work size must be multiple of block size. if not it is error and clEnqueueNDRande() should return CL_INVALID_WORK_GROUP_SIZE

              I want to use matrix with any size to multiply,what can I do? If matrix size is not multiple of block size(assume it to be 4), I fill up it with zeroes until the size is proper:

              123

              245

              fill up with zero:

              1230

              2450

              0000

              0000

              but it still return an entire zero matrix. why ?

            • Help Me Understand The Matrix Multiplication Sample Kernel!
              Fuxianjun

              Here's the kernel.

              This line:int temp = widthA / 4; , dose temp means how many float4 in one row of matrix A ?

              This line:for(int i = 0; i < (temp / get_local_size(0)); i++) , dose it means the number of float4 in a work-item is temp / get_local_size(0)  ?

               

              _kernel void mmmKernel_local(__global float4 *matrixA, __global float4 *matrixB, __global float4* matrixC, int widthA, __local float4 *blockA) { int blockPos = get_local_id(0) + get_local_size(0) * (get_local_id(1) << TILEY_SHIFT); //Should be : localId * (TILEX / 4) (float4) /* Position of thread will be according to the number of values it writes i.e TILE size */ int globalPos = get_global_id(0) + (get_global_id(1) << TILEY_SHIFT) * get_global_size(0); /* Each thread writes 4 float4s */ float4 sum0 = (float4)(0); float4 sum1 = (float4)(0); float4 sum2 = (float4)(0); float4 sum3 = (float4)(0); int temp = widthA / 4; /* This loop runs for number of blocks of A in horizontal direction */ for(int i = 0; i < (temp / get_local_size(0)); i++) { /* Calculate global ids of threads from the particular block to load from matrix A depending on i */ int globalPosA = i * get_local_size(0) + get_local_id(0) + (get_global_id(1) << TILEY_SHIFT) * temp; /* Load values in blockA from matrixA */ blockA[blockPos] = matrixA[globalPosA]; blockA[blockPos + get_local_size(0)] = matrixA[globalPosA + temp]; blockA[blockPos + 2 * get_local_size(0)] = matrixA[globalPosA + 2 * temp]; blockA[blockPos + 3 * get_local_size(0)] = matrixA[globalPosA + 3 * temp]; barrier(CLK_LOCAL_MEM_FENCE); /* Calculate global ids of threads from the particular block to load from matrix B depending on i */ int globalPosB = get_global_id(0) + ((i * get_local_size(0)) << TILEY_SHIFT) * get_global_size(0); /* This loop runs for number of threads in horizontal direction in the block of A */ for(int j = 0; j < get_local_size(0) * 4; j=j+4) { /* Load 4 float4s from blockA : access patters = strided from local memory */ float4 tempA0 = blockA[(j >> 2) + get_local_id(1) * TILEY * get_local_size(0)]; float4 tempA1 = blockA[(j >> 2) + (get_local_id(1) * TILEY + 1) * get_local_size(0)]; float4 tempA2 = blockA[(j >> 2) + (get_local_id(1) * TILEY + 2) * get_local_size(0)]; float4 tempA3 = blockA[(j >> 2) + (get_local_id(1) * TILEY + 3) * get_local_size(0)]; /* Load corresponding values from matrixB, access pattern = linear from global memory */ float4 tempB0 = matrixB[globalPosB + j * get_global_size(0)]; //Should be localId.x * (TILEX / 4) float4 tempB1 = matrixB[globalPosB + (j + 1) * get_global_size(0)]; float4 tempB2 = matrixB[globalPosB + (j + 2) * get_global_size(0)]; float4 tempB3 = matrixB[globalPosB + (j + 3) * get_global_size(0)]; sum0.x += tempA0.x * tempB0.x + tempA0.y * tempB1.x + tempA0.z * tempB2.x + tempA0.w * tempB3.x; sum0.y += tempA0.x * tempB0.y + tempA0.y * tempB1.y + tempA0.z * tempB2.y + tempA0.w * tempB3.y; sum0.z += tempA0.x * tempB0.z + tempA0.y * tempB1.z + tempA0.z * tempB2.z + tempA0.w * tempB3.z; sum0.w += tempA0.x * tempB0.w + tempA0.y * tempB1.w + tempA0.z * tempB2.w + tempA0.w * tempB3.w; sum1.x += tempA1.x * tempB0.x + tempA1.y * tempB1.x + tempA1.z * tempB2.x + tempA1.w * tempB3.x; sum1.y += tempA1.x * tempB0.y + tempA1.y * tempB1.y + tempA1.z * tempB2.y + tempA1.w * tempB3.y; sum1.z += tempA1.x * tempB0.z + tempA1.y * tempB1.z + tempA1.z * tempB2.z + tempA1.w * tempB3.z; sum1.w += tempA1.x * tempB0.w + tempA1.y * tempB1.w + tempA1.z * tempB2.w + tempA1.w * tempB3.w; sum2.x += tempA2.x * tempB0.x + tempA2.y * tempB1.x + tempA2.z * tempB2.x + tempA2.w * tempB3.x; sum2.y += tempA2.x * tempB0.y + tempA2.y * tempB1.y + tempA2.z * tempB2.y + tempA2.w * tempB3.y; sum2.z += tempA2.x * tempB0.z + tempA2.y * tempB1.z + tempA2.z * tempB2.z + tempA2.w * tempB3.z; sum2.w += tempA2.x * tempB0.w + tempA2.y * tempB1.w + tempA2.z * tempB2.w + tempA2.w * tempB3.w; sum3.x += tempA3.x * tempB0.x + tempA3.y * tempB1.x + tempA3.z * tempB2.x + tempA3.w * tempB3.x; sum3.y += tempA3.x * tempB0.y + tempA3.y * tempB1.y + tempA3.z * tempB2.y + tempA3.w * tempB3.y; sum3.z += tempA3.x * tempB0.z + tempA3.y * tempB1.z + tempA3.z * tempB2.z + tempA3.w * tempB3.z; sum3.w += tempA3.x * tempB0.w + tempA3.y * tempB1.w + tempA3.z * tempB2.w + tempA3.w * tempB3.w; } } /* Write 16 values to matrixC */ matrixC[globalPos] = sum0; matrixC[globalPos + get_global_size(0)] = sum1; matrixC[globalPos + 2 * get_global_size(0)] = sum2; matrixC[globalPos + 3 * get_global_size(0)] = sum3; }