
    Relation between FetchSize and FetchInsts in Stream Profiler

    dravisher
        • Relation between FetchSize and FetchInsts in Stream Profiler
          himanshu.gautam

          Dravisher,

          Please post the kernel code.

            • Relation between FetchSize and FetchInsts in Stream Profiler
              dravisher

              Here's the kernel code. The profiler output is for the LocalMem_Kernel kernel.

              //use double (64-bit) if floatTypeIsDouble is defined, else use float (32-bit).
              #ifdef floatTypeIsDouble
              #define floattype double
              #define floattypevec2 double2
              #define floattypevec3 double3
              #else
              #define floattype float
              #define floattypevec2 float2
              #define floattypevec3 float3
              #endif

              uint idx(uint3 gid)
              {
                  //This function returns the flat index in a flattened 3D array corresponding to the
                  //three provided coordinates.
                  return gid.x + get_global_size(0) * gid.y + get_global_size(0) * get_global_size(1) * gid.z;
              }

              uint idxL(uint2 coord)
              {
                  //This function returns the flat index in a flattened 2D array for use with a
                  //(wgSizeLocalMem_0)*(wgSizeLocalMem_1) matrix.
                  return coord.x + wgSizeLocalMem_0 * coord.y;
              }

              __kernel void Initialization_Kernel(global floattype *dataArr)
              {
                  //This kernel sets the initial wave function to be an initial disturbance only in the centre of the domain
                  uint3 gid = (uint3)(get_global_id(0), get_global_id(1), get_global_id(2));

                  //Assuming domain is 0-1 inclusive in all directions. +2 is because boundaries are not included in the execution domain.
                  floattypevec3 rvec = (floattypevec3)((gid.x+1)*h, (gid.y+1)*h, (gid.z+1)*h);

                  //Distance squared from center
                  floattype r = (rvec.x - 0.5) * (rvec.x - 0.5) + (rvec.y - 0.5) * (rvec.y - 0.5) + (rvec.z - 0.5) * (rvec.z - 0.5);

                  //This function is just a peak near the center
                  floattype func = 1.0 - 100 * r;

                  //If func is less than 0, set to 0. Else set to func.
                  dataArr[idx(gid)] = (func < 0) ? 0 : func;
              }

              __kernel void Fictitious_Points_Kernel(global floattype *dataArrFictitious, global floattype *dataArrCurrent)
              {
                  //This kernel creates the fictitious points needed for the first timestep.
                  uint3 gid = (uint3)(get_global_id(0), get_global_id(1), get_global_id(2));
                  uint3 gidtemp = gid;
                  floattype myval, myvalXp1, myvalXm1, myvalYp1, myvalYm1, myvalZp1, myvalZm1;

                  //Download my value
                  myval = dataArrCurrent[idx(gid)];

                  //Download values in x-direction
                  gidtemp.x = gid.x - 1;
                  myvalXm1 = gid.x ? dataArrCurrent[idx(gidtemp)] : 0;
                  gidtemp.x = gid.x + 1;
                  myvalXp1 = (gid.x + 1 - get_global_size(0)) ? dataArrCurrent[idx(gidtemp)] : 0;
                  gidtemp.x = gid.x;

                  //Download values in y-direction
                  gidtemp.y = gid.y - 1;
                  myvalYm1 = gid.y ? dataArrCurrent[idx(gidtemp)] : 0;
                  gidtemp.y = gid.y + 1;
                  myvalYp1 = (gid.y + 1 - get_global_size(1)) ? dataArrCurrent[idx(gidtemp)] : 0;
                  gidtemp.y = gid.y;

                  //Download values in z-direction
                  gidtemp.z = gid.z - 1;
                  myvalZm1 = gid.z ? dataArrCurrent[idx(gidtemp)] : 0;
                  gidtemp.z = gid.z + 1;
                  myvalZp1 = (gid.z + 1 - get_global_size(2)) ? dataArrCurrent[idx(gidtemp)] : 0;
                  gidtemp.z = gid.z;

                  dataArrFictitious[idx(gid)] = myval + 0.5 * CourantNumber * CourantNumber *
                      (myvalXp1 + myvalXm1 + myvalYp1 + myvalYm1 + myvalZp1 + myvalZm1 - 6 * myval);
              }

              __kernel void Simple_Kernel(global floattype *dataArrPrevious, global floattype *dataArrCurrent)
              {
                  //This kernel is the "simple" implementation with no local memory usage. Every work-item
                  //downloads its own neighbouring values. This is not expected to be efficient on any GPU.
                  uint3 gid = (uint3)(get_global_id(0), get_global_id(1), get_global_id(2));
                  uint3 gidtemp = gid;
                  floattype myval, myvalXp1, myvalXm1, myvalYp1, myvalYm1, myvalZp1, myvalZm1;

                  //Download values. myval is the value "belonging" to this kernel (i.e. the one being updated).
                  //myvalXp1 is the value at x+1, while myvalXm1 is at x-1. The same goes for myvalYp1 and so on.

                  //Download my value
                  myval = dataArrCurrent[idx(gid)];

                  //Download values in x-direction
                  gidtemp.x = gid.x - 1;
                  myvalXm1 = gid.x ? dataArrCurrent[idx(gidtemp)] : 0;
                  gidtemp.x = gid.x + 1;
                  myvalXp1 = (gid.x + 1 - get_global_size(0)) ? dataArrCurrent[idx(gidtemp)] : 0;
                  gidtemp.x = gid.x;

                  //Download values in y-direction
                  gidtemp.y = gid.y - 1;
                  myvalYm1 = gid.y ? dataArrCurrent[idx(gidtemp)] : 0;
                  gidtemp.y = gid.y + 1;
                  myvalYp1 = (gid.y + 1 - get_global_size(1)) ? dataArrCurrent[idx(gidtemp)] : 0;
                  gidtemp.y = gid.y;

                  //Download values in z-direction
                  gidtemp.z = gid.z - 1;
                  myvalZm1 = gid.z ? dataArrCurrent[idx(gidtemp)] : 0;
                  gidtemp.z = gid.z + 1;
                  myvalZp1 = (gid.z + 1 - get_global_size(2)) ? dataArrCurrent[idx(gidtemp)] : 0;
                  gidtemp.z = gid.z;

                  //Calculate the new stencil and upload
                  dataArrPrevious[idx(gid)] = 2 * myval - dataArrPrevious[idx(gid)] + CourantNumber * CourantNumber *
                      (myvalXp1 + myvalXm1 + myvalYp1 + myvalYm1 + myvalZp1 + myvalZm1 - 6 * myval);
              }

              __kernel void LocalMem_Kernel(global floattype *dataArrPrevious, global floattype *dataArrCurrent)
              {
                  //This kernel is the local memory implementation that tries to do memory operations
                  //in an efficient manner (especially for GPUs).
                  //Work-items share data in the x-y plane, but download data for themselves in the z direction.

                  //Local array to hold the shared stencil data (including values at the boundary).
                  local floattype sharedStencils[wgSizeLocalMem_0 * wgSizeLocalMem_1];

                  uint3 gid = (uint3)(get_global_id(0), get_global_id(1), get_global_id(2));
                  uint2 lid = (uint2)(get_local_id(0), get_local_id(1));
                  floattype myval, myvalXp1, myvalXm1, myvalYp1, myvalYm1, myvalZp1, myvalZm1;

                  //Download values. myvalZp1 is the value at z+1, while myvalZm1 is at z-1.

                  //Download my value
                  myval = sharedStencils[idxL(lid)] = dataArrCurrent[idx(gid)];

                  //Ensure all work-items have finished downloading values
                  barrier(CLK_LOCAL_MEM_FENCE);

                  //Handle boundary values
                  //negative x-direction
                  if(gid.x == 0)
                      //I am globally to the left, so boundary value is clamped at 0.
                      myvalXm1 = 0;
                  else if(lid.x == 0)
                      //I am just the leftmost in my group, so I need to download the value
                      myvalXm1 = dataArrCurrent[idx((uint3)(gid.x-1, gid.y, gid.z))];
                  else
                      //I am inside a group, get the value from local memory
                      myvalXm1 = sharedStencils[idxL((uint2)(lid.x-1, lid.y))];

                  //positive x-direction
                  if(gid.x == get_global_size(0) - 1)
                      //I am globally to the right, so boundary value is clamped at 0.
                      myvalXp1 = 0;
                  else if(lid.x == wgSizeLocalMem_0 - 1)
                      //I am just the rightmost in my group, so I need to download the value
                      myvalXp1 = dataArrCurrent[idx((uint3)(gid.x+1, gid.y, gid.z))];
                  else
                      //I am inside a group, get the value from local memory
                      myvalXp1 = sharedStencils[idxL((uint2)(lid.x+1, lid.y))];

                  //negative y-direction
                  if(gid.y == 0)
                      //I am globally at the top, so boundary value is clamped at 0.
                      myvalYm1 = 0;
                  else if(lid.y == 0)
                      //I am just the topmost in my group, so I need to download the value
                      myvalYm1 = dataArrCurrent[idx((uint3)(gid.x, gid.y-1, gid.z))];
                  else
                      //I am inside a group, get the value from local memory
                      myvalYm1 = sharedStencils[idxL((uint2)(lid.x, lid.y-1))];

                  //positive y-direction
                  if(gid.y == get_global_size(1) - 1)
                      //I am globally at the bottom, so boundary value is clamped at 0.
                      myvalYp1 = 0;
                  else if(lid.y == wgSizeLocalMem_1 - 1)
                      //I am just the lowermost in my group, so I need to download the value
                      myvalYp1 = dataArrCurrent[idx((uint3)(gid.x, gid.y+1, gid.z))];
                  else
                      //I am inside a group, get the value from local memory
                      myvalYp1 = sharedStencils[idxL((uint2)(lid.x, lid.y+1))];

                  //negative z-direction
                  if(gid.z == 0)
                      //I am globally at the front, so boundary value is clamped at 0.
                      myvalZm1 = 0;
                  else
                      //I am not at the front, need to download value
                      myvalZm1 = dataArrCurrent[idx((uint3)(gid.x, gid.y, gid.z-1))];

                  //positive z-direction
                  if(gid.z == get_global_size(2) - 1)
                      //I am globally at the back, so boundary value is clamped at 0.
                      myvalZp1 = 0;
                  else
                      //I am not at the back, need to download value
                      myvalZp1 = dataArrCurrent[idx((uint3)(gid.x, gid.y, gid.z+1))];

                  //Calculate new stencil and upload
                  dataArrPrevious[idx(gid)] = 2 * myval - dataArrPrevious[idx(gid)] + CourantNumber * CourantNumber *
                      (myvalXp1 + myvalXm1 + myvalYp1 + myvalYm1 + myvalZp1 + myvalZm1 - 6 * myval);
              }
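              Note that the kernels above rely on h, CourantNumber, wgSizeLocalMem_0 and wgSizeLocalMem_1 being defined at compile time. A minimal host-side sketch of how these might be passed as -D build options is shown below; the function name, variable names and default values are illustrative assumptions, not taken from the original project.

              /* Sketch only: assemble -D defines for the kernel source above and build.
                 All names and values here are illustrative assumptions. */
              #include <stdio.h>
              #include <CL/cl.h>

              cl_int build_wave_kernels(cl_program program, cl_device_id device,
                                        int useDouble, double h, double courant,
                                        int wgSize0, int wgSize1)
              {
                  char options[512];

                  /* e.g. "-D floatTypeIsDouble -D h=0.0078125 -D CourantNumber=0.5
                          -D wgSizeLocalMem_0=16 -D wgSizeLocalMem_1=16" */
                  snprintf(options, sizeof(options),
                           "%s -D h=%g -D CourantNumber=%g "
                           "-D wgSizeLocalMem_0=%d -D wgSizeLocalMem_1=%d",
                           useDouble ? "-D floatTypeIsDouble" : "",
                           h, courant, wgSize0, wgSize1);

                  /* Compile for the single target device with the assembled options. */
                  return clBuildProgram(program, 1, &device, options, NULL, NULL);
              }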

            • Relation between FetchSize and FetchInsts in Stream Profiler
              dravisher

              Yeah, I'm seeing a CacheHit of 100 (or very close to 100) for kernels that only ever read unique data, and I'm still seeing values for FetchSize that are far too low. For instance, the attached kernel (below), launched with 20480 work-items, is correctly reported to have 512 fetch instructions, but while FetchSize should be (20480 work-items) * (512 fetches) * (4 bytes/fetch) = 40960 KiB, the profiler reports 12872 KiB. CacheHit is 92.93.

              __kernel void kernel_scalar_linear(global float *scalarBuff, global float *out)
              {
                  int gid = get_global_id(0);
                  int gsize = get_global_size(0);
                  float sum = 0;

                  for(uint i = 0; i < 512; i++)
                      sum += scalarBuff[gid + gsize * i];

                  out[gid] = sum;
              }
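
              For reference, a quick sanity check of the arithmetic in the post above, comparing the total bytes requested by kernel_scalar_linear with the FetchSize figure the profiler reported. This is a standalone sketch; the 12872 KiB value is simply echoed from the post for comparison.

              /* Sketch: expected FetchSize for 20480 work-items each doing 512 float reads. */
              #include <stdio.h>

              int main(void)
              {
                  const long workItems    = 20480;  /* global size used in the run        */
                  const long fetches      = 512;    /* one float fetch per loop iteration */
                  const long bytesPerRead = 4;      /* sizeof(float)                      */

                  long totalBytes = workItems * fetches * bytesPerRead;  /* 41,943,040 bytes */
                  long totalKiB   = totalBytes / 1024;                   /* 40,960 KiB       */

                  printf("Expected FetchSize: %ld KiB\n", totalKiB);
                  printf("Reported FetchSize: %d KiB (about %.1f%% of expected)\n",
                         12872, 100.0 * 12872 / totalKiB);
                  return 0;
              }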