empty_knapsack

Has anyone been able to reach the theoretical peak FLOPS on the 6990?

Discussion created by empty_knapsack on Apr 10, 2011
Latest reply on Apr 15, 2011 by rollyng
Throttling issues?

Leaving aside the problems with using both cores of the 6990 under OpenCL, I still cannot get the full expected performance even when using a single core.

For example, my application uses only integer calculations. From the ALU utilization ratio and the number of iterations it is possible to predict the final calculation speed very closely. However, this works only for the 4xxx and 5xxx series and does not work for the 6990 (yes, I'm aware that it is VLIW4, and that is taken into account): it runs about 10% slower than it should.

Moreover, I decided to check the 6990's performance with a simple FLOPS counter program I wrote a year ago (code attached). On a 5770 it behaves as follows: the first calCtxRunProgramGrid() takes a little more time than the following ones; I assume it is a "warm-up" run. All other calCtxRunProgramGrid() invocations take more or less constant time.

2 func calls, #MAD = 5896, code size = 48272 : 1315.985 1315.742 1322.605 1321.548 1319.723 1321.008 1321.626 1321.317 1320.905 1321.265 1320.147 GFLOPS: 1320.588 97.10%


These are the GFLOPS values measured after each kernel invocation.

 

For the 6990 it looks completely different. At the default

830Mhz clock/Theoretical peak SPFP GFLOPS = 2549.760

2 func calls, #MAD = 5896, code size = 48400 : 2425.376 2417.973 2097.518 2103.431 2096.177 2093.983 2095.793 2094.881 2096.612 2092.956 2096.214 GFLOPS: 2128.554 83.48%

760Mhz/Theoretical peak SPFP GFLOPS = 2334.720

2 func calls, #MAD = 5896, code size = 48400 : 2228.522 2279.260 2138.564 2092.016 2098.774 2101.159 2096.866 2099.390 2096.705 2095.216 2100.299 GFLOPS: 2119.825 90.80%

725Mhz/Theoretical peak SPFP GFLOPS = 2227.200:

2 func calls, #MAD = 5896, code size = 48400 : 2132.960 2173.636 2163.264 2097.450 2093.711 2095.024 2093.902 2093.104 2095.259 2100.797 2094.045 GFLOPS: 2110.019 94.74%

700Mhz/Theoretical peak SPFP GFLOPS = 2150.400

2 func calls, #MAD = 5896, code size = 48400 : 2053.042 2098.898 2099.732 2100.816 2090.898 2094.478 1965.949 1855.290 2095.923 2101.776 2088.555 GFLOPS: 2059.232 95.76%

650Mhz/Theoretical peak SPFP GFLOPS = 1996.800

2 func calls, #MAD = 5896, code size = 48400 : 1912.701 1952.603 1942.070 1949.367 1951.344 1952.732 1952.393 1952.856 1949.598 1954.063 1953.557 GFLOPS: 1951.058 97.71%

 So only downclocking to 650-700 MHz makes the measured FLOPS roughly match the theoretical value. As this looks like an overheating problem, I manually increased the fan speed to 100% (yes, it's very noisy at this level!). However, that doesn't change the situation at all. Only the first 2-4 kernel invocations run at the expected speed; after that, performance drops to the level of ~700 MHz.

 

My guess is that this is internal throttling control implemented in the 6990, so the claimed 830 MHz in reality becomes ~700 MHz no matter how good your cooling system is.

 

As these results are really disappointing, I hope that I have missed something; comments from AMD engineers are more than welcome.

/***************************************************************************** ATI GPU benchmarker/tester This code written by [Ivan Golubev, http://www.golubev.com] in 2010-2011 and placed in public domain *****************************************************************************/ // <windows.h> included only for // QueryPerformanceCounter() & QueryPerformanceFrequency() // to have good resolution timer #include <windows.h> #include <stdio.h> #include <assert.h> #include <conio.h> #include <stdlib.h> #include <cal.h> #include <calcl.h> // if there N GPUs at system -- define from 0 to N-1 #define DEVICENO 0 #define THREADS_PER_GROUP 64 // grid size #define DIM_X 2048 #define DIM_Y DIM_X // no of function calls #define NC_STARTS 2 #define NC_ENDS (NC_STARTS + 1) // no of MADS inside functions #define NMADS_STARTS 120 #define NMADS_ENDS (NMADS_STARTS + 256) #define NMADS_STEP 32 int madcounter; int codelen; static void __cdecl __logger(const CALchar *msg) { if (strstr(msg, ": MULADD")) madcounter++; if (strncmp(msg, "CodeLen", 7) == 0) { if (sscanf(msg, "CodeLen\t\t\t=%d;", &codelen) != 1) printf("Unknown Code Len: %s", msg); } // fprintf(stdout, msg); } void addline(char **p, int *npos, int *nmax, char *s) { int len = strlen(s); if ((*npos + len) > *nmax) { *nmax += 65536 + len; *p = (char *)realloc(*p, *nmax); } memcpy(*p + *npos, s, len); *npos += len; } // generate kernel with (<ncalls> * <nmads> * 4 * 4 + 8) MADs each char *genkernel(int ncalls, int nmads) { char s[1024]; char *pKernel = NULL; int npos = 0; int nmax = 0; addline(&pKernel, &npos, &nmax, "il_cs_2_0\n"); sprintf(s, "dcl_num_thread_per_group %d\n", THREADS_PER_GROUP); addline(&pKernel, &npos, &nmax, s); addline(&pKernel, &npos, &nmax, "\n"); addline(&pKernel, &npos, &nmax, "dcl_literal l1, 1.0, 2.0, 3.0, 4.0\n"); addline(&pKernel, &npos, &nmax, "dcl_literal l2, 4.0, 2.0, 3.7, 4.7\n"); addline(&pKernel, &npos, &nmax, "dcl_literal l3, 1.1, 7.0, 8.0, 9.0\n"); addline(&pKernel, &npos, &nmax, 
"dcl_literal l4, 1.2, 2.0, 3.4, 4.2\n"); addline(&pKernel, &npos, &nmax, "\n"); addline(&pKernel, &npos, &nmax, "mov r10.x,vaTid0.x\n"); addline(&pKernel, &npos, &nmax, "itof r0.x,r10.x\n"); addline(&pKernel, &npos, &nmax, "add r0.y,r0.x,l1.y\n"); addline(&pKernel, &npos, &nmax, "add r0.z,r0.x,l1.z\n"); addline(&pKernel, &npos, &nmax, "add r0.w,r0.x,l1.w\n"); addline(&pKernel, &npos, &nmax, "add r1,r0,l2\n"); addline(&pKernel, &npos, &nmax, "add r2,r1,l3\n"); addline(&pKernel, &npos, &nmax, "add r3,r1,l4\n"); addline(&pKernel, &npos, &nmax, "add r10,r0,l4\n"); addline(&pKernel, &npos, &nmax, "add r11,r1,l3\n"); addline(&pKernel, &npos, &nmax, "add r12,r2,l2\n"); addline(&pKernel, &npos, &nmax, "add r13,r3,l1\n"); addline(&pKernel, &npos, &nmax, "\n"); for (int i=0; i<ncalls; i++) addline(&pKernel, &npos, &nmax, "call 10\n"); addline(&pKernel, &npos, &nmax, "mad r0,r0,r2,r3\n"); addline(&pKernel, &npos, &nmax, "mad r0,r0,r10,r12\n"); addline(&pKernel, &npos, &nmax, "\n"); addline(&pKernel, &npos, &nmax, "mov r5,cb0[0]\n"); addline(&pKernel, &npos, &nmax, "ieq r4,r0,r5\n"); addline(&pKernel, &npos, &nmax, "ieq r14,r10,r5\n"); addline(&pKernel, &npos, &nmax, "ior r4,r4,r14\n"); addline(&pKernel, &npos, &nmax, "ior r6.x,r4.x,r4.y\n"); addline(&pKernel, &npos, &nmax, "ior r6.z,r4.z,r4.w\n"); addline(&pKernel, &npos, &nmax, "ior r6.x,r6.x,r6.z\n"); addline(&pKernel, &npos, &nmax, "\n"); addline(&pKernel, &npos, &nmax, "if_logicalnz r4.x\n"); addline(&pKernel, &npos, &nmax, " mov g[0].x___,r0.x\n"); addline(&pKernel, &npos, &nmax, " mov g[0]._y__,r10.x\n"); addline(&pKernel, &npos, &nmax, " mov g[1],r4\n"); addline(&pKernel, &npos, &nmax, "endif\n"); addline(&pKernel, &npos, &nmax, "\n"); addline(&pKernel, &npos, &nmax, "endmain\n"); addline(&pKernel, &npos, &nmax, "\n"); addline(&pKernel, &npos, &nmax, "func 10\n"); for (int i=0; i<nmads; i++) { addline(&pKernel, &npos, &nmax, "mad r0,r0,r0,r1\n"); addline(&pKernel, &npos, &nmax, "mad r2,r2,r2,r3\n"); addline(&pKernel, 
&npos, &nmax, "mad r10,r10,r10,r11\n"); addline(&pKernel, &npos, &nmax, "mad r12,r12,r12,r13\n"); } addline(&pKernel, &npos, &nmax, "ret\n"); addline(&pKernel, &npos, &nmax, "\n"); addline(&pKernel, &npos, &nmax, "end\n"); if ((npos + 1) > nmax) { nmax += 16; pKernel = (char *)realloc(pKernel, nmax); } pKernel[npos] = 0; return pKernel; } int main(int argc, char** argv) { if (calInit() != CAL_RESULT_OK) return 1; { CALuint major, minor, imp; calGetVersion(&major, &minor, &imp); printf("CAL v%d.%d.%d\n", major, minor, imp); calclGetVersion(&major, &minor, &imp); printf("Compiler v%d.%d.%d\n", major, minor, imp); } int deviceno = DEVICENO; CALuint numDevices = 0; if (calDeviceGetCount(&numDevices) != CAL_RESULT_OK) return 1; printf("%d device(s) found.\n", numDevices); CALdevice device = 0; if (calDeviceOpen(&device, deviceno) != CAL_RESULT_OK) { printf("calDeviceOpen() failed.\n"); return 1; } CALdeviceinfo info; if (calDeviceGetInfo(&info, deviceno) != CAL_RESULT_OK) { printf("calDeviceGetInfo() failed.\n"); return 1; } CALcontext ctx = 0; calCtxCreate(&ctx, device); CALdeviceattribs attr; attr.struct_size = sizeof(attr); if (calDeviceGetAttribs(&attr, deviceno) != CAL_RESULT_OK) { attr.engineClock = 0; attr.numberOfSIMD = 0; } printf("%d SIMD %d clock, threads per group = %d\n", attr.numberOfSIMD, attr.engineClock, THREADS_PER_GROUP); int nalu = 5; if (info.target == CAL_TARGET_CAYMAN) nalu = 4; // 2 ops * # of SIMD * # TP per SIMD * # ALUs per TP * engine clock in Ghz double peakgflops = 2 * attr.numberOfSIMD * 16 * nalu * attr.engineClock / 1000.0; if (info.target == CAL_TARGET_710 || info.target == CAL_TARGET_730 || info.target == CAL_TARGET_CEDAR || info.target == CAL_TARGET_WRESTLER) peakgflops /= 2; // they have only 8 thread processors per SIMD printf("Theoretical peak SPFP GFLOPS = %.3lf\n", peakgflops); CALobject obj = NULL; CALimage image = NULL; CALlanguage lang = CAL_LANGUAGE_IL; int ncalls; int nmads; char *pKernel = NULL; CALresource localRes = 0; 
CALresource constRes = 0; CALmem localMem = 0; CALmem constMem = 0; if (calResAllocLocal2D(&localRes, device, DIM_X, DIM_Y, CAL_FORMAT_UINT_4, CAL_RESALLOC_GLOBAL_BUFFER) != CAL_RESULT_OK) { printf("Error Local2D [%s]\n", calGetErrorString()); } if (calResAllocLocal1D(&constRes, device, 4, CAL_FORMAT_UINT_4, 0) != CAL_RESULT_OK) { printf("Error Local1D [%s]\n", calGetErrorString()); return 1; } unsigned int* constPtr = NULL; CALuint constPitch = 0; calResMap((CALvoid**)&constPtr, &constPitch, constRes, 0); constPtr[ 0] = constPtr[ 1] = constPtr[ 2] = constPtr[ 3] = -12345789.123f; calResUnmap(constRes); calCtxGetMem(&localMem, ctx, localRes); calCtxGetMem(&constMem, ctx, constRes); // main cycle for (ncalls = NC_STARTS; ncalls <= NC_ENDS; ncalls++) for (nmads = NMADS_STARTS; nmads < NMADS_ENDS; nmads += NMADS_STEP) { pKernel = genkernel(ncalls, nmads); /* if (ncalls == 2 && nmads == 184) { FILE *f = fopen("kernel.il", "w"); fwrite(pKernel, 1, strlen(pKernel), f); fclose(f); } */ if (calclCompile(&obj, lang, pKernel, info.target) != CAL_RESULT_OK) { fprintf(stdout, "Kernel compilation failed. Exiting.\n"); return 1; } if (calclLink(&image, &obj, 1) != CAL_RESULT_OK) { fprintf(stdout, "Kernel linking failed. 
Exiting.\n"); return 1; } free(pKernel); madcounter = 0; calclDisassembleImage(image, (CALLogFunction)__logger); printf("%d func calls, #MAD = %d, code size = %d : ", ncalls, madcounter, codelen); if (madcounter != ncalls * nmads * 4 * 4 + 8) printf("***Number of MADs in compiled kernel doesn't match expected value***\n"); CALmodule module = 0; calModuleLoad(&module, ctx, image); CALfunc func = 0; CALname constName = 0; CALname localName = 0; calModuleGetEntry(&func, ctx, module, "main"); calModuleGetName(&constName, ctx, module, "cb0"); if (calModuleGetName(&localName, ctx, module, "g[]") != CAL_RESULT_OK) { printf("Error in getname [%s]\n", calGetErrorString()); } calCtxSetMem(ctx, localName, localMem); calCtxSetMem(ctx, constName, constMem); // run kernel for 10x times and get average flops value int counter = 0; int countermax = 10; double avflops = 0; do { CALprogramGrid pg; pg.func = func; pg.flags = 0; pg.gridBlock.width = THREADS_PER_GROUP; pg.gridBlock.height = 1; pg.gridBlock.depth = 1; pg.gridSize.width = (DIM_X * DIM_Y + pg.gridBlock.width - 1) / pg.gridBlock.width; pg.gridSize.height = 1; pg.gridSize.depth = 1; LARGE_INTEGER qFrequency, qStart, qEnd; QueryPerformanceFrequency(&qFrequency); QueryPerformanceCounter(&qStart); CALevent e = 0; if (calCtxRunProgramGrid(&e, ctx, &pg) != CAL_RESULT_OK) { printf("error in run [%s]\n", calGetErrorString()); return 1; } while (calCtxIsEventDone(ctx, e) == CAL_RESULT_PENDING); QueryPerformanceCounter(&qEnd); double OpsCount = ((double)(madcounter * 2)) * DIM_X * DIM_Y; double ElapsedTime = double(qEnd.QuadPart - qStart.QuadPart) / qFrequency.QuadPart; double GFlops = OpsCount / ElapsedTime / 1e9; printf("%.3lf ", GFlops); // exclude first execution as warm-up run if (counter) avflops += GFlops; if (++counter > countermax) break; } while (1); printf("GFLOPS: %.3lf %.2lf%%\n", avflops / (counter - 1), avflops * 100.0 / (peakgflops * (counter - 1)) ); calModuleUnload(ctx, module); calclFreeImage(image); 
calclFreeObject(obj); } calCtxReleaseMem(ctx, constMem); calCtxReleaseMem(ctx, localMem); calResFree(constRes); calResFree(localRes); calCtxDestroy(ctx); calDeviceClose(device); calShutdown(); return 0; }

Outcomes