Archives Discussions

crucified · ‎01-18-2012

Need help with OpenCL code

I got extremely poor results in performance

Can you tell me what am I doing wrong?

The code is attached. If necessary, I could provide with cpp file and input file

// Calc.cpp : Defines the exported functions for the DLL application. // #include <iostream> #include <fstream> #include <vector> #include <complex> #include <CL/cl.hpp> #include <map> #include <omp.h> #include "linkedlist.h" #define EXPORT extern "C" __declspec (dllexport) #define eps 0.001 //???????? ??? ??????? ???? (?? ?????????. ????? ????? ???? ??? ???????????? ???????) #define geps 0.1 //???????? ????? //????????????? ???? #define TIP_USEL_BALANS 6 //???? ? ????????????? ??????? ?????????? #define TIP_USEL_CONST 3 //?????? ???? #define TIP_USEL_NO 0 //???? ? ?????????? ?? Qmin #define TIP_USEL_QMIN 4 //???? ? ?????????? ?? Qmax #define TIP_USEL_QMAX 5 //??????: //??? ??? #define ERR_OK 0 //?????? ????????? ?????? #define ERR_DATA_PARSE_ERROR -1 //?????? ???????? ?????????? #define ERR_CREATE_FROM_SOURCE -2 //?????? ?????? #define ERR_BUILD_PROGRAM -3 //?????? ???????? ???? #define ERR_CREATE_KERNEL -4 struct NebalansData { int nodeType; long double Qmax; long double Qmin; double Udefault; NebalansData(int type, long double Qmax, long double Qmin, double U) { this->nodeType = type; this->Qmax = Qmax; this->Qmin = Qmin; this->Udefault = U; } }; struct ecomplex { long double Re, Im; }; #pragma pack(2) struct Trdy { long Nqy; ecomplex Y; Trdy *next; }; struct Trdj { long Nqj; double j11, j12, j21, j22; Trdj *next; }; struct Usl{ int Nusl, Nrp, Itipy, Indy, Niyp, ipr; Trdy *Nhp; Trdj *Nhj, *Nhtw, *NKTW; long double Unom, Pn, Qn, Pn1, Qn1, Pg1, Qg1, Umod, Qmin, Qmax, Py, Qy, V, Vc, DV, Delta, Dc, DDelta, SUmod; ecomplex S, R, RT, U, SYc, SYN, SYR; }; struct MyStruct{ double x; MyStruct *next; }; #pragma region "Kernel" const char *source = "__kernel void Nebalans(" " __constant int *N," " __global float *U," " __global float *delta," " __global float *g_vals, " " __global float *b_vals, " " __global int *g_cols," " __global int *edges," " __global float *P," " __global float *Q," " __global float *dP, " " __global float *dQ) " "{ " " int i; "//?????? (? ???) " int j; " //?????? ???? (??????? ?????) " int index_i; " //????????? ?? ??????, ??? ???????? ???????????? ???????  " i = get_global_id(0); " //????? ?????? ????? ????????????, global id " int N2 = edges[i+1] - edges;" //???????? ???-?? ????????? ??? ????????? ? ?????? // //?????????? ?????? ??? ????????????? ???????? " index_i = edges; " " dP = U * U * g_vals[index_i]; " " dQ = -1 * U * U * b_vals[index_i]; " " for (j = 1; j < N2; j++) " //????? ?? i-? ??????, j - ?????? ?????? " { " " dP = mad(U * U[g_cols[edges + j]] , g_vals[edges + j] * native_cos(delta - delta[g_cols[edges + j]]) + b_vals[edges + j] * native_sin(delta - delta[g_cols[edges + j]]), dP); " " dQ = mad(U * U[g_cols[edges + j]] , g_vals[edges + j] * native_sin(delta - delta[g_cols[edges + j]]) - b_vals[edges + j] * native_cos(delta - delta[g_cols[edges + j]]), dQ); " " } " " dP -= P;" " dQ -= Q;" " dP = -dP; dQ=-dQ;" "} " "__kernel void JacobiForm( " " __constant int *N," " __global float *U," " __global float *delta," " __global float *g_vals," //2D " __global float *b_vals," //2D " __global int *g_cols," " __global int *edges," " __global float *dPdDelta," //2D " __global float *dPdU," //2D " __global float *dQdDelta," //2D " __global float *dQdU)" //2D "{" " int i;" //?????? (? ???) " int j;" //?????? ???? (??????? ?????) " int index_i;" //????????? ?? ??????, ??? ???????? ???????????? ???????  " i = get_global_id(0);" //????? ?????? ????? ????????????, global id " int N2 = edges[i+1] - edges;" //???????? ???-?? ????????? ??? ????????? ? ?????? // //?????????? ?????? ??? ????????????? ???????? " index_i = edges; " " dPdDelta[index_i] = 0.0;" " dPdU[index_i] = 2 * U * g_vals[index_i];" " dQdDelta[index_i] = 0.0;" " dQdU[index_i] = 2 * U * b_vals[index_i];" " for (j = 1; j < N2; j++)" //????? ?? i-? ??????, j - ?????? ??????? " {" " dPdDelta[index_i] = mad(U * U[g_cols[edges + j]], b_vals[edges + j] * native_cos(delta - delta[g_cols[edges + j]]) - g_vals[edges + j] * native_sin(delta - delta[g_cols[edges + j]]), dPdDelta[index_i]);" " dPdDelta[index_i + j] = -1 * U * U[g_cols[edges + j]] * (b_vals[edges + j] * native_cos(delta - delta[g_cols[edges + j]]) - g_vals[edges + j] * native_sin(delta - delta[g_cols[edges + j]]));" " dPdU[index_i] = mad(U[g_cols[edges + j]] , g_vals[edges + j] * native_cos(delta - delta[g_cols[edges + j]]) + b_vals[edges + j] * native_sin(delta - delta[g_cols[edges + j]]), dPdU[index_i]);" " dPdU[index_i + j] = U * (g_vals[edges + j] * native_cos(delta - delta[g_cols[edges + j]]) + b_vals[edges + j] * native_sin(delta - delta[g_cols[edges + j]]));" " dQdDelta[index_i] = mad(U * U[g_cols[edges + j]] , g_vals[edges + j] * native_cos(delta - delta[g_cols[edges + j]]) + b_vals[edges + j] * native_sin(delta - delta[g_cols[edges + j]]), dQdDelta[index_i]);" " dQdDelta[index_i + j] = -1 * U * U[g_cols[edges + j]] * (g_vals[edges + j] * native_cos(delta - delta[g_cols[edges + j]]) + b_vals[edges + j] * native_sin(delta - delta[g_cols[edges + j]]));" " dQdU[index_i] = mad(U[g_cols[edges + j]] , b_vals[edges + j] * native_cos(delta - delta[g_cols[edges + j]]) - g_vals[edges + j] * native_sin(delta - delta[g_cols[edges + j]]), dQdU[index_i]);" " dQdU[index_i + j] = -1 * U * (b_vals[edges + j] * native_cos(delta - delta[g_cols[edges + j]]) - g_vals[edges + j] * native_sin(delta - delta[g_cols[edges + j]]));" " }" " for (j = 0; j < N2; j++)" //????? ?? i-? ??????, j - ?????? ??????? " {" " dPdDelta[index_i + j] = -dPdDelta[index_i + j];" " dPdU[index_i + j] = -dPdU[index_i + j];" " dQdDelta[index_i + j] = -dQdDelta[index_i+j];" " dQdU[index_i + j] = -dQdU[index_i + j];" " }" " dQdU[index_i] = -dQdU[index_i];" "}" "__kernel void sAXPY_cond(" " __constant int *N," " __global float *a," //????? " __global float *x," //????. ?????? " __global int *xcols," //??????? ????. ??????? " __global float *y," //????????. ?????? " __global int *k)" //?????????? ??? ??????? "{" " float alpha = *a;" //??? ???? ??????? " int _k = *k;" //? ???...?? ????, ????? ?? ?????? ???? ????? ?????? ??????????????? " int i = get_global_id(0);" " if (xcols > _k)" " y[xcols] += (alpha * x);"//, y[xcols]);" "}"; #pragma endregion //*************non GPU********************** double norm2(int N, float *A) { double result = 0; #pragma omp parallel for schedule(runtime) reduction(+: result) for (int i = 0; i < N; i++) result += A * A; return sqrt(result); } double vecprod(int N, float *A, float *B) { double result = 0; #pragma omp parallel for schedule(runtime) reduction(+: result) for (int i = 0; i < N; i++) result += A * B; return result; } //****************************************** //?????????? ? ???????????? ? ?????????? ?? ???????? ??????? ???????? int NebalansCheck(int N, float *ndP, float *ndQ, std::map<int, NebalansData>& nebalansData) { double deltai = 0.0, deltai_1 = 0.0; int i = 0; int nodeType; double Usetval; //???????? ????????? ?????????? while (i < N) { //???????? ??? ?????? std::map<int,NebalansData>::iterator& it = nebalansData.find(i); //???? ? ????? ? ?????????? ?? ???????, ?? TIP_USEL_NO, ????? - ??? ???? ?? ??????? ???? nodeType = (it == nebalansData.end()) ? TIP_USEL_NO : it->second.nodeType; switch (nodeType) { case TIP_USEL_NO: { deltai += fabs(ndP); deltai += fabs(ndQ); break; } case TIP_USEL_BALANS: { //????????????? ???? ????? ?? ?????????????? (?) break; } case TIP_USEL_CONST: { double Qmax, Qmin; NebalansData& nodeData = it->second; if (ndQ >= nodeData.Qmax) nodeData.nodeType = TIP_USEL_QMAX; else if (ndQ <= nodeData.Qmin) nodeData.nodeType = TIP_USEL_QMIN; break; } case TIP_USEL_QMAX: { break; } case TIP_USEL_QMIN: { break; } default: deltai = INFINITE; //?????? ? ????? } i++; } deltai /= 2*N; return (deltai < geps); } EXPORT int CalcOnGPU(int N, Usl *st[]) { #pragma region init stuff int N1 = 0; //????? ???-?? ????????? ? ??????? st //cl_float normdU, normdDelta; cl_float *g_vals = (cl_float*) calloc(0, sizeof(cl_float)); cl_float *b_vals = (cl_float*) calloc(0, sizeof(cl_float)); cl_int *cols = (cl_int*) calloc(0, sizeof(cl_int)); /*?? ??????? 0 - ?????? 0 ?? ???????? 1..N-1 - ??????? ????????, ??? ?????????? ????????? ??? ?? ??????? N - ????? ???-?? ????????? ????? ??????? ??????????? ???-?? ????????? ??? i-????? ????? ????????? ???: edge[i+1] - edge; ?.?. ????????? ? edge ?? 1 ??????, ??? ?????? ? i-???? ???????? ????????? ???????? ?? vals ??????? ? edge ?????? */ cl_int *edges = new cl_int[N+1]; //?????? ??????? ?????????? cl_float *U = new cl_float; //?????? ??????? ????? cl_float *delta = new cl_float; cl_float *dU = new cl_float; cl_float *dDelta = new cl_float; //???????? ???????? ???????? cl_float *dP = new cl_float; //???????? ?????????? ???????? cl_float *dQ = new cl_float; cl_float *P = new cl_float; cl_float *Q = new cl_float; float* Ai; //????? ????? ? ??????? ??? ??????? ????????? std::map<int,NebalansData> nebalansData; Trdy *tmp; //????????? ????????? int _2N = 2 * N; for (int i = 0; i < N; i++) { //???? ??? ????????????? ???? if (st->Itipy == TIP_USEL_BALANS) { P = 0.0; Q = 0.0; nebalansData.insert(std::pair<int, NebalansData>(st->Nusl-1, NebalansData(st->Itipy, st->Qmax, st->Qmin, st->Umod))); } //???? ? ??????? ??????????? else if (st->Itipy == TIP_USEL_CONST) { P = -1 * st->Pg1; Q = 0; nebalansData.insert(std::pair<int, NebalansData>(st->Nusl-1, NebalansData(st->Itipy, st->Qmax, st->Qmin, st->Umod))); } //???? ??????? ???? else if (st->Itipy == TIP_USEL_NO) { P = st->Pn; Q = st->Qn; } else return ERR_DATA_PARSE_ERROR; edges = N1; U = st->V; delta = st->Delta; dDelta = st->DDelta; dU = st->DV; tmp = st->Nhp; while (tmp != NULL) { N1++; g_vals = (cl_float *) realloc(g_vals, N1 * sizeof(cl_float)); b_vals = (cl_float *) realloc(b_vals, N1 * sizeof(cl_float)); cols = (cl_int *) realloc(cols, N1 * sizeof(cl_int)); g_vals[N1-1] = tmp->Y.Re; b_vals[N1-1] = tmp->Y.Im; cols[N1-1] = tmp->Nqy-1; tmp = tmp->next; } } edges = N1; linkedlist * A = new linkedlist[_2N](); //????? ??????? ????? cl_float *dPdDelta = (cl_float *) calloc(N1, sizeof(cl_float)); cl_float *dPdU = (cl_float *) calloc(N1, sizeof(cl_float)); cl_float *dQdDelta = (cl_float *) calloc(N1, sizeof(cl_float)); cl_float *dQdU = (cl_float *) calloc(N1, sizeof(cl_float)); #pragma endregion #pragma region opencl prepare stuff cl_platform_id platform; clGetPlatformIDs(1, &platform, NULL); cl_device_id device; clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); cl_int res; cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &res); cl_command_queue queue = clCreateCommandQueue(context, device, NULL, &res); cl_program prog = clCreateProgramWithSource(context, 1, (const char **)&source, NULL, &res); if (res != 0) return ERR_CREATE_FROM_SOURCE; #pragma endregion res = clBuildProgram(prog, 1, &device, NULL, NULL, NULL); if (res != 0) return ERR_BUILD_PROGRAM; cl_kernel Nebalans = clCreateKernel(prog, "Nebalans", &res); if (res != 0) return ERR_CREATE_KERNEL; cl_kernel JacobiForm = clCreateKernel(prog, "JacobiForm", &res); if (res != 0) return ERR_CREATE_KERNEL; cl_kernel SAXPY = clCreateKernel(prog, "sAXPY_cond", &res); if (res != 0) return ERR_CREATE_KERNEL; cl_mem b_N, b_U, b_delta, b_dU, b_dDelta, b_g, b_b, b_cols, b_edges, b_P, b_Q, b_dP, b_dQ, b_dPdDelta, b_dPdU, b_dQdDelta, b_dQdU, b_a, b_x, b_xcols, b_y, b_k; int maxit = 2000; int iflag = 1, icount = 0; #pragma region "allocating memory for arguments..." b_N = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_int), NULL, &res); b_U = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(cl_float), NULL, &res); b_dU = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(cl_float), NULL, &res); b_delta = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(cl_float), NULL, &res); b_dDelta = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(cl_float), NULL, &res); b_g = clCreateBuffer(context, CL_MEM_READ_ONLY, N1 * sizeof(cl_float), NULL, &res); b_b = clCreateBuffer(context, CL_MEM_READ_ONLY, N1 * sizeof(cl_float), NULL, &res); b_cols = clCreateBuffer(context, CL_MEM_READ_ONLY, N1 * sizeof(cl_int), NULL, &res); b_edges = clCreateBuffer(context, CL_MEM_READ_ONLY, (N+1) * sizeof(cl_int), NULL, &res); b_P = clCreateBuffer(context, CL_MEM_READ_ONLY, N * sizeof(cl_float), NULL, &res); b_Q= clCreateBuffer(context, CL_MEM_READ_ONLY, N * sizeof(cl_float), NULL, &res); b_dP = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(cl_float), NULL, &res); b_dQ= clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(cl_float), NULL, &res); b_dPdDelta = clCreateBuffer(context, CL_MEM_READ_WRITE, N1 * sizeof(cl_float), NULL, &res); b_dPdU = clCreateBuffer(context, CL_MEM_READ_WRITE, N1 * sizeof(cl_float), NULL, &res); b_dQdDelta = clCreateBuffer(context, CL_MEM_READ_WRITE, N1 * sizeof(cl_float), NULL, &res); b_dQdU = clCreateBuffer(context, CL_MEM_READ_WRITE, N1 * sizeof(cl_float), NULL, &res); b_a = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_float), NULL, &res); b_y = clCreateBuffer(context, CL_MEM_READ_WRITE, _2N * sizeof(cl_float), NULL, &res); b_k = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_int), NULL, &res); #pragma endregion res = clSetKernelArg(Nebalans, 0, sizeof(cl_mem), (void*) &b_N); res|= clSetKernelArg(JacobiForm, 0, sizeof(cl_mem), (void*) &b_N); res|= clSetKernelArg(SAXPY, 0, sizeof(cl_mem), (void*) &b_N); res|= clEnqueueWriteBuffer(queue, b_N, CL_TRUE, 0,sizeof(cl_int), &N, NULL, NULL, NULL); #pragma region "Setting Nebalans arguments" res = clSetKernelArg(Nebalans, 1, sizeof(cl_mem), (void*) &b_U); res|= clSetKernelArg(Nebalans, 2, sizeof(cl_mem), (void*) &b_delta); res|= clSetKernelArg(Nebalans, 3, sizeof(cl_mem), (void*) &b_g); res|= clSetKernelArg(Nebalans, 4, sizeof(cl_mem), (void*) &b_b); res|= clSetKernelArg(Nebalans, 5, sizeof(cl_mem), (void*) &b_cols); res|= clSetKernelArg(Nebalans, 6, sizeof(cl_mem), (void*) &b_edges); res|= clSetKernelArg(Nebalans, 7, sizeof(cl_mem), (void*) &b_P); res|= clSetKernelArg(Nebalans, 8, sizeof(cl_mem), (void*) &b_Q); res|= clSetKernelArg(Nebalans, 9, sizeof(cl_mem), (void*) &b_dP); res|= clSetKernelArg(Nebalans, 10, sizeof(cl_mem), (void*) &b_dQ); #pragma endregion #pragma region "Setting JacobiForm arguments" res = clSetKernelArg(JacobiForm, 1, sizeof(cl_mem), (void*) &b_U); res|= clSetKernelArg(JacobiForm, 2, sizeof(cl_mem), (void*) &b_delta); res|= clSetKernelArg(JacobiForm, 3, sizeof(cl_mem), (void*) &b_g); res|= clSetKernelArg(JacobiForm, 4, sizeof(cl_mem), (void*) &b_b); res|= clSetKernelArg(JacobiForm, 5, sizeof(cl_mem), (void*) &b_cols); res|= clSetKernelArg(JacobiForm, 6, sizeof(cl_mem), (void*) &b_edges); res|= clSetKernelArg(JacobiForm, 7, sizeof(cl_mem), (void*) &b_dPdDelta); res|= clSetKernelArg(JacobiForm, 8, sizeof(cl_mem), (void*) &b_dPdU); res|= clSetKernelArg(JacobiForm, 9, sizeof(cl_mem), (void*) &b_dQdDelta); res|= clSetKernelArg(JacobiForm, 10, sizeof(cl_mem), (void*) &b_dQdU); #pragma endregion #pragma region "Setting Saxpy arguments" res|= clSetKernelArg(SAXPY, 1, sizeof(cl_mem), (void*) &b_a); res|= clSetKernelArg(SAXPY, 4, sizeof(cl_mem), (void*) &b_y); res|= clSetKernelArg(SAXPY, 5, sizeof(cl_mem), (void*) &b_k); #pragma endregion #pragma region "Copying data to device" res = clEnqueueWriteBuffer(queue, b_N, CL_TRUE, 0,sizeof(cl_int), &N, NULL, NULL, NULL); res|= clEnqueueWriteBuffer(queue, b_g, CL_TRUE, 0, N1 * sizeof(cl_float), g_vals, NULL, NULL, NULL); res|= clEnqueueWriteBuffer(queue, b_b, CL_TRUE, 0, N1 * sizeof(cl_float), b_vals, NULL, NULL, NULL); res|= clEnqueueWriteBuffer(queue, b_cols, CL_TRUE, 0, N1 * sizeof(cl_int), cols, NULL, NULL, NULL); res|= clEnqueueWriteBuffer(queue, b_edges, CL_TRUE, 0, (N+1) * sizeof(cl_int), edges, NULL, NULL, NULL); res|= clEnqueueWriteBuffer(queue, b_P, CL_TRUE, 0, N * sizeof(cl_float), P, 0, NULL, NULL); res|= clEnqueueWriteBuffer(queue, b_Q, CL_TRUE, 0, N * sizeof(cl_float), Q, 0, NULL, NULL); res|= clEnqueueWriteBuffer(queue, b_dPdDelta, CL_TRUE, 0, N1 * sizeof(cl_float), dPdDelta, 0, NULL, NULL); res|= clEnqueueWriteBuffer(queue, b_dQdDelta, CL_TRUE, 0, N1 * sizeof(cl_float), dQdDelta, 0, NULL, NULL); res|= clEnqueueWriteBuffer(queue, b_dPdU, CL_TRUE, 0, N1 * sizeof(cl_float), dPdU, 0, NULL, NULL); res|= clEnqueueWriteBuffer(queue, b_dQdU, CL_TRUE, 0, N1 * sizeof(cl_float), dQdU, 0, NULL, NULL); #pragma endregion size_t global_size[1];// = {N}; cl_event Nebalans_event, JacobiForm_event; int itercount = 0; //???????? ???? do{ ++itercount; #pragma region Nebalans stuff global_size[0] = N; res = clEnqueueWriteBuffer(queue, b_U, CL_TRUE, 0, N * sizeof(cl_float), U, NULL, NULL, NULL); res = clEnqueueWriteBuffer(queue, b_delta, CL_TRUE, 0, N * sizeof(cl_float), delta, NULL, NULL, NULL); res = clEnqueueNDRangeKernel( queue, Nebalans, 1, 0, global_size, NULL, 0, NULL, &Nebalans_event); clFinish(queue); #pragma endregion #pragma region JacobiForm stuff res = clEnqueueNDRangeKernel( queue, JacobiForm, 1, 0, global_size, NULL, 1, &Nebalans_event, &JacobiForm_event); res = clEnqueueReadBuffer(queue, b_dP, CL_TRUE, 0, N * sizeof(cl_float), dP,1, &Nebalans_event, NULL); res = clEnqueueReadBuffer(queue, b_dQ, CL_TRUE, 0, N * sizeof(cl_float), dQ,1, &Nebalans_event, NULL); res = clEnqueueReadBuffer(queue, b_dPdDelta, CL_TRUE, 0, N1 * sizeof(cl_float), dPdDelta, 1, &JacobiForm_event, NULL); res = clEnqueueReadBuffer(queue, b_dPdU, CL_TRUE, 0, N1 * sizeof(cl_float), dPdU, 1, &JacobiForm_event, NULL); res = clEnqueueReadBuffer(queue, b_dQdDelta, CL_TRUE, 0, N1 * sizeof(cl_float), dQdDelta, 1, &JacobiForm_event, NULL); res = clEnqueueReadBuffer(queue, b_dQdU, CL_TRUE, 0, N1 * sizeof(cl_float), dQdU, 1, &JacobiForm_event, NULL); int ii = 0, cnt = 0; A[ii].Clear(); A[ii + N].Clear(); #pragma endregion #pragma region "Removing generators and systems" /*!*/std::ofstream f1; /*!*/if (itercount == 1) /*!*/f1.clear(); /*!*/f1.open("f1", std::ios_base::app); /*!*/for (int i = 0; i < N; i++) /*!*/f1 << "dP[" << i <<"]=" << dP << "\ndQ["<<i<<"]="<<dQ<<"\n"; /*!*/f1 << "\n___\n"; /*!*/f1.close(); //??????? ??????? (map) std::map<int,NebalansData>::iterator it; for(int i=0; i < N1; i++) { //if system than all 0 it = nebalansData.find(ii); if (it != nebalansData.end()) { if (it->second.nodeType == TIP_USEL_BALANS) dPdU = dPdDelta = dQdU = dQdDelta = dP[ii] = dQ[ii] = 0.0; //if row of generator then if (it->second.nodeType == TIP_USEL_CONST) { dQdDelta = 0.0; dQdU = 0.0; dQ[ii] = 0.0; } } it = nebalansData.find(cols); if (it != nebalansData.end()) { if (it->second.nodeType == TIP_USEL_BALANS) dPdU = dPdDelta = dQdU = dQdDelta = 0.0; //if column of generator then if (it->second.nodeType == TIP_USEL_CONST) { dPdU = 0.0; dQdU = 0.0; } } A[ii].AppendItem(dPdDelta, cols); A[ii].AppendItem(dPdU, cols, N); A[ii+N].AppendItem(dQdDelta, cols); A[ii+N].AppendItem(dQdU, cols, N); cnt++; if ((edges[ii+1] - edges[ii]) == cnt){ ii++; if (ii < N) { A[ii].Clear(); A[ii + N].Clear(); } cnt = 0; } } #pragma endregion #pragma region "LU factorization" cl_float * tempVal; cl_int * tempCol; int pckdArrSize; float l, *aik, *akk; for (int i = 1; i < _2N; i++) { //if row is empty cpntinue if (A.count == 0) continue; for (int k = 0; k < i; k++) { /* ???? ??? ??? k-?? ???????????? ????????? ???????? ??????? - A - ??? ?? ???? ???? ?? ????????? - A - ??????? (????? ????? ???? ?????? ???? ????????? ?????????? (??????? ??? ?????????) ??? ??? ?????? ??????????? ?????, ?? ?? ???????, ??? ????? ??????? ?????????) ?? ?????????? ???????? ? ????????? ? ????????? */ aik = A; akk = A; //if at least 1 cell is NULL (see [] implementation) if (!(akk && aik)) continue; //skip iter //this is l[i,k] l = *aik / *akk; *aik = l; l *= -1; Ai = A.GetUnpackedArray(_2N); tempVal = new float[A.count](); tempCol = new int[A.count](); pckdArrSize = A.GetPackedArray(tempVal, tempCol); global_size[0] = pckdArrSize; b_x = clCreateBuffer(context, CL_MEM_READ_WRITE, pckdArrSize * sizeof(cl_float), NULL, &res); b_xcols = clCreateBuffer(context, CL_MEM_READ_WRITE, pckdArrSize * sizeof(cl_int), NULL, &res); //////alas,it can't be higher, since first - mem alloc, then - set kernel arg. No alternative (explored experimentally) res|= clSetKernelArg(SAXPY, 2, sizeof(cl_mem), (void*) &b_x); res|= clSetKernelArg(SAXPY, 3, sizeof(cl_mem), (void*) &b_xcols); //res|= clEnqueueWriteBuffer(queue, b_a, CL_TRUE, 0, sizeof(cl_float), &l, NULL, NULL, NULL); //res|= clEnqueueWriteBuffer(queue, b_x, CL_TRUE, 0, pckdArrSize * sizeof(cl_float), tempVal, NULL, NULL, NULL); //res|= clEnqueueWriteBuffer(queue, b_xcols, CL_TRUE, 0, pckdArrSize * sizeof(cl_int), tempCol, NULL, NULL, NULL); //res|= clEnqueueWriteBuffer(queue, b_y, CL_TRUE, 0, _2N * sizeof(cl_float), Ai, NULL, NULL, NULL); //res|= clEnqueueWriteBuffer(queue, b_k, CL_TRUE, 0, sizeof(cl_int), &k, NULL, NULL, NULL); //res|= clEnqueueNDRangeKernel( queue, SAXPY, 1, 0, global_size, NULL, 0, NULL, NULL); //clEnqueueReadBuffer(queue, b_y, CL_TRUE, 0, _2N * sizeof(cl_float), Ai, 0, NULL, NULL); clReleaseMemObject(b_x); clReleaseMemObject(b_xcols); A.PackArray(_2N, Ai); delete[] Ai; //memset(Ai, 0, _2N * sizeof(float)); //memset(Ak, 0, _2N * sizeof(float)); } } #pragma endregion float *y = new float[_2N](); //float *x = new float[_2N](); y[0] = dP[0]; linkedlistitem *t; for (int i = 1; i < _2N; i++) { y = (i < N) ? dP : dQ[i-N]; //no elements, no pain if (A.count == 0) continue; t = A.root; while (t) { //think about: column sort and trim all above i if (t->col < i) y -= t->value*y[t->col]; t = t->next; } } if (A[_2N-1](_2N-1) == 0) dQ[_2N-1] = 0; else dQ[_2N-1] = y[_2N-1] / A[_2N-1](_2N-1); float* p, aii, xi; //????????? ??? ?????? ??????? ?????????? for (int i = _2N-2; i >= 0; i--) { int idx; //?????? ??? ??????? ????????? //???????? ?????? ?????????? ? ??????? ????????? if (i < N) { p = dDelta; idx = i; } else { p = dU; idx = i - N; } t = A.root; while (t) { if (t->col > i) { if (t->col < N) xi = dDelta[t->col]; else xi = dU[t->col - N]; p[idx] += t->value * xi; } t = t->next; } aii = A(i); if (aii == 0) p[idx] = 0; else p[idx] = (y - p[idx]) / aii; } //???????????????? ??? ????????? ?????? ????? ???????? #pragma omp parallel for schedule (runtime) for (int i = 0; i < N; i++) { U -= dU; delta-= dDelta; } iflag = 1; icount = 0; }while (!NebalansCheck(N, dP, dQ, nebalansData) && itercount < 5); //????????? ????????? ? ?????????? ?????? ???????? (????? ????????????? ??? ?????????) #pragma omp parallel for schedule (runtime) for (int i = 0; i < N; i++) { st->V = U; st->Delta = delta; st->DV = dU; st->DDelta = dDelta; } delete[] Q; delete[] P; //??? ??????? clReleaseMemObject(b_N); clReleaseMemObject(b_U); clReleaseMemObject(b_delta); clReleaseMemObject(b_dU); clReleaseMemObject(b_dDelta); clReleaseMemObject(b_g); clReleaseMemObject(b_b); clReleaseMemObject(b_cols); clReleaseMemObject(b_edges); clReleaseMemObject(b_P); clReleaseMemObject(b_Q); clReleaseMemObject(b_dQ); clReleaseMemObject(b_dP); clReleaseMemObject(b_dPdDelta); clReleaseMemObject(b_dPdU); clReleaseMemObject(b_dQdDelta); clReleaseMemObject(b_dQdU); clReleaseProgram(prog); clReleaseKernel(Nebalans); clReleaseKernel(JacobiForm); clReleaseKernel(SAXPY); clReleaseCommandQueue(queue); clReleaseContext(context); delete[] edges; delete[] U; delete[] delta; delete[] dU; delete[] dDelta; delete[] dP; delete[] dQ; free(g_vals); free(b_vals); free(cols); free(dPdDelta); free(dPdU); free(dQdDelta); free(dQdU); return 0; }

nou · ‎01-18-2012

poor performance i comparsion to what. what is the difference? two times slower ten times?

dovalec · ‎01-23-2012

Hi

Please contact Yousef.Shajrawi@amd.com

Please send him your code (kernel and c++)

Dov Caspi

dov.caspi@amd.com

Archives Discussions

poor OpenCL performance