4 Replies Latest reply on Sep 22, 2009 12:01 PM by dinaharchery

    StreamRead and StreamWrite Question

    dinaharchery
      Simple Newbie Question

      I know this is a simple question but I can't seem to find info about it, can someone please help?

      I am trying to compile a cpp file that will reference a compiled brook file. The problem is with the data I am sending to the compiled Brook files (i.e., the generated CPP and header files). The code is attached.

      The compiler prints out the following (where Main.cpp is the main file):

       

      1>.\Main.cpp(12) : error C2143: syntax error : missing ';' before '<'



      #include <brook/stream.h> using namespace brook; int main(){ int m = 3; int n = 3; int p = 2; float S_m1<m, n>; /* Brook+ stream-declaration syntax, not C++: presumably the statement behind error C2143 above; per the replies it is only accepted by brcc inside a .br file */ return 0; }

        • StreamRead and StreamWrite Question
          Ceq

          You are trying to use templates in the wrong way; I think you were trying to write something like this:

          // Host-side C++ sketch: builds the stream through the Stream<T> class
          // rather than the Brook+ `float S<m, n>` declaration syntax.
          // Lines consisting only of `...` stand for elided code.
          #include <brook/stream.h>

          using namespace brook;

          int main(){
              int m = 3;
              int n = 3;
              int p = 2;
             
              // Nine host values, i.e. m*n elements for the stream below.
              float data[] = { 1, 2, 3, 1, 2, 3, 1, 2, 3};
              ...
             
              // Extents handed to the stream constructor; the first argument (2)
              // is presumably the rank, so only m and n would be used -- confirm
              // against the Brook+ Stream API.
              unsigned int dim[] = {m, n, 1, 1};
              Stream<float> S_m1(2, dim);
             
              // Load the host array into the stream.
              S_m1.read(data);
             
              ...
            
              return 0;
          }

            • StreamRead and StreamWrite Question
              dinaharchery

              Thanks for the reply

              So the code examples in the samples for Brook+ are not valid? I got the idea from the "simple_matmult.br" in $(BROOKROOT)/samples/legacy/apps/simple_matmult/

              I attached the relevant code from the sample. Is there a switch that I am missing or header file?

              Thanks again.

              ///////////////////////////////////////////////////////////////////////// // Brook code block ///////////////////////////////////////////////////////////////////////// { float A<Height, Width>; float B<Width, Height>; float C<Height, Height>; // Record GPU Total time Start(0); for (i = 0; i < cmd.Iterations; ++i) { // Write to stream streamRead(A, inputA); streamRead(B, inputB); // Run the brook program simple_matmult((float)Width, A, B, C); // Write data back from stream streamWrite(C, output); } Stop(0); }

                • StreamRead and StreamWrite Question
                  gaurav.garg

                  This syntax is an extension to C in the Brook+ language, and hence it must be part of a .br file and compiled using brcc.

                  In case you want to use the same thing in a C++ file, you need to change your code as mentioned by Ceq.

                    • StreamRead and StreamWrite Question
                      dinaharchery

                      Thank you.

                      Using the above advice I have created a simple matrix-matrix multiplication code (derived from various sources, some from this forum), but I am getting odd results when comparing to the CPU version. Can anyone help?  I want to do the matrix multiplication without any sort of reduction, since I found this to be a big performance issue.

                      I have attached all relevant code (sorry for the length), where the kernel code is compiled from a brook file and linked to the C++ code.

                      Input A Matrix:

                      1  2  3

                      4  5  6

                      7  8  9

                      Input B Matrix:

                      1  2

                      3  4

                      5  6

                      CPU-Based Results (correct - checked with calculator):

                      22  28

                      49  64

                      76  100

                      GPU-Based Results:

                      6                  6

                      6                  6

                      1.311e+017  -3.86e-034

                      // ===== Brook+ kernel source (.br file, compiled with brcc) =====
                      #include "common.h"
                      #include "Timer.h"

                      // Gather kernel: every output element o reads one element of i,
                      // addressed by the output element's own 2-D position.
                      kernel void transposeGPU(float i[][], out float o<>)
                      {
                          // Get the (x,y) position of o in (index.x, index.y)
                          // instance().x is column and instance().y is row
                          int2 index = instance().xy;

                          // Fetch a value from (y,x)
                          // C - style indexing. index.x is row and index.y is column number
                          o = i[index.x][index.y];
                      }

                      // Reduction kernel: accumulates the elements of x into result by addition.
                      reduce void kernel_reduce(float x<>, reduce float result<>)
                      {
                          result += x;
                      }

                      // Element-wise product of two streams.
                      kernel void kernel_mul(float a<>, float b<>, out float c<>)
                      {
                          c = a * b;
                      }

                      // Element-wise copy of a into b.
                      kernel void kernel_copy(float a<>, out float b<>)
                      {
                          b = a;
                      }

                      // ===== Host C++ source =====
                      #include <cstdio>
                      #include <tchar.h>
                      #include <iostream>
                      #include <string>
                      #include <fstream>
                      #include <exception>
                      #include <vector>
                      #include <sstream>
                      #include <math.h>
                      #include <limits>
                      #include <iomanip>
                      #include <time.h>
                      #include <conio.h>

                      // Brook+ libraries:
                      #include <brook/stream.h>

                      // Brook+ user functions (files generated by
                      // Brook+ compiler):
                      #include "Timer.h"
                      #include "common.h"
                      #include "GPUMatrixVector.h"

                      // Brook Namespace Declaration:
                      using namespace brook;
                      using namespace std;

                      // Multiplies an m x n matrix A by an n x p matrix B twice -- once through the
                      // Brook+ kernels and once with a plain CPU triple loop -- then prints both
                      // results and the elapsed time of each path.
                      int main()
                      {
                          // Matrix shapes: A is m x n, B is n x p, the product is m x p.
                          unsigned int m = 3;
                          unsigned int n = 3;
                          unsigned int p = 2;
                          unsigned int i, j, k, cnt;
                          double cpuTime = 0.0, gpuTime = 0.0;

                          // Stream Sizes:
                          unsigned int mat1Size = m*n;
                          unsigned int mat2Size = n*p;
                          unsigned int resultSize = m*p;

                          // GPU System Memory:
                          // NOTE(review): every stream is created with rank 1, yet transposeGPU
                          // gathers with 2-D indices and the domain() calls below take int2
                          // corners. That mismatch is a likely cause of the wrong GPU values
                          // (all 6s) -- confirm against the Brook+ Stream documentation.
                          Stream<float> S_m1(1, &mat1Size);
                          Stream<float> S_m2(1, &mat2Size);
                          Stream<float> S_trans_m1(1, &mat1Size);
                          Stream<float> S_trans_m2(1, &mat2Size);
                          Stream<float> S_temp(1, &mat1Size);
                          Stream<float> S_tempreduce(1, &m);
                          Stream<float> S_result(1, &resultSize);
                          Stream<float> S_realresult(1, &resultSize);

                          // CPU System Memory:
                          // Flat row-major buffers exchanged with the streams ...
                          float *m1input;
                          float *m2input;
                          float *resultOutput;
                          // ... and 2-D row-pointer copies used by the CPU reference path.
                          float **mat1 = 0;
                          float **mat2 = 0;
                          float **matr = 0;

                          m1input = new float[mat1Size];
                          for(i = 0; i < mat1Size; i++)
                              m1input[i] = 0.0f;

                          m2input = new float[mat2Size];
                          for(i = 0; i < mat2Size; i++)
                              m2input[i] = 0.0f;

                          resultOutput = new float[resultSize];
                          for(i = 0; i < resultSize; i++)
                              resultOutput[i] = 0.0f;

                          mat1 = new float*[m];
                          for(i = 0 ; i < m; i++)
                              mat1[i] = new float[n];

                          mat2 = new float*[n];
                          for(i = 0; i < n; i++)
                              mat2[i] = new float[p];

                          matr = new float*[m];
                          for(i = 0; i < m; i++)
                              matr[i] = new float[p];

                          // Load Input Matrices with Data:
                          // Row-major flat index is (row * columns + column). A has n columns,
                          // so its stride is n (the original used m, which only worked because
                          // m == n here).
                          cnt = 0;
                          for(i = 0; i < m; i++){
                              for(j = 0; j < n; j++){
                                  m1input[n*i + j] = static_cast<float>(cnt + 1);
                                  mat1[i][j] = static_cast<float>(cnt + 1);
                                  cnt++;
                              }
                          }

                          // B has p columns, so its stride is p. The original stride of n wrote
                          // past the end of the n*p buffer whenever n > p.
                          cnt = 0;
                          for(i = 0; i < n; i++){
                              for(j = 0; j < p; j++){
                                  m2input[p*i + j] = static_cast<float>(cnt + 1);
                                  mat2[i][j] = static_cast<float>(cnt + 1);
                                  cnt++;
                              }
                          }

                          /////////////////////////////////////////////////////////////////////////
                          // Setup the timers                                                    //
                          // 0 = GPU Total Time                                                  //
                          // 1 = CPU Total Time                                                  //
                          /////////////////////////////////////////////////////////////////////////
                          Setup(0);
                          Setup(1);

                          // CPU->GPU:
                          S_m1.read(m1input);
                          S_m2.read(m2input);

                          //////////////////////////////////////////////////////////////////
                          // GPU Matrix Multiplication - START:                           //
                          //////////////////////////////////////////////////////////////////
                          Start(0);
                          transposeGPU(S_m1, S_trans_m1);
                          // One pass per column of B: multiply, reduce, then store the reduced
                          // values into the matching slice of S_result.
                          for(i = 0; i < p; i++){
                              j = i + 1;
                              kernel_mul(S_trans_m1, S_m2.domain(int2(i, 0), int2(j, n)), S_temp);
                              kernel_reduce(S_temp, S_tempreduce);
                              kernel_copy(S_tempreduce, S_result.domain(int2(0, i), int2(m, j)));
                          }
                          transposeGPU(S_result, S_realresult);
                          Stop(0);
                          gpuTime = GetElapsedTime(0);
                          //////////////////////////////////////////////////////////////////
                          // GPU Matrix Multiplication - END                              //
                          //////////////////////////////////////////////////////////////////

                          // GPU->CPU:
                          S_realresult.write(resultOutput);

                          //////////////////////////////////////////////////////////////////
                          // CPU Matrix Multiplication - START:                           //
                          //////////////////////////////////////////////////////////////////
                          Start(1);
                          for(i = 0; i < m; i++){
                              for(j = 0; j < p; j++){
                                  matr[i][j] = 0.0f;
                                  for(k = 0; k < n; k++)
                                      matr[i][j] = matr[i][j] + mat1[i][k] * mat2[k][j];
                              }
                          }
                          Stop(1);
                          cpuTime = GetElapsedTime(1);
                          //////////////////////////////////////////////////////////////////
                          // CPU Matrix Multiplication - END                              //
                          //////////////////////////////////////////////////////////////////

                          // Display Results:
                          cout << "\nTotal GPU Time: " << gpuTime << endl;
                          cout << "\nTotal CPU Time: " << cpuTime << endl;

                          cout << "\n\nGPU Final Result:\n";
                          cout << "---------------------------------\n\n";
                          for(i = 0; i < m; i++){
                              // The result has p columns, so the stride is p. The original stride
                              // of m read beyond the m*p buffer and printed uninitialized memory
                              // for the last row.
                              for(j = 0; j < p; j++)
                                  cout << resultOutput[p*i + j] << "\t";
                              cout << endl;
                          }

                          cout << "\n\nCPU Final Result:\n";
                          cout << "---------------------------------\n\n";
                          for(i = 0; i < m; i++){
                              for(j = 0; j < p; j++)
                                  cout << matr[i][j] << "\t";
                              cout << endl;
                          }

                          // Release host buffers.
                          delete [] resultOutput;
                          delete [] m1input;
                          delete [] m2input;

                          for(i = 0; i < m; i++)
                              delete [] mat1[i];
                          delete [] mat1;

                          for(i = 0; i < n; i++)
                              delete [] mat2[i];
                          delete [] mat2;

                          for(i = 0; i < m; i++)
                              delete [] matr[i];
                          delete [] matr;

                          return 0;
                      }