Archives Discussions

dinaharchery · ‎09-21-2009

Simple Newbie Question

I know this is a simple question, but I can't seem to find info about it. Can someone please help?

I am trying to compile a cpp file that will reference a compiled brook file. The problem is with the data I am sending to the compiled Brook files (i.e., the CPP and header files). The code is attached.

The compiler prints out the following (where Main.cpp is the main file):

1>.\Main.cpp(12) : error C2143: syntax error : missing ';' before '<'

#include <brook/stream.h>

using namespace brook;

// Minimal reproduction posted with the question: declares a Brook+ stream
// inside an ordinary C++ translation unit.
int main(){
    int m = 3;
    int n = 3;
    int p = 2;

    // Brook+ stream declaration syntax. This is the statement the compiler
    // rejects with C2143 ("missing ';' before '<'") when the file is built
    // as plain C++; it is kept as posted because it is the subject of the
    // question.
    float S_m1<m, n>;

    return 0;
}

Ceq · ‎09-21-2009

You are trying to use templates the wrong way. I think you were trying to write something like this:

#include <brook/stream.h>

using namespace brook;

// Sketch of the plain-C++ way to create a Brook+ stream and fill it from host
// memory. The "..." lines are elisions left by the poster, not code.
int main(){
    // Matrix extents; p is declared but not used in the visible part.
    int m = 3;
    int n = 3;
    int p = 2;

    // Host-side source data: 9 values, i.e. m * n elements.
    float data[] = { 1, 2, 3, 1, 2, 3, 1, 2, 3};
    ...

    // Extents handed to the Stream constructor.
    // NOTE(review): presumably only the first two entries are used because
    // the rank argument below is 2 -- confirm against the Brook+ docs.
    unsigned int dim[] = {m, n, 1, 1};
    // NOTE(review): first argument looks like the stream rank, second the
    // per-dimension sizes -- confirm.
    Stream<float> S_m1(2, dim);

    // Presumably copies the host array into the stream -- TODO confirm.
    S_m1.read(data);

    ...

    return 0;
}

dinaharchery · ‎09-22-2009

Thanks for the reply

So the code examples in the samples for Brook+ are not valid? I got the idea from the "simple_matmult.br" in $(BROOKROOT)/samples/legacy/apps/simple_matmult/

I attached the relevant code from the sample. Is there a switch or header file that I am missing?

Thanks again.

/////////////////////////////////////////////////////////////////////////
// Brook code block
/////////////////////////////////////////////////////////////////////////
{
    // Stream declarations in Brook+ syntax (only valid inside a .br file).
    float A<Height, Width>;
    float B<Width, Height>;
    float C<Height, Height>;

    // Record GPU Total time
    Start(0);
    for (i = 0; i < cmd.Iterations; ++i)
    {
        // Write to stream
        streamRead(A, inputA);
        streamRead(B, inputB);

        // Run the brook program
        simple_matmult((float)Width, A, B, C);

        // Write data back from stream
        streamWrite(C, output);
    }
    Stop(0);
}

gaurav_garg · ‎09-22-2009

This syntax is an extension to C in the Brook+ language, and hence it must be part of a .br file and compiled using brcc.

In case you want to use the same thing in a C++ file, you need to change your code as suggested by Ceq.

dinaharchery · ‎09-22-2009

Thank you.

Using the above advice I have created a simple matrix-matrix multiplication code (derived from various sources, some from this forum), but I am getting odd results compared to the CPU version. Can anyone help? I want to do the matrix multiplication without any sort of reduction, since I found this to be a big performance issue.

I have attached all relevant code (sorry for the length), where the kernel code is compiled from a brook file and linked to the C++ code.

Input A Matrix:

1 2 3

4 5 6

7 8 9

Input B Matrix:

1 2

3 4

5 6

CPU-Based Results (correct - checked with calculator):

22 28

49 64

76 100

GPU-Based Results:

6 6

1.311e+017 -3.86e-034

// ============================================================================
// Brook+ kernel file (.br). The listing was posted with its line breaks lost;
// they are restored here. Kernel bodies are unchanged.
// ============================================================================
#include "common.h"
#include "Timer.h"

// Writes to each output element the input element fetched with swapped
// (x, y) instance indices.
kernel void transposeGPU(float i[][], out float o<>)
{
    // Get the (x,y) position of o in (index.x, index.y)
    // instance().x is column and instance().y is row
    int2 index = instance().xy;

    // Fetch a value from (y,x)
    // C - style indexing. index.x is row and index.y is column number
    o = i[index.x][index.y];
}

// Accumulates the elements of x into result.
reduce void kernel_reduce(float x<>, reduce float result<>)
{
    result += x;
}

// Element-wise product: c = a * b.
kernel void kernel_mul(float a<>, float b<>, out float c<>)
{
    c = a * b;
}

// Element-wise copy: b = a.
kernel void kernel_copy(float a<>, out float b<>)
{
    b = a;
}

// ============================================================================
// C++ host file.
// Array subscripts ([i], [j], [k], [m], [n], [p]) were missing from the posted
// listing and are restored; row strides of the flat buffers are corrected.
// ============================================================================
#include <cstdio>
#include <tchar.h>
#include <iostream>
#include <string>
#include <fstream>
#include <exception>
#include <vector>
#include <sstream>
#include <math.h>
#include <limits>
#include <iomanip>
#include <time.h>
#include <conio.h>

// Brook+ libraries:
#include <brook/stream.h>

// Brook+ user functions (files generated by
// Brook+ compiler):
#include "Timer.h"
#include "common.h"
#include "GPUMatrixVector.h"

// Brook Namespace Declaration:
using namespace brook;
using namespace std;

// Multiplies an m x n matrix by an n x p matrix twice -- once through the
// Brook+ kernels and once with a plain triple loop on the CPU -- then prints
// both timings and both results for comparison.
int main()
{
    unsigned int m = 3;
    unsigned int n = 3;
    unsigned int p = 2;
    unsigned int i, j, k, cnt;
    double cpuTime = 0.0, gpuTime = 0.0;

    // Stream Sizes:
    unsigned int mat1Size = m*n;
    unsigned int mat2Size = n*p;
    unsigned int resultSize = m*p;

    // GPU System Memory:
    Stream<float> S_m1(1, &mat1Size);
    Stream<float> S_m2(1, &mat2Size);
    Stream<float> S_trans_m1(1, &mat1Size);
    Stream<float> S_trans_m2(1, &mat2Size);
    Stream<float> S_temp(1, &mat1Size);
    Stream<float> S_tempreduce(1, &m);
    Stream<float> S_result(1, &resultSize);
    Stream<float> S_realresult(1, &resultSize);

    // CPU System Memory (zero-initialised; vectors release themselves, so no
    // manual delete[] section is needed at the end of main):
    std::vector<float> m1input(mat1Size, 0.0f);       // flat row-major m x n
    std::vector<float> m2input(mat2Size, 0.0f);       // flat row-major n x p
    std::vector<float> resultOutput(resultSize, 0.0f); // flat row-major m x p
    std::vector< std::vector<float> > mat1(m, std::vector<float>(n, 0.0f));
    std::vector< std::vector<float> > mat2(n, std::vector<float>(p, 0.0f));
    std::vector< std::vector<float> > matr(m, std::vector<float>(p, 0.0f));

    // Load Input Matrices with Data:
    // The row stride of a flat row-major buffer is its column count. The
    // posted code used m and n as strides, which for the n x p matrix wrote
    // past the end of the 6-element m2input buffer.
    cnt = 0;
    for (i = 0; i < m; i++) {
        for (j = 0; j < n; j++) {
            m1input[n*i + j] = static_cast<float>(cnt + 1);
            mat1[i][j] = static_cast<float>(cnt + 1);
            cnt++;
        }
    }
    cnt = 0;
    for (i = 0; i < n; i++) {
        for (j = 0; j < p; j++) {
            m2input[p*i + j] = static_cast<float>(cnt + 1);
            mat2[i][j] = static_cast<float>(cnt + 1);
            cnt++;
        }
    }

    /////////////////////////////////////////////////////////////////////////
    // Setup the timers                                                    //
    //      0 = GPU Total Time                                             //
    //      1 = CPU Total Time                                             //
    /////////////////////////////////////////////////////////////////////////
    Setup(0);
    Setup(1);

    // CPU->GPU:
    S_m1.read(&m1input[0]);
    S_m2.read(&m2input[0]);

    //////////////////////////////////////////////////////////////////
    // GPU Matrix Multiplication - START:                           //
    //////////////////////////////////////////////////////////////////
    // NOTE(review): the kernel call sequence below is unchanged from the
    // post. Every stream above is constructed with 1 as its first argument,
    // while transposeGPU indexes its input with two subscripts and domain()
    // is given int2 corners. That mismatch is presumably involved in the
    // wrong GPU output reported -- confirm against the Brook+ Stream docs.
    Start(0);
    transposeGPU(S_m1, S_trans_m1);
    for (i = 0; i < p; i++) {
        j = i + 1;
        kernel_mul(S_trans_m1, S_m2.domain(int2(i, 0), int2(j, n)), S_temp);
        kernel_reduce(S_temp, S_tempreduce);
        kernel_copy(S_tempreduce, S_result.domain(int2(0, i), int2(m, j)));
    }
    transposeGPU(S_result, S_realresult);
    Stop(0);
    gpuTime = GetElapsedTime(0);
    //////////////////////////////////////////////////////////////////
    // GPU Matrix Multiplication - END                              //
    //////////////////////////////////////////////////////////////////

    // GPU->CPU:
    S_realresult.write(&resultOutput[0]);

    //////////////////////////////////////////////////////////////////
    // CPU Matrix Multiplication - START:                           //
    //////////////////////////////////////////////////////////////////
    Start(1);
    for (i = 0; i < m; i++) {
        for (j = 0; j < p; j++) {
            matr[i][j] = 0.0f;
            for (k = 0; k < n; k++)
                matr[i][j] = matr[i][j] + mat1[i][k] * mat2[k][j];
        }
    }
    Stop(1);
    cpuTime = GetElapsedTime(1);
    //////////////////////////////////////////////////////////////////
    // CPU Matrix Multiplication - END                              //
    //////////////////////////////////////////////////////////////////

    // Display Results:
    cout << "\nTotal GPU Time: " << gpuTime << endl;
    cout << "\nTotal CPU Time: " << cpuTime << endl;

    cout << "\n\nGPU Final Result:\n";
    cout << "---------------------------------\n\n";
    for (i = 0; i < m; i++) {
        for (j = 0; j < p; j++)
            // Stride p: the result is m x p. The posted stride m read past
            // the end of the 6-element buffer.
            cout << resultOutput[p*i + j] << "\t";
        cout << endl;
    }

    cout << "\n\nCPU Final Result:\n";
    cout << "---------------------------------\n\n";
    for (i = 0; i < m; i++) {
        for (j = 0; j < p; j++)
            cout << matr[i][j] << "\t";
        cout << endl;
    }

    return 0;
}

Archives Discussions

StreamRead and StreamWrite Question