cancel
Showing results for 
Search instead for 
Did you mean: 

Archives Discussions

ankurdh
Journeyman III

Program runs on CPU but not GPU

hello, i recently wrote a program that will create a 100 kernels, each of which will try to match a pattern against one character stream given. 

The program runs when the device type is CPU. But when i make the device type GPU, it gives me garbage values. 

I've attached the code. Plz, can anyone figure out what is going wrong?

#include<iostream> #include<cstdlib> #define __NO_STD_VECTOR #define __NO_STD_STRING #include<SDKUtil/SDKCommon.hpp> #include<SDKUtil/SDKFile.hpp> #include<CL/cl.hpp> using namespace cl; int main(){ char * string = "This is a piece of text, that will be used as the source of the pattern to be searched.This one is supposed to be a very long text, in order to prove to the world that, a GPU shows its efficiency in terms of parallelism for very very huge amount of data. Though the performance of a normal CPU outrages the GPU performance when the data is small. For example, may be when we have to multiply n X n matrices, the CPU performance will be highly efficient until 'n' reaches, say, 300. But for matrices with a size 600X600, the performance of the GPU is certainly very very efficient. On an AMD's 4670 GPU, the multiplication of a 900X900 matrix will be 4 seconds faster than a normal CPU."; char * pattern = "CPU"; int strLen = strlen(string); int patternLen = strlen(pattern); int matchIndex = -1; //holds the index of the first char of the match. int runStartIndex = 0; //every run of kernels will start at a certain index, first run will be from 0. streamsdk::SDKCommon * utility = new streamsdk::SDKCommon(); int timer = utility->createTimer(); utility->resetTimer(timer); //setup openCL vector<Platform> myPlatforms; cl_int err = Platform::get(&myPlatforms); utility->checkVal(err,CL_SUCCESS,"Platform::get() failed"); cl_context_properties cntxtProps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(*myPlatforms.begin())(), 0 }; Context context(CL_DEVICE_TYPE_CPU,cntxtProps,NULL,NULL,&err); utility->checkVal(err,CL_SUCCESS,"Context failed."); vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>(&err); streamsdk::SDKFile kernelFile; if(!kernelFile.open("subStringKernel.txt")){ std::cout<<"Kernel file could not be opened. Exiting. . . "<<std::endl; std::getchar(); std::exit(-1); } Program::Sources kernelSrc(1,std::make_pair(kernelFile.source().data(),kernelFile.source().size())); Program program(context,kernelSrc,&err); utility->checkVal(err,CL_SUCCESS,"Program failed."); err = program.build(devices,NULL,NULL,NULL); utility->checkVal(err,CL_SUCCESS,"Program build failed."); Kernel kernel(program,"subString",&err); utility->checkVal(err,CL_SUCCESS,"Kernel failed"); CommandQueue kernelQueue(context,*(devices.begin()),0,&err); utility->checkVal(err,CL_SUCCESS,"Command queue failed."); //std::cout<<"Text: "<<string<<std::endl; //std::cout<<"Pattern: "<<pattern<<std::endl; //allocate buffers Buffer stringBuffer(context,CL_MEM_USE_HOST_PTR|CL_MEM_READ_ONLY,strLen,(void *)string,&err); utility->checkVal(err,CL_SUCCESS,"String buffer could not be created.\n"); Buffer patternBuffer(context,CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR,patternLen,(void *)pattern,&err); utility->checkVal(err,CL_SUCCESS,"Pattern buffer could not be created.\n"); Buffer strlenBuffer(context,CL_MEM_USE_HOST_PTR|CL_MEM_READ_ONLY,sizeof(int),(void *)&strLen,&err); utility->checkVal(err,CL_SUCCESS,"String length buffer could not be created.\n"); Buffer patternLengthBuffer(context,CL_MEM_USE_HOST_PTR|CL_MEM_READ_ONLY,sizeof(int),(void *)&patternLen,&err); utility->checkVal(err,CL_SUCCESS,"Pattern length buffer could not be created.\n"); Buffer matchIndexBuffer(context,CL_MEM_USE_HOST_PTR|CL_MEM_READ_WRITE,sizeof(int),(void *)&matchIndex,&err); utility->checkVal(err,CL_SUCCESS,"Matched index buffer could not be created.\n"); Buffer runStartIndexBuffer(context,CL_MEM_USE_HOST_PTR|CL_MEM_READ_WRITE,sizeof(int),(void *)&runStartIndex,&err); utility->checkVal(err,CL_SUCCESS,"Could not create start index buffer.\n"); //copy program data into the buffers. err = kernelQueue.enqueueWriteBuffer(stringBuffer,CL_TRUE,NULL,strLen,(void *)string,NULL,NULL); utility->checkVal(err,CL_SUCCESS,"Could not write into String buffer.\n"); err = kernelQueue.enqueueWriteBuffer(patternBuffer,CL_TRUE,NULL,patternLen,(void *)pattern,NULL,NULL); utility->checkVal(err,CL_SUCCESS,"Could not write into pattern buffer.\n"); err = kernelQueue.enqueueWriteBuffer(strlenBuffer,CL_TRUE,NULL,sizeof(int),(void *)&strLen,NULL,NULL); utility->checkVal(err,CL_SUCCESS,"Could not write into String length buffer.\n"); err = kernelQueue.enqueueWriteBuffer(patternLengthBuffer,CL_TRUE,NULL,sizeof(int),(void *)&patternLen,NULL,NULL); utility->checkVal(err,CL_SUCCESS,"Could not write into pattern length buffer.\n"); err = kernelQueue.enqueueWriteBuffer(matchIndexBuffer,CL_TRUE,NULL,sizeof(int),(void *)&matchIndex,NULL,NULL); utility->checkVal(err,CL_SUCCESS,"Could not write into match index buffer.\n"); err = kernelQueue.enqueueWriteBuffer(runStartIndexBuffer,CL_TRUE,NULL,sizeof(int),(void *)&runStartIndex,NULL,NULL); utility->checkVal(err,CL_SUCCESS,"Could not copy into start index buffer.\n"); //there will be one work item with 100 kernels in it arranged as a 10X10 array. NDRange globalKernels(10,10); NDRange localKernels(10,10); int run = 1; utility->startTimer(timer); while(matchIndex < 0 && runStartIndex <= strLen-patternLen){ //set kernel arguements. err = kernel.setArg(0,stringBuffer); utility->checkVal(err,CL_SUCCESS,"Could not set arg 1.\n"); err = kernel.setArg(1,patternBuffer); utility->checkVal(err,CL_SUCCESS,"Could not set arg 2.\n"); err = kernel.setArg(2,strlenBuffer); utility->checkVal(err,CL_SUCCESS,"Could not set arg 3.\n"); err = kernel.setArg(3,patternLengthBuffer); utility->checkVal(err,CL_SUCCESS,"Could not set arg 4.\n"); err = kernel.setArg(4,matchIndexBuffer); utility->checkVal(err,CL_SUCCESS,"Could not set arg 5.\n"); err = kernel.setArg(5,runStartIndexBuffer); utility->checkVal(err,CL_SUCCESS,"Could not set arg 6.\n"); err = kernelQueue.enqueueNDRangeKernel(kernel,NullRange,globalKernels,localKernels,NULL,NULL); utility->checkVal(err,CL_SUCCESS,"Could not run Kernels.\n"); err = kernelQueue.enqueueReadBuffer(matchIndexBuffer,CL_TRUE,NULL,sizeof(int),(void *)&matchIndex,NULL,NULL); utility->checkVal(err,CL_SUCCESS,"Could not read from match index buffer."); std::cout<<"\nRun: "<<run++<<"\nMatched index: "<<matchIndex<<std::endl; runStartIndex += 100; } utility->stopTimer(timer); std::cout<<"Match Index: "<<matchIndex<<std::endl; for(int i = 0 ; i < patternLen; i ++) std::cout<<string[matchIndex++]; std::cout<<"\nTime taken to search with openCL: "<<utility->readTimer(timer)<<"s"<<std::endl; std::cout<<"\n\nSuccess.\nPress any key to exit."; std::getchar(); } //the following is the kernel. __kernel void subString(__global char * string, __global char * pattern, __global int * stringLength, __global int * patternLength, __global int * matchIndex, __global int * runStartIndex) { int globalX = get_global_id(0); int globalY = get_global_id(1); int kernelStartIndex = (globalX * 10 + globalY) + (*runStartIndex); if(kernelStartIndex > (*stringLength)){ return; } int noOfMatchedChars = 0; while(*pattern == string[kernelStartIndex] && kernelStartIndex < (*stringLength)){ pattern++; kernelStartIndex++; noOfMatchedChars++; } if(noOfMatchedChars == (*patternLength)){ *matchIndex = (globalX * 10 + globalY) + (*runStartIndex); }else{ return; } }

0 Likes
2 Replies
n0thing
Journeyman III

You have a race condition in your kernel where all the threads write to a single memory location 'matchIndex'. Scheduling of threads is different on GPU and CPU hence you see different results. You should change your algorithm so that each thread writes to its own location, you can also use atomic operations so that multiple threads can write to a same location without getting arbitrary values.
0 Likes

Originally posted by: n0thing You have a race condition in your kernel where all the threads write to a single memory location 'matchIndex'. Scheduling of threads is different on GPU and CPU hence you see different results. You should change your algorithm so that each thread writes to its own location, you can also use atomic operations so that multiple threads can write to a same location without getting arbitrary values.


A post scriptum regarding what n0thing said: atomic writes degrade kernel performance. So the best solution would be to make every thread write it's own memory location.

0 Likes