Program runs on CPU but not GPU

Discussion created by ankurdh on Feb 19, 2010
Latest reply on Feb 23, 2010 by Fr4nz

Hello, I recently wrote a program that launches 100 instances of a kernel (work-items), each of which tries to match a pattern against one given character stream.

The program runs correctly when the device type is CPU, but when I change the device type to GPU, it gives me garbage values.

I've attached the code. Please, can anyone figure out what is going wrong?

#include<iostream> #include<cstdlib> #define __NO_STD_VECTOR #define __NO_STD_STRING #include<SDKUtil/SDKCommon.hpp> #include<SDKUtil/SDKFile.hpp> #include<CL/cl.hpp> using namespace cl; int main(){ char * string = "This is a piece of text, that will be used as the source of the pattern to be searched.This one is supposed to be a very long text, in order to prove to the world that, a GPU shows its efficiency in terms of parallelism for very very huge amount of data. Though the performance of a normal CPU outrages the GPU performance when the data is small. For example, may be when we have to multiply n X n matrices, the CPU performance will be highly efficient until 'n' reaches, say, 300. But for matrices with a size 600X600, the performance of the GPU is certainly very very efficient. On an AMD's 4670 GPU, the multiplication of a 900X900 matrix will be 4 seconds faster than a normal CPU."; char * pattern = "CPU"; int strLen = strlen(string); int patternLen = strlen(pattern); int matchIndex = -1; //holds the index of the first char of the match. int runStartIndex = 0; //every run of kernels will start at a certain index, first run will be from 0. streamsdk::SDKCommon * utility = new streamsdk::SDKCommon(); int timer = utility->createTimer(); utility->resetTimer(timer); //setup openCL vector<Platform> myPlatforms; cl_int err = Platform::get(&myPlatforms); utility->checkVal(err,CL_SUCCESS,"Platform::get() failed"); cl_context_properties cntxtProps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(*myPlatforms.begin())(), 0 }; Context context(CL_DEVICE_TYPE_CPU,cntxtProps,NULL,NULL,&err); utility->checkVal(err,CL_SUCCESS,"Context failed."); vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>(&err); streamsdk::SDKFile kernelFile; if(!"subStringKernel.txt")){ std::cout<<"Kernel file could not be opened. Exiting. . . 
"<<std::endl; std::getchar(); std::exit(-1); } Program::Sources kernelSrc(1,std::make_pair(kernelFile.source().data(),kernelFile.source().size())); Program program(context,kernelSrc,&err); utility->checkVal(err,CL_SUCCESS,"Program failed."); err =,NULL,NULL,NULL); utility->checkVal(err,CL_SUCCESS,"Program build failed."); Kernel kernel(program,"subString",&err); utility->checkVal(err,CL_SUCCESS,"Kernel failed"); CommandQueue kernelQueue(context,*(devices.begin()),0,&err); utility->checkVal(err,CL_SUCCESS,"Command queue failed."); //std::cout<<"Text: "<<string<<std::endl; //std::cout<<"Pattern: "<<pattern<<std::endl; //allocate buffers Buffer stringBuffer(context,CL_MEM_USE_HOST_PTR|CL_MEM_READ_ONLY,strLen,(void *)string,&err); utility->checkVal(err,CL_SUCCESS,"String buffer could not be created.\n"); Buffer patternBuffer(context,CL_MEM_READ_ONLY|CL_MEM_USE_HOST_PTR,patternLen,(void *)pattern,&err); utility->checkVal(err,CL_SUCCESS,"Pattern buffer could not be created.\n"); Buffer strlenBuffer(context,CL_MEM_USE_HOST_PTR|CL_MEM_READ_ONLY,sizeof(int),(void *)&strLen,&err); utility->checkVal(err,CL_SUCCESS,"String length buffer could not be created.\n"); Buffer patternLengthBuffer(context,CL_MEM_USE_HOST_PTR|CL_MEM_READ_ONLY,sizeof(int),(void *)&patternLen,&err); utility->checkVal(err,CL_SUCCESS,"Pattern length buffer could not be created.\n"); Buffer matchIndexBuffer(context,CL_MEM_USE_HOST_PTR|CL_MEM_READ_WRITE,sizeof(int),(void *)&matchIndex,&err); utility->checkVal(err,CL_SUCCESS,"Matched index buffer could not be created.\n"); Buffer runStartIndexBuffer(context,CL_MEM_USE_HOST_PTR|CL_MEM_READ_WRITE,sizeof(int),(void *)&runStartIndex,&err); utility->checkVal(err,CL_SUCCESS,"Could not create start index buffer.\n"); //copy program data into the buffers. 
err = kernelQueue.enqueueWriteBuffer(stringBuffer,CL_TRUE,NULL,strLen,(void *)string,NULL,NULL); utility->checkVal(err,CL_SUCCESS,"Could not write into String buffer.\n"); err = kernelQueue.enqueueWriteBuffer(patternBuffer,CL_TRUE,NULL,patternLen,(void *)pattern,NULL,NULL); utility->checkVal(err,CL_SUCCESS,"Could not write into pattern buffer.\n"); err = kernelQueue.enqueueWriteBuffer(strlenBuffer,CL_TRUE,NULL,sizeof(int),(void *)&strLen,NULL,NULL); utility->checkVal(err,CL_SUCCESS,"Could not write into String length buffer.\n"); err = kernelQueue.enqueueWriteBuffer(patternLengthBuffer,CL_TRUE,NULL,sizeof(int),(void *)&patternLen,NULL,NULL); utility->checkVal(err,CL_SUCCESS,"Could not write into pattern length buffer.\n"); err = kernelQueue.enqueueWriteBuffer(matchIndexBuffer,CL_TRUE,NULL,sizeof(int),(void *)&matchIndex,NULL,NULL); utility->checkVal(err,CL_SUCCESS,"Could not write into match index buffer.\n"); err = kernelQueue.enqueueWriteBuffer(runStartIndexBuffer,CL_TRUE,NULL,sizeof(int),(void *)&runStartIndex,NULL,NULL); utility->checkVal(err,CL_SUCCESS,"Could not copy into start index buffer.\n"); //there will be one work item with 100 kernels in it arranged as a 10X10 array. NDRange globalKernels(10,10); NDRange localKernels(10,10); int run = 1; utility->startTimer(timer); while(matchIndex < 0 && runStartIndex <= strLen-patternLen){ //set kernel arguements. 
err = kernel.setArg(0,stringBuffer); utility->checkVal(err,CL_SUCCESS,"Could not set arg 1.\n"); err = kernel.setArg(1,patternBuffer); utility->checkVal(err,CL_SUCCESS,"Could not set arg 2.\n"); err = kernel.setArg(2,strlenBuffer); utility->checkVal(err,CL_SUCCESS,"Could not set arg 3.\n"); err = kernel.setArg(3,patternLengthBuffer); utility->checkVal(err,CL_SUCCESS,"Could not set arg 4.\n"); err = kernel.setArg(4,matchIndexBuffer); utility->checkVal(err,CL_SUCCESS,"Could not set arg 5.\n"); err = kernel.setArg(5,runStartIndexBuffer); utility->checkVal(err,CL_SUCCESS,"Could not set arg 6.\n"); err = kernelQueue.enqueueNDRangeKernel(kernel,NullRange,globalKernels,localKernels,NULL,NULL); utility->checkVal(err,CL_SUCCESS,"Could not run Kernels.\n"); err = kernelQueue.enqueueReadBuffer(matchIndexBuffer,CL_TRUE,NULL,sizeof(int),(void *)&matchIndex,NULL,NULL); utility->checkVal(err,CL_SUCCESS,"Could not read from match index buffer."); std::cout<<"\nRun: "<<run++<<"\nMatched index: "<<matchIndex<<std::endl; runStartIndex += 100; } utility->stopTimer(timer); std::cout<<"Match Index: "<<matchIndex<<std::endl; for(int i = 0 ; i < patternLen; i ++) std::cout<<string[matchIndex++]; std::cout<<"\nTime taken to search with openCL: "<<utility->readTimer(timer)<<"s"<<std::endl; std::cout<<"\n\nSuccess.\nPress any key to exit."; std::getchar(); } //the following is the kernel. 
/*
 * subString: every work-item checks whether `pattern` occurs in `string` at
 * exactly one candidate start offset.
 *
 * The candidate offset of a work-item is
 *     *runStartIndex + get_global_id(0) * get_global_size(1) + get_global_id(1)
 * so a 10 x 10 launch covers 100 consecutive offsets starting at
 * *runStartIndex.
 *
 * Parameters:
 *   string         text to search; *stringLength bytes
 *   pattern        pattern to look for; *patternLength bytes
 *   stringLength   number of valid bytes in `string`
 *   patternLength  number of valid bytes in `pattern`
 *   matchIndex     out: set to the start offset of a match; left untouched by
 *                  work-items that do not match
 *   runStartIndex  offset of the first candidate of this launch
 *
 * If several work-items of one launch match, they all store to *matchIndex
 * without synchronisation, so which of those offsets remains is not
 * determined by this kernel.
 */
__kernel void subString(__global const char * string,
                        __global const char * pattern,
                        __global const int * stringLength,
                        __global const int * patternLength,
                        __global int * matchIndex,
                        __global const int * runStartIndex)
{
    const int textLen = *stringLength;
    const int patLen = *patternLength;

    const int globalX = (int)get_global_id(0);
    const int globalY = (int)get_global_id(1);
    const int rowWidth = (int)get_global_size(1);
    const int start = (globalX * rowWidth + globalY) + (*runStartIndex);

    /* The whole pattern has to fit between `start` and the end of the text;
       this also keeps every read below inside both buffers. */
    if(patLen <= 0 || start < 0 || start > textLen - patLen){
        return;
    }

    /* Compare exactly patLen characters. The loop must not depend on a
       terminating NUL after the pattern or the text: neither length
       includes one. */
    for(int i = 0; i < patLen; i++){
        if(string[start + i] != pattern[i]){
            return;
        }
    }

    *matchIndex = start;
}