Addition of large vectors in OpenCL

Discussion created by marblecanyon on Nov 13, 2010
Latest reply on Nov 14, 2010 by nou
How to modify the code in order to be able to sum vectors with millions of elements?

First, I must say that I am a newcomer to the world of OpenCL programming, so I hope that my question won't be too difficult to answer. I'll get right to the point.

I was trying to write a simple OpenCL program that adds two vectors on the GPU, just like the ones you can find in many tutorials. The problem is the following: the program works perfectly fine for vectors that have fewer than 80 000 items, but a stack overflow happens for larger vectors. I guess the problem is with the global work size, but I'm not sure exactly what is wrong and how to correct it. I read in the OpenCL specification that the global work size should be able to go all the way up to 2^32-1, so I'm really not sure how to correct the program and make it work for larger arrays.

I am using ATI Stream SDK 2.2, MS Visual Studio 2010, Windows 7 64-bit OS and ATI Radeon 5650 GPU with 1GB of memory.

How should I modify the code in order to make it work for larger vectors?

Thank you in advance for your time and help!



#include "stdafx.h"
#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <vector>
#include <Windows.h>
#include <time.h>
#include <CL/cl.h>

// OpenCL source code: each work-item adds one pair of elements.
// The strings are passed to clCreateProgramWithSource as separate fragments,
// which the runtime concatenates in order.
const char* OpenCLSource[] = {
    "__kernel void vectorAddition(const __global int* a,",
    " const __global int* b,",
    " __global int* c)",
    "{",
    " unsigned int gid = get_global_id(0);",
    " c[gid] = a[gid] + b[gid];",
    "}"
};

// Number of elements in the vectors to be added.
// The host arrays live on the heap (std::vector), so this can be raised to
// millions of elements without overflowing the thread stack.
// NOTE(review): the upper bound is then set by the device's buffer-size
// limits (e.g. CL_DEVICE_MAX_MEM_ALLOC_SIZE) - confirm for the target GPU.
static const size_t N = 1024;

// Aborts the program with a diagnostic if an OpenCL call did not succeed.
// `what` names the call that produced `status`. Resources are not released
// explicitly on this path; the process exits immediately.
static void checkCl(cl_int status, const char* what)
{
    if (status != CL_SUCCESS) {
        std::cerr << "OpenCL error " << status << " in " << what << "\n";
        exit(EXIT_FAILURE);
    }
}

// Main function
// *********************************************************************
// Fills two host vectors with random values in [0, 999], adds them on the
// first GPU device of the first OpenCL platform, prints every sum and the
// measured kernel time, then waits for a key press.
// Returns 0 on success; exits with EXIT_FAILURE if any OpenCL call fails.
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    cl_int status = CL_SUCCESS;

    // Heap-backed host buffers. The original automatic arrays (int x[N]) were
    // placed on the stack, which overflowed once N grew large.
    std::vector<int> inputVectorA(N);
    std::vector<int> inputVectorB(N);
    std::vector<int> outputVector(N);

    srand(static_cast<unsigned int>(time(NULL)));
    for (size_t i = 0; i < N; i++) {
        inputVectorA[i] = rand() % 1000;
        inputVectorB[i] = rand() % 1000;
    }

    // Platform: use the first one reported.
    cl_platform_id platform = NULL;
    checkCl(clGetPlatformIDs(1, &platform, NULL), "clGetPlatformIDs");

    // Context restricted to GPU devices of that platform.
    cl_context_properties props[3];
    props[0] = static_cast<cl_context_properties>(CL_CONTEXT_PLATFORM);
    props[1] = reinterpret_cast<cl_context_properties>(platform);
    props[2] = static_cast<cl_context_properties>(0);
    cl_context GPUContext =
        clCreateContextFromType(props, CL_DEVICE_TYPE_GPU, NULL, NULL, &status);
    checkCl(status, "clCreateContextFromType");

    // Context info: first query the size of the device list, then fetch it.
    size_t ParmDataBytes = 0;
    checkCl(clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes),
            "clGetContextInfo(size)");
    std::vector<cl_device_id> GPUDevices(ParmDataBytes / sizeof(cl_device_id));
    if (GPUDevices.empty()) {
        std::cerr << "No GPU device found in the OpenCL context\n";
        clReleaseContext(GPUContext);
        return EXIT_FAILURE;
    }
    checkCl(clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes,
                             &GPUDevices[0], NULL),
            "clGetContextInfo(devices)");

    // Create command-queue on the first device.
    cl_command_queue GPUCommandQueue =
        clCreateCommandQueue(GPUContext, GPUDevices[0], 0, &status);
    checkCl(status, "clCreateCommandQueue");

    // Allocate GPU memory for source vectors; CL_MEM_COPY_HOST_PTR copies the
    // host data into the buffers at creation time.
    const size_t bufferBytes = sizeof(int) * N;
    cl_mem GPUVector1 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                       bufferBytes, &inputVectorA[0], &status);
    checkCl(status, "clCreateBuffer(a)");
    cl_mem GPUVector2 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                       bufferBytes, &inputVectorB[0], &status);
    checkCl(status, "clCreateBuffer(b)");

    // Allocate GPU memory for output vector.
    cl_mem GPUOutputVector =
        clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, bufferBytes, NULL, &status);
    checkCl(status, "clCreateBuffer(c)");

    // Create OpenCL program with source code. The fragment count is derived
    // from the array so it cannot drift out of sync with OpenCLSource.
    const cl_uint sourceCount =
        static_cast<cl_uint>(sizeof(OpenCLSource) / sizeof(OpenCLSource[0]));
    cl_program OpenCLProgram =
        clCreateProgramWithSource(GPUContext, sourceCount, OpenCLSource, NULL, &status);
    checkCl(status, "clCreateProgramWithSource");

    // Build the program (OpenCL JIT compilation); dump the compiler log on failure.
    status = clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL);
    if (status != CL_SUCCESS) {
        size_t logBytes = 0;
        clGetProgramBuildInfo(OpenCLProgram, GPUDevices[0], CL_PROGRAM_BUILD_LOG,
                              0, NULL, &logBytes);
        if (logBytes > 0) {
            // One extra zeroed byte guarantees termination when printing.
            std::vector<char> buildLog(logBytes + 1, '\0');
            clGetProgramBuildInfo(OpenCLProgram, GPUDevices[0], CL_PROGRAM_BUILD_LOG,
                                  logBytes, &buildLog[0], NULL);
            std::cerr << &buildLog[0] << "\n";
        }
        checkCl(status, "clBuildProgram");
    }

    // Create a handle to the compiled OpenCL function.
    cl_kernel OpenCLVectorAdd = clCreateKernel(OpenCLProgram, "vectorAddition", &status);
    checkCl(status, "clCreateKernel");

    // Set kernel arguments.
    checkCl(clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem), (void*)&GPUVector1),
            "clSetKernelArg(0)");
    checkCl(clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&GPUVector2),
            "clSetKernelArg(1)");
    checkCl(clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&GPUOutputVector),
            "clSetKernelArg(2)");

    // Launch the kernel and measure execution time.
    // clEnqueueNDRangeKernel only queues the work, so clFinish is needed
    // before the second timestamp; otherwise only the enqueue cost is timed.
    // GetTickCount has coarse resolution, so short kernels may report 0 ms.
    size_t WorkSize[1] = { N }; // one dimensional Range
    const DWORD ts = GetTickCount();
    checkCl(clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd, 1, NULL,
                                   WorkSize, NULL, 0, NULL, NULL),
            "clEnqueueNDRangeKernel");
    checkCl(clFinish(GPUCommandQueue), "clFinish");
    const DWORD te = GetTickCount();

    // Copy the output back to CPU memory (blocking read).
    checkCl(clEnqueueReadBuffer(GPUCommandQueue, GPUOutputVector, CL_TRUE, 0,
                                bufferBytes, &outputVector[0], 0, NULL, NULL),
            "clEnqueueReadBuffer");

    // Cleanup: release objects before the context they were created in.
    clReleaseMemObject(GPUVector1);
    clReleaseMemObject(GPUVector2);
    clReleaseMemObject(GPUOutputVector);
    clReleaseKernel(OpenCLVectorAdd);
    clReleaseProgram(OpenCLProgram);
    clReleaseCommandQueue(GPUCommandQueue);
    clReleaseContext(GPUContext);

    // Print every sum; for very large N consider printing only a sample.
    for (size_t i = 0; i < N; i++) {
        std::cout << inputVectorA[i] << "+" << inputVectorB[i] << "=" << outputVector[i] << "\n";
    }

    std::cout << "Kernel execution time on the GPU is: " << te - ts << "ms!\n";

    system("pause");
    return 0;
}