Addition of large vectors in OpenCL

How to modify the code in order to be able to sum vectors with millions of elements?

First, I must say that I am a newcomer to the world of OpenCL programming, so I hope that my question won't be too difficult to answer. I'll get right to the point.

I was trying to write a simple OpenCL program that adds two vectors on the GPU, just like the ones you can find in many tutorials. The problem is the following: the program works perfectly fine for vectors that have fewer than 80 000 items, but a stack overflow happens for larger vectors. I guess the problem is with the global work size, but I'm not sure exactly what is wrong and how to correct it. I read in the OpenCL specification that the global work size should be able to go all the way up to 2^32-1, so I'm really not sure how to correct the program and make it work for larger arrays.

I am using ATI Stream SDK 2.2, MS Visual Studio 2010, Windows 7 64-bit OS and ATI Radeon 5650 GPU with 1GB of memory.

How should I modify the code in order to make it work for larger vectors?

Thank you in advance for your time and help!



#include "stdafx.h"
#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <vector>
#include <Windows.h>
#include <time.h>
#include <CL/cl.h>

using namespace std;

// OpenCL source code.
// Each work-item adds exactly one pair of elements, so the kernel must be
// launched with a global work size equal to the vector length.
const char* OpenCLSource[] = {
    "__kernel void vectorAddition(const __global int* a,",
    " const __global int* b,",
    " __global int* c)",
    "{",
    " unsigned int gid = get_global_id(0);",
    " c[gid] = a[gid] + b[gid];",
    "}"
};

// Number of elements in the vectors to be added.
// The host buffers are heap-allocated, so this value is no longer limited by
// the size of the thread stack.
#define N 1024

// Aborts the program with a diagnostic when an OpenCL call did not succeed.
//   status - value returned by (or written through errcode_ret of) the call
//   what   - name of the call, used only in the error message
static void checkCL(cl_int status, const char* what)
{
    if (status != CL_SUCCESS) {
        cerr << "OpenCL error " << status << " in " << what << "\n";
        exit(EXIT_FAILURE);
    }
}

// Main function
// *********************************************************************
// Fills two vectors with random values, adds them on the first GPU device of
// the first OpenCL platform, prints every sum and the measured kernel time.
// Returns 0 on success; exits with EXIT_FAILURE if any OpenCL call fails.
int main(int argc, char **argv)
{
    cl_int status = CL_SUCCESS;
    const size_t bufferBytes = sizeof(int) * (size_t)N;

    // Host-side data lives on the heap. The previous version declared three
    // int[N] locals, which overflowed the stack once N grew large (the author
    // observed this at roughly 80 000 elements).
    vector<int> inputVectorA(N), inputVectorB(N), outputVector(N);

    srand((unsigned int)time(NULL));
    for (int i = 0; i < N; i++) {
        inputVectorA[i] = rand() % 1000;
        inputVectorB[i] = rand() % 1000;
    }

    // Platform
    cl_platform_id platform;
    status = clGetPlatformIDs(1, &platform, NULL);
    checkCL(status, "clGetPlatformIDs");

    // Context
    cl_context_properties props[3];
    props[0] = (cl_context_properties)CL_CONTEXT_PLATFORM;
    props[1] = (cl_context_properties)platform;
    props[2] = (cl_context_properties)0;
    cl_context GPUContext =
        clCreateContextFromType(props, CL_DEVICE_TYPE_GPU, NULL, NULL, &status);
    checkCL(status, "clCreateContextFromType");

    // Context info: first query the size of the device list, then fetch it.
    size_t ParmDataBytes = 0;
    status = clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes);
    checkCL(status, "clGetContextInfo(size)");
    if (ParmDataBytes < sizeof(cl_device_id)) {
        cerr << "No GPU device found in the OpenCL context\n";
        exit(EXIT_FAILURE);
    }
    vector<cl_device_id> GPUDevices(ParmDataBytes / sizeof(cl_device_id));
    status = clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes,
                              &GPUDevices[0], NULL);
    checkCL(status, "clGetContextInfo(devices)");

    // Create command-queue
    cl_command_queue GPUCommandQueue =
        clCreateCommandQueue(GPUContext, GPUDevices[0], 0, &status);
    checkCL(status, "clCreateCommandQueue");

    // Allocate GPU memory for source vectors (contents copied from the host)
    cl_mem GPUVector1 = clCreateBuffer(GPUContext,
                                       CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                       bufferBytes, &inputVectorA[0], &status);
    checkCL(status, "clCreateBuffer(a)");
    cl_mem GPUVector2 = clCreateBuffer(GPUContext,
                                       CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                       bufferBytes, &inputVectorB[0], &status);
    checkCL(status, "clCreateBuffer(b)");

    // Allocate GPU memory for output vector
    cl_mem GPUOutputVector =
        clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, bufferBytes, NULL, &status);
    checkCL(status, "clCreateBuffer(c)");

    // Create OpenCL program with source code; the line count is derived from
    // the table so it cannot drift out of sync with the kernel text.
    const cl_uint sourceLineCount =
        (cl_uint)(sizeof(OpenCLSource) / sizeof(OpenCLSource[0]));
    cl_program OpenCLProgram = clCreateProgramWithSource(
        GPUContext, sourceLineCount, OpenCLSource, NULL, &status);
    checkCL(status, "clCreateProgramWithSource");

    // Build the program (OpenCL JIT compilation)
    status = clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL);
    if (status != CL_SUCCESS) {
        // Show the compiler output before aborting so kernel errors are visible.
        size_t logBytes = 0;
        clGetProgramBuildInfo(OpenCLProgram, GPUDevices[0], CL_PROGRAM_BUILD_LOG,
                              0, NULL, &logBytes);
        if (logBytes > 0) {
            vector<char> buildLog(logBytes + 1, '\0');
            clGetProgramBuildInfo(OpenCLProgram, GPUDevices[0], CL_PROGRAM_BUILD_LOG,
                                  logBytes, &buildLog[0], NULL);
            cerr << &buildLog[0] << "\n";
        }
        checkCL(status, "clBuildProgram");
    }

    // Create a handle to the compiled OpenCL function
    cl_kernel OpenCLVectorAdd = clCreateKernel(OpenCLProgram, "vectorAddition", &status);
    checkCL(status, "clCreateKernel");

    // Set kernel arguments
    checkCL(clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem), (void*)&GPUVector1),
            "clSetKernelArg(0)");
    checkCL(clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&GPUVector2),
            "clSetKernelArg(1)");
    checkCL(clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&GPUOutputVector),
            "clSetKernelArg(2)");

    // Launch the kernel and measure execution time
    size_t WorkSize[1] = { N }; // one dimensional Range
    DWORD ts = GetTickCount();
    status = clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd, 1, NULL,
                                    WorkSize, NULL, 0, NULL, NULL);
    checkCL(status, "clEnqueueNDRangeKernel");
    // The enqueue call only queues the work; wait for completion so the
    // interval covers the kernel run rather than just the submission.
    checkCL(clFinish(GPUCommandQueue), "clFinish");
    DWORD te = GetTickCount();

    // Copy the output back to CPU memory (blocking read)
    status = clEnqueueReadBuffer(GPUCommandQueue, GPUOutputVector, CL_TRUE, 0,
                                 bufferBytes, &outputVector[0], 0, NULL, NULL);
    checkCL(status, "clEnqueueReadBuffer");

    // Cleanup: release objects before the queue and context they belong to.
    clReleaseMemObject(GPUVector1);
    clReleaseMemObject(GPUVector2);
    clReleaseMemObject(GPUOutputVector);
    clReleaseKernel(OpenCLVectorAdd);
    clReleaseProgram(OpenCLProgram);
    clReleaseCommandQueue(GPUCommandQueue);
    clReleaseContext(GPUContext);

    for (int i = 0; i < N; i++) {
        cout << inputVectorA[i] << "+" << inputVectorB[i] << "=" << outputVector[i] << "\n";
    }

    cout << "Kernel execution time on the GPU is: " << te - ts << "ms!\n";
    system("pause");

    return 0;
}