Here is my question:
Suppose I have two input vectors(say A and B) and K is the out put vector.
A,B,C has size of 3 ( int*malloc(sizeof(int)*3))
I have defined A,B,C as pointer so I can Add A and B (like A+B) and the values can be stored in C
ex: A = 1,3,5 and B=3,5,7
the result K=4,8,12
I am able to write above problem without issue.
But suppose if I allocate A and B with size of int*3 but C with int*1. Is the below program works.
// System includes
#include <stdio.h>
#include <stdlib.h>
// OpenCL includes
#include <CL/cl.h>
// Project includes
// Constants, globals
const int ELEMENTS = 20; // elements in each vector
const int ELEMENTS1 = 1;
// Signatures
char* readSource(const char *sourceFilename);
int main(int argc, char ** argv)
{
printf("Running Vector Addition program\n\n");
size_t datasize = sizeof(int)*ELEMENTS;
size_t datasize1 = sizeof(int)*ELEMENTS1;
int *A, *B; // Input arrays
int *K; // Output array
// Allocate space for input/output data
A = (int*)malloc(datasize);
B = (int*)malloc(datasize);
K = (int*)malloc(datasize1);
if(A == NULL || B == NULL || K == NULL) {
perror("malloc");
exit(-1);
}
// Initialize the input data
for(int i = 0; i < ELEMENTS; i++) {
A = i;
B = i;
}
cl_int status; // use as return value for most OpenCL functions
cl_uint numPlatforms = 0;
cl_platform_id *platforms;
// Query for the number of recongnized platforms
status = clGetPlatformIDs(0, NULL, &numPlatforms);
if(status != CL_SUCCESS) {
printf("clGetPlatformIDs failed\n");
exit(-1);
}
// Make sure some platforms were found
if(numPlatforms == 0) {
printf("No platforms detected.\n");
exit(-1);
}
// Allocate enough space for each platform
platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));
if(platforms == NULL) {
perror("malloc");
exit(-1);
}
// Fill in platforms
clGetPlatformIDs(numPlatforms, platforms, NULL);
if(status != CL_SUCCESS) {
printf("clGetPlatformIDs failed\n");
exit(-1);
}
// Print out some basic information about each platform
printf("%u platforms detected\n", numPlatforms);
for(unsigned int i = 0; i < numPlatforms; i++) {
char buf[100];
printf("Platform %u: \n", i);
status = clGetPlatformInfo(platforms, CL_PLATFORM_VENDOR,
sizeof(buf), buf, NULL);
printf("\tVendor: %s\n", buf);
status |= clGetPlatformInfo(platforms, CL_PLATFORM_NAME,
sizeof(buf), buf, NULL);
printf("\tName: %s\n", buf);
if(status != CL_SUCCESS) {
printf("clGetPlatformInfo failed\n");
exit(-1);
}
}
printf("\n");
cl_uint numDevices = 0;
cl_device_id *devices;
// Retrive the number of devices present
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL,
&numDevices);
if(status != CL_SUCCESS) {
printf("clGetDeviceIDs failed\n");
exit(-1);
}
// Make sure some devices were found
if(numDevices == 0) {
printf("No devices detected.\n");
exit(-1);
}
// Allocate enough space for each device
devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id));
if(devices == NULL) {
perror("malloc");
exit(-1);
}
// Fill in devices
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, numDevices,
devices, NULL);
if(status != CL_SUCCESS) {
printf("clGetDeviceIDs failed\n");
exit(-1);
}
// Print out some basic information about each device
printf("%u devices detected\n", numDevices);
for(unsigned int i = 0; i < numDevices; i++) {
char buf[100];
printf("Device %u: \n", i);
status = clGetDeviceInfo(devices, CL_DEVICE_VENDOR,
sizeof(buf), buf, NULL);
printf("\tDevice: %s\n", buf);
status |= clGetDeviceInfo(devices, CL_DEVICE_NAME,
sizeof(buf), buf, NULL);
printf("\tName: %s\n", buf);
if(status != CL_SUCCESS) {
printf("clGetDeviceInfo failed\n");
exit(-1);
}
}
printf("\n");
cl_context context;
// Create a context and associate it with the devices
context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
if(status != CL_SUCCESS || context == NULL) {
printf("clCreateContext failed\n");
exit(-1);
}
cl_command_queue cmdQueue;
// Create a command queue and associate it with the device you
// want to execute on
cmdQueue = clCreateCommandQueue(context, devices[0], 0, &status);
if(status != CL_SUCCESS || cmdQueue == NULL) {
printf("clCreateCommandQueue failed\n");
exit(-1);
}
cl_mem d_A, d_B; // Input buffers on device
cl_mem d_C; // Output buffer on device
// Create a buffer object (d_A) that contains the data from the host ptr A
d_A = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,
datasize, A, &status);
if(status != CL_SUCCESS || d_A == NULL) {
printf("clCreateBuffer failed\n");
exit(-1);
}
// Create a buffer object (d_B) that contains the data from the host ptr B
d_B = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,
datasize, B, &status);
if(status != CL_SUCCESS || d_B == NULL) {
printf("clCreateBuffer failed\n");
exit(-1);
}
// Create a buffer object (d_C) with enough space to hold the output data
d_C = clCreateBuffer(context, CL_MEM_READ_WRITE,
datasize1, NULL, &status);
if(status != CL_SUCCESS || d_C == NULL) {
printf("clCreateBuffer failed\n");
exit(-1);
}
cl_program program;
char *source;
const char *sourceFile = "vectoradd.cl";
// This function reads in the source code of the program
source = readSource(sourceFile);
//printf("Program source is:\n%s\n", source);
// Create a program. The 'source' string is the code from the
// vectoradd.cl file.
program = clCreateProgramWithSource(context, 1, (const char**)&source,
NULL, &status);
if(status != CL_SUCCESS) {
printf("clCreateProgramWithSource failed\n");
exit(-1);
}
cl_int buildErr;
// Build (compile & link) the program for the devices.
// Save the return value in 'buildErr' (the following
// code will print any compilation errors to the screen)
buildErr = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL);
// If there are build errors, print them to the screen
if(buildErr != CL_SUCCESS) {
printf("Program failed to build.\n");
cl_build_status buildStatus;
for(unsigned int i = 0; i < numDevices; i++) {
clGetProgramBuildInfo(program, devices, CL_PROGRAM_BUILD_STATUS,
sizeof(cl_build_status), &buildStatus, NULL);
if(buildStatus == CL_SUCCESS) {
continue;
}
char *buildLog;
size_t buildLogSize;
clGetProgramBuildInfo(program, devices, CL_PROGRAM_BUILD_LOG,
0, NULL, &buildLogSize);
buildLog = (char*)malloc(buildLogSize);
if(buildLog == NULL) {
perror("malloc");
exit(-1);
}
clGetProgramBuildInfo(program, devices, CL_PROGRAM_BUILD_LOG,
buildLogSize, buildLog, NULL);
buildLog[buildLogSize-1] = '\0';
printf("Device %u Build Log:\n%s\n", i, buildLog);
free(buildLog);
}
exit(0);
}
else {
printf("No build errors\n");
}
cl_kernel kernel;
// Create a kernel from the vector addition function (named "vecadd")
kernel = clCreateKernel(program, "vecadd", &status);
if(status != CL_SUCCESS) {
printf("clCreateKernel failed\n");
exit(-1);
}
// Associate the input and output buffers with the kernel
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_A);
status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_B);
status |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_C);
if(status != CL_SUCCESS) {
printf("clSetKernelArg failed\n");
exit(-1);
}
// Define an index space (global work size) of threads for execution.
// A workgroup size (local work size) is not required, but can be used.
size_t globalWorkSize[1]; // There are ELEMENTS threads
globalWorkSize[0] = ELEMENTS;
// Execute the kernel.
// 'globalWorkSize' is the 1D dimension of the work-items
status = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, globalWorkSize,
NULL, 0, NULL, NULL);
if(status != CL_SUCCESS) {
printf("clEnqueueNDRangeKernel failed\n");
exit(-1);
}
// Read the OpenCL output buffer (d_C) to the host output array (K)
clEnqueueReadBuffer(cmdQueue, d_C, CL_TRUE, 0, datasize, K,
0, NULL, NULL);
// Verify correctness
bool result = true;
for(int i = 0; i < ELEMENTS1; i++) {
printf("%d \n", K);
if(K != i+i) {
result = false;
break;
}
}
if(result) {
printf("Output is correct\n");
}
else {
printf("Output is incorrect\n");
}
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
clReleaseMemObject(d_A);
clReleaseMemObject(d_B);
clReleaseMemObject(d_C);
clReleaseContext(context);
free(A);
free(B);
free(K);
free(source);
free(platforms);
free(devices);
}
char* readSource(const char *sourceFilename) {
FILE *fp;
int err;
int size;
char *source;
fp = fopen(sourceFilename, "rb");
if(fp == NULL) {
printf("Could not open kernel file: %s\n", sourceFilename);
exit(-1);
}
err = fseek(fp, 0, SEEK_END);
if(err != 0) {
printf("Error seeking to end of file\n");
exit(-1);
}
size = ftell(fp);
if(size < 0) {
printf("Error getting file position\n");
exit(-1);
}
err = fseek(fp, 0, SEEK_SET);
if(err != 0) {
printf("Error seeking to start of file\n");
exit(-1);
}
source = (char*)malloc(size+1);
if(source == NULL) {
printf("Error allocating %d bytes for the program source\n", size+1);
exit(-1);
}
err = fread(source, 1, size, fp);
if(err != size) {
printf("only read %d bytes\n", err);
exit(0);
}
source[size] = '\0';
return source;
}
Kernel:
__kernel
void vecadd(__global int *A,
__global int *B,
__global int *K) {
int idx = get_global_id(0);
//int idx1 = get_global_id(1);
K[idx] = A[idx] + B[idx];
}
First this post should probably be in the OpenCL forum - but the short answer is it wil not work with that kernel code.
You are referencing K[0], K[1] and K{2] but only allocating K{0];