cancel
Showing results for 
Search instead for 
Did you mean: 

Archives Discussions

timchist
Elite

clEnqueueWriteBufferRect does not work when region width is not equal to src pitch: broken again in Catalyst 14.12

I have already reported this problem in an earlier post: http://devgurus.amd.com/thread/160312 and the error has been fixed in Catalyst 13.4.

However, the error seems to have been reintroduced in Catalyst 14.12.

The error can be reproduced on HD 7970 and R9 280X, but not on R9 290.

This has been working correctly in Catalyst 14.9.

I'm reposting the source code of a minimal sample demonstrating the error below:

---

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <CL/cl.h>

//------------------------------------------------------------------------------

void checkErr(char *func, cl_int err)

{

    if(err != CL_SUCCESS)

    {

        fprintf( stderr, "%s(): ", func );

        switch( err )

        {

            case CL_BUILD_PROGRAM_FAILURE:  fprintf (stderr, "CL_BUILD_PROGRAM_FAILURE"); break;

            case CL_COMPILER_NOT_AVAILABLE: fprintf (stderr, "CL_COMPILER_NOT_AVAILABLE"); break;

            case CL_DEVICE_NOT_AVAILABLE:   fprintf (stderr, "CL_DEVICE_NOT_AVAILABLE"); break;

            case CL_DEVICE_NOT_FOUND:       fprintf (stderr, "CL_DEVICE_NOT_FOUND"); break;

            case CL_INVALID_BINARY:         fprintf (stderr, "CL_INVALID_BINARY"); break;

            case CL_INVALID_BUILD_OPTIONS:  fprintf (stderr, "CL_INVALID_BUILD_OPTIONS"); break;

            case CL_INVALID_CONTEXT:        fprintf (stderr, "CL_INVALID_CONTEXT"); break;

            case CL_INVALID_DEVICE:         fprintf (stderr, "CL_INVALID_DEVICE"); break;

            case CL_INVALID_DEVICE_TYPE:    fprintf (stderr, "CL_INVALID_DEVICE_TYPE"); break;

            case CL_INVALID_OPERATION:      fprintf (stderr, "CL_INVALID_OPERATION"); break;

            case CL_INVALID_PLATFORM:       fprintf (stderr, "CL_INVALID_PLATFORM"); break;

            case CL_INVALID_PROGRAM:        fprintf (stderr, "CL_INVALID_PROGRAM"); break;

            case CL_INVALID_VALUE:          fprintf (stderr, "CL_INVALID_VALUE"); break;

            case CL_OUT_OF_HOST_MEMORY:     fprintf (stderr, "CL_OUT_OF_HOST_MEMORY"); break;

            default:                        fprintf (stderr, "Unknown error code: %d", (int)err); break;

        }

        fprintf (stderr, "\n");

        getchar();

        exit( err );

    }

}

int main(void)

{

    ///////////////////////////////////////////////////////////////////////////

    // Initialization

    ///////////////////////////////////////////////////////////////////////////

    int i = 0;

    cl_int err = CL_SUCCESS;

    cl_uint nPlatforms = 0;

    cl_platform_id *platforms = NULL;

    cl_platform_id platform = (cl_platform_id)NULL;

    cl_context_properties cprops[3];

    size_t nDevices = 0;

    cl_device_id *devices = NULL;

    size_t binary_size = 0;

    char * binary = NULL;

    cl_device_id device_id = 0;

    cl_context context;

    cl_command_queue queue, queue2;

    /* figure out the number of platforms on this system. */

    err = clGetPlatformIDs(0, NULL, &nPlatforms);

    checkErr( "clGetPlatformIDs", err );

    printf( "Number of platforms found: %d\n", nPlatforms );

    if( nPlatforms == 0 )

    {

        fprintf( stderr, "Cannot continue without any platforms. Exiting.\n" );

        return( -1 );

    }

    platforms = (cl_platform_id *)malloc( sizeof(cl_platform_id) * nPlatforms );

    err = clGetPlatformIDs( nPlatforms, platforms, NULL );

    checkErr( "clGetPlatformIDs", err );

    puts("Platforms:");

    for(cl_uint i = 0; i < nPlatforms; i++ )

    {

        char pbuf[100];

        err = clGetPlatformInfo( platforms, CL_PLATFORM_VENDOR,

                                 sizeof(pbuf), pbuf, NULL );

        checkErr( "clGetPlatformInfo", err );

        printf("#%d: %s\n", i, pbuf);

    }

    /* find the AMD platform. */

    for(cl_uint i = 0; i < nPlatforms; i++ )

    {

        char pbuf[100];

        err = clGetPlatformInfo( platforms, CL_PLATFORM_VENDOR,

                                 sizeof(pbuf), pbuf, NULL );

        checkErr( "clGetPlatformInfo", err );

        if( strcmp(pbuf, "Advanced Micro Devices, Inc.") == 0 )

        {

            printf( "Found AMD platform\n\n" );

            platform = platforms;

            break;

        }

    }

    if( platform == (cl_context_properties)NULL )

    {

        fprintf( stderr, "Could not find an AMD platform. Exiting.\n" );

        return( -1 );

    }

    cprops[0] = CL_CONTEXT_PLATFORM;

    cprops[1] = (cl_context_properties)platform;

    cprops[2] = (cl_context_properties)NULL; /* end of options list marker */

    /* create a context with all of the available devices. */

    context = clCreateContextFromType( cprops, CL_DEVICE_TYPE_GPU, NULL, NULL, &err );

    checkErr( "clCreateContextFromType", err );

    /* get a device count for this context. */

    err = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &nDevices );

    checkErr( "clGetContextInfo", err );

    nDevices = nDevices / sizeof(cl_device_id); /* need to generate actual device count from size of required buffer. */

    printf( "Number of devices found: %d\n", nDevices );

    devices = (cl_device_id *)malloc( sizeof(cl_device_id) * nDevices );

    if (nDevices == 0) {

        fprintf( stderr, "Could not find GPU devices. Exiting.\n" );

        return( -1 );

    }

    /* grab the handles to all of the devices in the context. */

    err = clGetContextInfo( context, CL_CONTEXT_DEVICES, sizeof(cl_device_id)*nDevices, devices, NULL );

    checkErr( "clGetContextInfo", err );

    device_id = devices[0];

    queue = clCreateCommandQueue(context, device_id, 0, &err);

    checkErr("clCreateCommandQueue", err);

    ///////////////////////////////////////////////////////////////////////////

    // The actual test

    ///////////////////////////////////////////////////////////////////////////

    const int FullImageWidth = 256;

    const int FullImageHeight = 256;

    const int PartialImageWidth = 16;

    const int PartialImageHeight = 16;

    unsigned char* hostFullImage = new unsigned char[FullImageWidth * FullImageHeight];

    for(int y = 0; y < FullImageHeight; ++y)

        for(int x = 0; x < FullImageWidth; ++x)

            hostFullImage[y * FullImageWidth + x] = y * FullImageWidth + x;

    cl_mem deviceBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE, PartialImageWidth * PartialImageHeight, NULL, &err);

    checkErr("clCreateBuffer", err);

    unsigned char pattern = 0;

    err = clEnqueueFillBuffer(queue, deviceBuffer, &pattern, 1, 0, PartialImageWidth * PartialImageHeight, 0, NULL, NULL);

    checkErr("clEnqueueFillBuffer", err);

  

    size_t bufferOrigin[3], hostOrigin[3], region[3];

    bufferOrigin[0] = 0;

    bufferOrigin[1] = 0;

    bufferOrigin[2] = 0;

    hostOrigin[0] = 0;

    hostOrigin[1] = 0;

    hostOrigin[2] = 0;

    region[0] = PartialImageWidth;

    region[1] = PartialImageHeight;

    region[2] = 1;

    err = clEnqueueWriteBufferRect(queue, deviceBuffer, CL_TRUE, bufferOrigin, hostOrigin, region,

                                   PartialImageWidth, 0, FullImageWidth, 0, hostFullImage, 0, NULL, NULL);

    checkErr("clEnqueueWriteBufferRect", err);

    unsigned char* hostPartialImage = new unsigned char[PartialImageWidth * PartialImageHeight];

    err = clEnqueueReadBuffer(queue, deviceBuffer, CL_TRUE, 0, PartialImageWidth * PartialImageHeight, hostPartialImage, 0, NULL, NULL);

    checkErr("clEnqueueReadBuffer", err);

    bool testPassed = true;

    for(int y = 0; y < PartialImageHeight; ++y)

    {

        for(int x = 0; x < PartialImageWidth; ++x)

            if(hostFullImage[y * FullImageWidth + x] != hostPartialImage[y * PartialImageWidth + x])

            {

                testPassed = false;

                break;

            }

        if(!testPassed)

            break;

    }

    if(testPassed)

        puts("Test passed, all OK");

    else

    {

        puts("Test failed.\n");

  

        puts("Expected:");

        for(int y = 0; y < PartialImageHeight; ++y)

        {

            for(int x = 0; x < PartialImageWidth; ++x)

                printf("%3d ", (int)hostFullImage[y * FullImageWidth + x]);

            puts("");

        }

        puts("\nActual:");

        for(int y = 0; y < PartialImageHeight; ++y)

        {

            for(int x = 0; x < PartialImageWidth; ++x)

                printf("%3d ", (int)hostPartialImage[y * PartialImageWidth + x]);

            puts("");

        }

    }

    ///////////////////////////////////////////////////////////////////////////

    // Clean-up

    ///////////////////////////////////////////////////////////////////////////

    err = clReleaseMemObject(deviceBuffer);

    checkErr("clReleaseMemObject", err);

    err = clReleaseCommandQueue(queue);

    checkErr("clReleaseCommandQueue", err);

    err = clReleaseContext(context);

    checkErr("clReleaseContext", err);

    delete hostFullImage;

    delete hostPartialImage;

    return 0;

}

Tags (2)
0 Likes
18 Replies
dipak
Staff
Staff

Re: clEnqueueWriteBufferRect does not work when region width is not equal to src pitch: broken again in Catalyst 14.12

Thanks for reporting this and posting the testcase. We'll check it and get back to you. BTW, where did you observe this problem - Windows or Linux or both?

Regards,

0 Likes
set
Adept I

Re: clEnqueueWriteBufferRect does not work when region width is not equal to src pitch: broken again in Catalyst 14.12

This looks extremely similar to my report: clEnqueueReadBufferRect/clEnqueueWriteBufferRect are broken in 14.12 driver , but in my case (HD6950 / Win7 x64, 32-bit program) it's broken even when region width is equal to src pitch.

0 Likes
timchist
Elite

Re: clEnqueueWriteBufferRect does not work when region width is not equal to src pitch: broken again in Catalyst 14.12

So far I only tested on Windows. Will try to test on Linux too when I get a chance. We found, however, that these functions work slower on Linux and it's faster to copy 2D areas via temporary host buffers, so they are not used in production code.

0 Likes
timchist
Elite

Re: clEnqueueWriteBufferRect does not work when region width is not equal to src pitch: broken again in Catalyst 14.12

Interesting. I did not check with width = pitch this time, will retest later and post results here. Earlier problem (with Catalyst < 13.4) only occurred when pitch was different from the width, so I did not even test this scenario now. Production code uses clEnqueueReadBuffer / clEnqueueWriteBuffer for optimization in such cases anyway.

0 Likes
dipak
Staff
Staff

Re: clEnqueueWriteBufferRect does not work when region width is not equal to src pitch: broken again in Catalyst 14.12

Hi,

I'm able to reproduce the issue on an HD 7970 with the Catalyst Omega driver (Win 7 64-bit). It also fails when both buffers are the same size, i.e.


    const int PartialImageWidth = 256;


    const int PartialImageHeight = 256;


However, it works fine after the modification below (the added CL_MEM_ALLOC_HOST_PTR flag, marked in bold) [please check this old thread: Using clEnqueueReadBufferRect to read a sub-matrix]

  cl_mem deviceBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, PartialImageWidth * PartialImageHeight, NULL, &err);

    checkErr("clCreateBuffer", err);

Please verify and let us know your observation.

Regards,

0 Likes
set
Adept I

Re: clEnqueueWriteBufferRect does not work when region width is not equal to src pitch: broken again in Catalyst 14.12

I can confirm that issue isn't present when buffer is created with CL_MEM_ALLOC_HOST_PTR.

Dipak, can you also look into, or file a report for, OpenGL issue 2) that I reported in the thread "Several OpenGL bugs" (about GL_SLUMINANCE8)? It's still present in Catalyst 14.12, and that section of the forum looks completely ignored by AMD staff.

0 Likes
timchist
Elite

Re: clEnqueueWriteBufferRect does not work when region width is not equal to src pitch: broken again in Catalyst 14.12

The error indeed goes away when  CL_MEM_ALLOC_HOST_PTR flag is used, but in this case runtime will actually allocate memory on host, not on device, won't it? That's not what I want. I just want a buffer on a device, without any mapping to host memory. In fact, it's not quite clear from my sample code whether the contents of this memory will ever go to device, since I'm not launching any kernels: it's possible that it never leaves the host memory.

Is this going to be fixed in a future release? Could you provide an ETA?

Forgot to mention: copying rectangular areas in the opposite direction (from GPU to host) does not work in Catalyst 14.12 either, but worked fine before.

0 Likes
dipak
Staff
Staff

Re: clEnqueueWriteBufferRect does not work when region width is not equal to src pitch: broken again in Catalyst 14.12

Thanks for the confirmation.

I'm not sure what the actual reason behind this issue is. I've filed an internal bug report for it. I hope it will be fixed in a future release, but I cannot comment on the timeline.

Regards,

0 Likes
dipak
Staff
Staff

Re: clEnqueueWriteBufferRect does not work when region width is not equal to src pitch: broken again in Catalyst 14.12

Thanks. I'll try to forward your OpenGL issue to the concerned team.

0 Likes