BarnacleJunior

Really Really Bad OpenCL performance on HD5850

Discussion created by BarnacleJunior on Dec 29, 2009
Latest reply on Jan 7, 2010 by LeeHowes
Is my initialization wrong?

I've been working with DirectCompute on the 5800 series for a while, but I've read that OpenCL gets much better performance, so I did a line-for-line port of the first pass of a prefix sum/scan routine from cs_5_0 HLSL to OpenCL.  When run on a 1<<20 element array 1000 times, the DirectCompute version sustains about 6550 million values per second.  Building on a full prefix scan, I've got a radix sort on DX11 that does 65 million pairs per second (on a 2M element array).  That's less than half what a GTX280 does with CUDPP, but it's in the ballpark.


My OpenCL code, however, is only doing 25 million elements per second on the very basic first-pass scan.  I'm running the test ten times in a row, so the card should have plenty of time to get up to full speed.  Desktop graphics also become very sluggish while it runs and CPU utilization is 0%, so I'm sure it's not falling back to CPU emulation.  I'm using the 9.12 hotfix driver on Win7 x64 RTM with a DDR2 Phenom II quad-core.
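
To double-check that the GPU device really is the one being picked up (and not some CPU fallback), I figure a query along these lines would settle it - this is just a sketch, not part of the program below:

char deviceName[256] = {0};
cl_device_type deviceType = 0;
// devices[0] is the same device handle the program below builds against
clGetDeviceInfo(devices[0], CL_DEVICE_NAME, sizeof(deviceName), deviceName, 0);
clGetDeviceInfo(devices[0], CL_DEVICE_TYPE, sizeof(deviceType), &deviceType, 0);
printf("Running on: %s (is GPU: %d)\n", deviceName,
    (deviceType & CL_DEVICE_TYPE_GPU) ? 1 : 0);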

My prefix sum kernel has 128 threads with 8 values per thread - this is the configuration that produces the best performance on DirectCompute.  It's almost as if the LDS isn't working or something.  Or am I making an incredibly stupid mistake using the CL library?
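
If it helps narrow things down, I can also dump the kernel's compiled work-group limits and LDS usage - something like the following (a sketch using clGetKernelWorkGroupInfo, not in the listing further down) should show whether the 128-thread work-group and its LDS allocation actually fit on the device:

size_t maxWorkGroupSize = 0;
cl_ulong kernelLocalMem = 0;
// kernel and devices[0] as created in the host code below
clGetKernelWorkGroupInfo(kernel, devices[0], CL_KERNEL_WORK_GROUP_SIZE,
    sizeof(maxWorkGroupSize), &maxWorkGroupSize, 0);
clGetKernelWorkGroupInfo(kernel, devices[0], CL_KERNEL_LOCAL_MEM_SIZE,
    sizeof(kernelLocalMem), &kernelLocalMem, 0);
printf("max work-group size: %u, LDS bytes: %u\n",
    (unsigned)maxWorkGroupSize, (unsigned)kernelLocalMem);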

Thanks,

.sean

 

The kernel code - scan.cl:

#define NUM_THREADS (1<< NUM_LEVELS)

// 32 channels means shift five and add
#define BANK_ADDRESS(i) (i + (i>> 5))

#define LOCAL_SIZE (BANK_ADDRESS(NUM_THREADS))

void ThreadSum(uint tid, __local uint sharedSum[LOCAL_SIZE]) {
    uint tid2 = BANK_ADDRESS(tid);
   
    for(uint d = 0; d < NUM_LEVELS - 1; ++d) {
        barrier(CLK_LOCAL_MEM_FENCE);
        uint mask = (2<< d) - 1;
        uint offset = 1<< d;
        if(mask == (mask & tid))
            sharedSum[tid2] += sharedSum[BANK_ADDRESS(tid - offset)];   
    }
    barrier(CLK_LOCAL_MEM_FENCE);
   
    if(0 == tid) {
        uint ai = BANK_ADDRESS(NUM_THREADS / 2 - 1);
        uint bi = BANK_ADDRESS(NUM_THREADS - 1);
       
        uint at = sharedSum[ai];
       
        sharedSum[ai] += sharedSum[bi];
        sharedSum[bi] += at + at;
    }

    for(uint d = NUM_LEVELS - 1; d; --d) {
        barrier(CLK_LOCAL_MEM_FENCE);
        uint mask = (1<< d) - 1;
        uint offset = 1<< (d - 1);
        if(mask == (mask & tid)) {
            uint t = sharedSum[tid2];
            uint r = BANK_ADDRESS(tid - offset);
            sharedSum[tid2] += sharedSum[r];
            sharedSum[r] = t;
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}


uint4 Inclusive4Sum(uint4 vec) {
    vec.y += vec.x;
    vec.z += vec.y;
    vec.w += vec.z;
    return vec;
}

__kernel __attribute__((reqd_work_group_size(NUM_THREADS, 1, 1)))
void PrefixSumBlock_Pass1(
    __global uint* pass1_values,
    __global uint* pass1_partialSums) {
   
    __local uint sharedSum[LOCAL_SIZE];
   
    uint tid = get_local_id(0);
    uint gid = get_group_id(0);
   
    uint index = 8 * NUM_THREADS * gid;
    uint aTarget = index + 8 * tid;
   
    uint4 a[2];
   
    for(uint i = 0; i < 2; ++i) {
        a[i].x = pass1_values[aTarget + 4 * i + 0];
        a[i].y = pass1_values[aTarget + 4 * i + 1];
        a[i].z = pass1_values[aTarget + 4 * i + 2];
        a[i].w = pass1_values[aTarget + 4 * i + 3];
    }
   
    uint4 aInc[2];
    aInc[0] = Inclusive4Sum(a[0]);
    aInc[1] = Inclusive4Sum(a[1]) + aInc[0].w;
   
    uint tid2 = BANK_ADDRESS(tid);
    sharedSum[tid2] = aInc[1].w;
   
    ThreadSum(tid, sharedSum);
   
    uint total = sharedSum[BANK_ADDRESS(0)];
    uint aExc = sharedSum[tid2] - total;
   
    uint4 aSum[2];
   
    for(uint i = 0; i < 2; ++i)
        aSum[i] = aInc[i] - a[i] + aExc;
       
    for(uint i = 0; i < 2; ++i) {
        pass1_values[aTarget + 4 * i + 0] = aSum[i].x;
        pass1_values[aTarget + 4 * i + 1] = aSum[i].y;
        pass1_values[aTarget + 4 * i + 2] = aSum[i].z;
        pass1_values[aTarget + 4 * i + 3] = aSum[i].w;
    }
   
    if(0 == tid)
        pass1_partialSums[gid] = total;   
}
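
For reference, this is how the macros expand with the build option I pass (-D NUM_LEVELS=7); it's just the arithmetic from the defines above:

// NUM_THREADS        = 1 << 7              = 128 work-items per group
// BANK_ADDRESS(i)    = i + (i >> 5)          (one padding slot per 32 entries)
// LOCAL_SIZE         = 128 + (128 >> 5)    = 132 uints of LDS per group
// values per group   = 8 * 128             = 1024
// groups per dispatch = (1 << 20) / 1024   = 1024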

 

My cpp:

#pragma comment(lib, "opencl")
#include <windows.h>
#include <vector>
#include <string>
#include <fstream>
#include <iostream>
#include <cmath>

#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.h>

typedef unsigned int uint;

void CPUScan(const uint* source, uint* target, size_t numElements) {
    target[0] = 0;
    for(size_t i(1); i < numElements; ++i)
        target[i] = target[i - 1] + source[i];
}

int main(int argc, char** argv) {
    std::ifstream f("scan.cl");
    std::string s(std::istreambuf_iterator<char>(f), std::istreambuf_iterator<char>(0));

    cl_uint numPlatforms;
    cl_platform_id platform = 0;
    cl_int status = clGetPlatformIDs(0, 0, &numPlatforms);

    std::vector<cl_platform_id> platforms(numPlatforms);
    status = clGetPlatformIDs(numPlatforms, &platforms[0], 0);

    cl_int err;
    cl_uint numDevices;
    err = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, 0, &numDevices);
    std::vector<cl_device_id> devices(numDevices);
    clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, numDevices, &devices[0], 0);

    cl_context_properties props[3] = {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)platforms[0],
        0
    };
    cl_context context = clCreateContextFromType(props, CL_DEVICE_TYPE_GPU, 0, 0, &err);

    const char* source = s.c_str();
    cl_program program = clCreateProgramWithSource(context, 1, &source, 0, &err);

    const int NumLevels = 7;
    err = clBuildProgram(program, 1, &devices[0], "-D NUM_LEVELS=7", 0, 0);

    if(err) {
        char buildLog[2048];
        err = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG,
           sizeof(buildLog), buildLog, 0);

        std::cout<<buildLog;
        return 0;
    }

    // get the kernel
    cl_kernel kernel = clCreateKernel(program, "PrefixSumBlock_Pass1",  &err);
    
    cl_uint numArgs;
    clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(cl_uint), &numArgs, 0);


    // create the buffers
    const size_t NumElements = 1<< 20;
    const int NumLoops = 1000;
    
    const uint ThreadsPerGroup = 1<< NumLevels;
    const uint ValuesPerThread = 8;
    const uint ValuesPerGroup = ValuesPerThread * ThreadsPerGroup;
    const uint NumGroups = NumElements / ValuesPerGroup;


    std::vector<uint> values(NumElements);
    for(size_t i(0); i < NumElements; ++i)
        values[i] = 1;
    
    cl_mem pass1Values = clCreateBuffer(context, CL_MEM_READ_WRITE |
        CL_MEM_COPY_HOST_PTR, 4 * NumElements, &values[0], &err);
    cl_mem pass1PartialSums = clCreateBuffer(context, CL_MEM_READ_WRITE |
        0, 4 * NumGroups, 0, &err);

    cl_command_queue commandQueue = clCreateCommandQueue(context, devices[0], 0, &err);

    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &pass1Values);
    err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &pass1PartialSums);

    size_t workSize = NumElements / ValuesPerThread;
    size_t workGroupSize = ThreadsPerGroup;

    LARGE_INTEGER freq;
    LARGE_INTEGER begin, end;
    QueryPerformanceFrequency(&freq);
    double period = 1.0 / freq.QuadPart;

    for(int j(0); j < 10; ++j) {
        QueryPerformanceCounter(&begin);

        for(int i(0); i < NumLoops; ++i) {
            err = clEnqueueNDRangeKernel(commandQueue, kernel, 1, 0, &workSize, &workGroupSize,
                0, 0, 0);
            if(err) {
                printf("Err = %d on iteration %d\n", err, i);
                return 0;
            }
        }
        clEnqueueReadBuffer(commandQueue, pass1Values, CL_TRUE, 0, 4 * NumElements,
            &values[0], 0, 0, 0);

        QueryPerformanceCounter(&end);

        double elapsed = period * (end.QuadPart - begin.QuadPart);

        double velocity = NumElements * (NumLoops / elapsed);
        printf("GPU velocity: %1.3fM\n", velocity / 1.0e6);
    }

    std::vector<uint> partialSums(NumGroups);
    clEnqueueReadBuffer(commandQueue, pass1PartialSums, CL_TRUE, 0, 4 * NumGroups,
        &partialSums[0], 0, 0, 0);

    std::vector<uint> values2;
    values2.resize(NumElements);
    QueryPerformanceCounter(&begin);
    for(int i(0); i < NumLoops; ++i) {
        CPUScan(&values[0], &values2[0], NumElements);
        values.swap(values2);
    }
    QueryPerformanceCounter(&end);
    double elapsed = period * (end.QuadPart - begin.QuadPart);
    double velocity = NumElements * (NumLoops / elapsed);
    printf("CPU velocity: %1.3fM\n", velocity / 1.0e6);
}
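
In case the bottleneck is per-enqueue overhead rather than the kernel itself, I could also time a single dispatch with event profiling - a rough sketch of what I have in mind (it assumes a second queue created with CL_QUEUE_PROFILING_ENABLE; not part of the program above):

cl_command_queue profQueue = clCreateCommandQueue(context, devices[0],
    CL_QUEUE_PROFILING_ENABLE, &err);
cl_event evt;
err = clEnqueueNDRangeKernel(profQueue, kernel, 1, 0, &workSize, &workGroupSize,
    0, 0, &evt);
clWaitForEvents(1, &evt);

cl_ulong tStart = 0, tEnd = 0;
clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(tStart), &tStart, 0);
clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END, sizeof(tEnd), &tEnd, 0);
printf("kernel time: %.3f ms\n", (tEnd - tStart) * 1.0e-6);
clReleaseEvent(evt);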

 
