Solving a classic Map-Reduce problem with OpenCL?

I am trying to parallelize a classic map-reduce problem (which parallelizes well with MPI) using OpenCL, namely the AMD implementation. But the result bothers me.

Let me describe the problem briefly first. Two types of data flow into the system: the feature set (30 parameters each) and the sample set (9000+ dimensions each). It is a classic map-reduce problem in the sense that I need to calculate the score of every feature on every sample (Map), and then sum up the overall score for every feature (Reduce). There are around 10k features and 30k samples.

I tried different ways to solve the problem. First, I tried to decompose the problem by features. The problem is that the score calculation consists of random memory accesses (pick some of the 9000+ dimensions and do addition/subtraction on them). Since I cannot coalesce these memory accesses, it is costly. Then, I tried to decompose the problem by samples. The problem there is that, to sum up the overall scores, all threads compete for a few score variables. The scores keep getting overwritten, so the result turns out to be incorrect. (I cannot store the individual scores first and sum them up later, because that would require 10k * 30k * 4 bytes.)
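As a rough back-of-the-envelope check (taking a work-group size of 256 purely for illustration; it is not something stated above), the full score matrix is indeed prohibitive, while per-work-group partial sums would be modest:

10,000 features * 30,000 samples * 4 bytes             ~= 1.2 GB  (full score matrix)
10,000 features * ceil(30,000 / 256) groups * 4 bytes  ~= 4.7 MB  (per-group partial sums)

The partial sums could then be collapsed to one score per feature in a second, trivially parallel pass.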

The first method I tried gives me roughly the same performance as 8 threads on an i7 860 CPU. However, I don't think the problem is unsolvable: it is remarkably similar to the ray-tracing problem (where you carry out calculations of millions of rays against millions of triangles). Any ideas?

In addition, I am posting some of the code I have:

decompose by feature (works, but slow):

__kernel void __ccv_cl_pos_error_rate(__global unsigned int* err_rate,
    __constant int* feature, __constant int* data, int num,
    __constant unsigned int* w, int s, int isiz0, int isiz01,
    int step0, int step1)
{
    /* One work-item per feature: igrid indexes the feature,
       the k loop walks over all num samples. */
    int igrid = get_global_id(0);
    __constant int* of = feature + igrid * 30;
    unsigned int e = 0;
    int k, i;
    int step[] = { step0, step1 };
    for (k = 0; k < num; k++)
    {
        __constant int* kd = data + k * isiz01;
        /* Gather up to 5 positive and 5 negative locations of this feature
           from sample k; these reads are effectively random within the sample. */
        int pmin = kd[of[0] * isiz0 + of[1] + of[2] * step[of[0]]];
        int nmax = kd[of[3] * isiz0 + of[4] + of[5] * step[of[3]]];
        for (i = 0; i < 5; i++)
        {
            if (of[i * 6] >= 0)
                pmin = min(pmin, kd[of[i * 6] * isiz0 + of[i * 6 + 1] + of[i * 6 + 2] * step[of[i * 6]]]);
            if (of[i * 6 + 3] >= 0)
                nmax = max(nmax, kd[of[i * 6 + 3] * isiz0 + of[i * 6 + 4] + of[i * 6 + 5] * step[of[i * 6 + 3]]]);
        }
        if (pmin <= nmax)
            e += w[s + k];
    }
    err_rate[igrid] += e;
}
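For context, a minimal host-side sketch of how this kernel might be enqueued, one work-item per feature (buffer and variable names such as err_rate_buf and num_features are made up here for illustration; error checking omitted):

// Host side (sketch): bind the arguments and launch one work-item per feature.
clSetKernelArg(kernel, 0, sizeof(cl_mem), &err_rate_buf);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &feature_buf);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_buf);
clSetKernelArg(kernel, 3, sizeof(int), &num);
clSetKernelArg(kernel, 4, sizeof(cl_mem), &w_buf);
clSetKernelArg(kernel, 5, sizeof(int), &s);
clSetKernelArg(kernel, 6, sizeof(int), &isiz0);
clSetKernelArg(kernel, 7, sizeof(int), &isiz01);
clSetKernelArg(kernel, 8, sizeof(int), &step0);
clSetKernelArg(kernel, 9, sizeof(int), &step1);

size_t global_size = num_features;   // ~10k features, one work-item each
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, NULL, 0, NULL, NULL);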

decompose by sample (does not work):


__kernel void __ccv_cl_pos_error_rate(__global unsigned int* err_rate,
    __constant int* feature, __constant int* data, int num,
    __constant unsigned int* w, int s, int isiz0, int isiz01,
    int step0, int step1, __local int* shared)
{
    /* One work-item per sample: igrid indexes the sample,
       the k loop walks over all num features. */
    int igrid = get_global_id(0);
    int lsize = get_local_size(0);
    int lid = get_local_id(0);
    unsigned int e = 0;
    int k, i;
    int ws = w[s + igrid];
    int step[] = { step0, step1 };
    /* Copy a strided slice of this work-item's sample into local memory. */
    for (k = 0; k < isiz01; k += lsize)
        if (k + lid < isiz01)
            shared[k + lid] = data[igrid * isiz01 + k + lid];
    barrier(CLK_LOCAL_MEM_FENCE);
    for (k = 0; k < num; k++)
    {
        __constant int* of = feature + k * 30;
        int pmin = shared[of[0] * isiz0 + of[1] + of[2] * step[of[0]]];
        int nmax = shared[of[3] * isiz0 + of[4] + of[5] * step[of[3]]];
        for (i = 0; i < 5; i++)
        {
            if (of[i * 6] >= 0)
                pmin = min(pmin, shared[of[i * 6] * isiz0 + of[i * 6 + 1] + of[i * 6 + 2] * step[of[i * 6]]]);
            if (of[i * 6 + 3] >= 0)
                nmax = max(nmax, shared[of[i * 6 + 3] * isiz0 + of[i * 6 + 4] + of[i * 6 + 5] * step[of[i * 6 + 3]]]);
        }
        if (pmin <= nmax)
            err_rate[k] += ws; // here is wrong: all work-items race on err_rate[k]
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
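One way the race flagged above could be avoided is a global atomic add. This is only a sketch, assuming the device supports 32-bit global atomics (the cl_khr_global_int32_base_atomics extension on OpenCL 1.0; on OpenCL 1.1+ the core built-in atomic_add can be used instead):

#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable

// Inside the k loop above, in place of the racy accumulation:
if (pmin <= nmax)
    atom_add(&err_rate[k], ws);   // each sample's work-item adds its weight without losing updates

Atomics serialize contended updates, so another option would be for each work-group to write its own partial sum (e.g. into err_rate[k * num_groups + group_id]) and reduce those in a second pass; per the size estimate earlier, that only needs a few megabytes.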


Comments (1)

各自安好 2024-09-02 23:49:42

Andrew Cooke from HN here. From your first attempt I now understand the problem better, and I see that having the choice of sample depend on the feature is what is killing you there.

Is the selection of samples by feature completely random, or can you exploit some regularity in it (ordering the features so that those that use the same samples are processed together)? This is obvious, so I guess it is not possible.

Unfortunately, I do not understand your second attempt.
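To illustrate the reordering idea from the comment above (a sketch only: sort_features_by_location and compare_features are names invented here, and it assumes each feature is 30 ints whose first triple picks its primary read location, as in the kernels above):

#include <stdlib.h>

/* Sort features so that those reading from nearby sample locations become
 * adjacent, which makes neighbouring work-items touch nearby memory.      */
static int g_isiz0, g_step[2];

static int compare_features(const void* a, const void* b)
{
    const int* fa = (const int*)a;
    const int* fb = (const int*)b;
    int ka = fa[0] * g_isiz0 + fa[1] + fa[2] * g_step[fa[0]];
    int kb = fb[0] * g_isiz0 + fb[1] + fb[2] * g_step[fb[0]];
    return (ka > kb) - (ka < kb);
}

/* features: num_features * 30 ints, reordered in place before uploading. */
void sort_features_by_location(int* features, int num_features,
                               int isiz0, int step0, int step1)
{
    g_isiz0 = isiz0;
    g_step[0] = step0;
    g_step[1] = step1;
    qsort(features, num_features, 30 * sizeof(int), compare_features);
}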
