How does the CPU cache affect the performance of a C program?

I am trying to understand more about how CPU cache affects performance. As a simple test I am summing the values of the first column of a matrix with varying numbers of total columns.

// compiled with: gcc -Wall -Wextra -Ofast -march=native cache.c
// tested with: for n in {1..100}; do ./a.out $n; done | tee out.csv
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

double sum_column(uint64_t ni, uint64_t nj, double const data[ni][nj])
{
    double sum = 0.0;
    for (uint64_t i = 0; i < ni; ++i) {
        sum += data[i][0];
    }
    return sum;
}

int compare(void const* _a, void const* _b)
{
    double const a = *((double*)_a);
    double const b = *((double*)_b);
    return (a > b) - (a < b);
}

int main(int argc, char** argv)
{
    // set sizes
    assert(argc == 2);
    uint64_t const iter_max = 101;
    uint64_t const ni       = 1000000;
    uint64_t const nj       = strtol(argv[1], 0, 10);

    // initialize data
    double(*data)[nj] = calloc(ni, sizeof(*data));
    for (uint64_t i = 0; i < ni; ++i) {
        for (uint64_t j = 0; j < nj; ++j) {
            data[i][j] = rand() / (double)RAND_MAX;
        }
    }

    // test performance
    double* dt        = calloc(iter_max, sizeof(*dt));
    double const sum0 = sum_column(ni, nj, data);
    for (uint64_t iter = 0; iter < iter_max; ++iter) {
        clock_t const t_start = clock();
        double const sum      = sum_column(ni, nj, data);
        clock_t const t_stop  = clock();
        assert(sum == sum0);
        dt[iter] = (t_stop - t_start) / (double)CLOCKS_PER_SEC;
    }

    // sort dt
    qsort(dt, iter_max, sizeof(*dt), compare);

    // compute mean dt
    double dt_mean = 0.0;
    for (uint64_t iter = 0; iter < iter_max; ++iter) {
        dt_mean += dt[iter];
    }
    dt_mean /= iter_max;

    // print results
    printf("%2lu %.8e %.8e %.8e %.8e\n", nj, dt[iter_max / 2], dt_mean, dt[0],
        dt[iter_max - 1]);

    // free memory
    free(data);
}

However, the results are not quite how I would expect them to be:
[plot: nj = 1..100]

As far as I understand, when the CPU loads a value from data, it also places some of the following values of data in the cache. The exact number depends on the cache line size (64 bytes on my machine). This would explain why, with growing nj, the time to solution first increases linearly and levels out at some value. If nj == 1, one load places the next 7 values in the cache and thus we only need to load from RAM every 8th value. If nj == 2, following the same logic, we need to access the RAM every 4th value. After some size, we will have to access the RAM for every value, which should result in the performance leveling out. My guess for why the linear section of the graph goes further than 4 is that in reality there are multiple levels of cache at work here, and the way that values end up in these caches is a little more complex than what I explained here.
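
(In terms of bytes: two consecutively summed elements data[i][0] and data[i+1][0] are nj * 8 bytes apart, so a single 64-byte line covers 8 of the summed values for nj == 1, 4 for nj == 2, and only 1 once nj >= 8.)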

What I cannot explain is why there are these performance peaks at multiples of 16.

After thinking about this question for a bit, I decided to check if this also occurs for higher values of nj:
[plot: nj = 1..500]

In fact, it does. And, there is more: Why does the performance increase again after ~250?

Could someone explain to me, or point me to some appropriate reference, why there are these peaks and why the performance increases for higher values of nj.

If you would like to try the code for yourself, I will also attach my plotting script, for your convenience:

import numpy as np
import matplotlib.pyplot as plt

data = np.genfromtxt("out.csv")
data[:,1:] /= data[0,1]

dy = np.diff(data[:,2]) / np.diff(data[:,0])
for i in range(len(dy) - 1):
    if dy[i] - dy[i + 1] > (dy.max() - dy.min()) / 2:
        plt.axvline(data[i + 1,0], color='gray', linestyle='--')
        plt.text(data[i + 1,0], 1.5 * data[0,3], f"{int(data[i + 1,0])}",
                 rotation=0, ha="center", va="center",
                 bbox=dict(boxstyle="round", ec='gray', fc='w'))

plt.fill_between(data[:,0], data[:,3], data[:,4], color='gray')
plt.plot(data[:,0], data[:,1], label="median")
plt.plot(data[:,0], data[:,2], label="mean")
plt.legend(loc="upper left")
plt.xlabel("nj")
plt.ylabel("dt / dt$_0$")
plt.savefig("out.pdf")


Answer by 灼疼热情 (2025-01-27 06:52:21):

The plots show the combination of several complex low-level effects (mainly cache thrashing & prefetching issues). I assume the target platform is a mainstream modern processor with cache lines of 64 bytes (typically an x86 one).

I can reproduce the problem on my i5-9600KF processor. Here is the resulting plot:

[figure: performance plot]


First of all, when nj is small, the gap between fetched addresses (i.e. the stride) is small and cache lines are used relatively efficiently. For example, when nj = 1, the accesses are contiguous. In this case, the processor can efficiently prefetch cache lines from DRAM and thereby hide their high latency. There is also good spatial cache locality, since many contiguous items share the same cache line. When nj = 2, only half of the values in each cache line are used, so twice as many cache lines are requested for the same number of operations. That being said, the time is not much longer, because the relatively high latency of adding two floating-point numbers makes the code compute-bound. You can unroll the loop 4 times and use 4 different sum variables so that (mainstream modern) processors can add multiple values in parallel (the optimized code at the end of this answer does exactly this for small strides). Note that most processors can also load multiple values from the cache per cycle. When nj = 4, a new cache line is requested every 2 cycles (since a double takes 8 bytes). As a result, the memory throughput can become so large that the computation becomes memory-bound. One may expect the time to be stable for nj >= 8, since the number of requested cache lines should then be the same, but in practice processors prefetch multiple contiguous cache lines so as not to pay the DRAM latency, which is huge in this case, on every access. The number of prefetched cache lines is generally between 2 and 4 (AFAIK this prefetching strategy is disabled on Intel processors when the stride is bigger than 512 bytes, i.e. when nj >= 64). This explains why the timings increase sharply for nj < 32 and become relatively stable for 32 <= nj <= 256, with the exception of the peaks.

The regular peaks happening when nj is a multiple of 16 are due to a complex cache effect called cache thrashing. Modern caches are N-way associative (see en.wikipedia.org/wiki/CPU_cache#Associativity), with N typically between 4 and 16. For example, here are the statistics for my i5-9600KF processor:

Cache 0: L1 data cache,        line size 64,  8-ways,    64 sets, size 32k 
Cache 1: L1 instruction cache, line size 64,  8-ways,    64 sets, size 32k 
Cache 2: L2 unified cache,     line size 64,  4-ways,  1024 sets, size 256k 
Cache 3: L3 unified cache,     line size 64, 12-ways, 12288 sets, size 9216k 
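
For reference, one way to query similar numbers on Linux is through glibc's sysconf extensions (the _SC_LEVEL* names below are glibc-specific, and some systems report 0 for parameters they do not expose; the same information is also available under /sys/devices/system/cpu/cpu0/cache/):

#include <stdio.h>
#include <unistd.h>

// Print the cache geometry reported by glibc (0 means "not reported").
int main(void)
{
    printf("L1d: size %ld, assoc %ld, line %ld\n",
           sysconf(_SC_LEVEL1_DCACHE_SIZE),
           sysconf(_SC_LEVEL1_DCACHE_ASSOC),
           sysconf(_SC_LEVEL1_DCACHE_LINESIZE));
    printf("L2:  size %ld, assoc %ld, line %ld\n",
           sysconf(_SC_LEVEL2_CACHE_SIZE),
           sysconf(_SC_LEVEL2_CACHE_ASSOC),
           sysconf(_SC_LEVEL2_CACHE_LINESIZE));
    printf("L3:  size %ld, assoc %ld, line %ld\n",
           sysconf(_SC_LEVEL3_CACHE_SIZE),
           sysconf(_SC_LEVEL3_CACHE_ASSOC),
           sysconf(_SC_LEVEL3_CACHE_LINESIZE));
    return 0;
}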

This means that two values fetched from DRAM with respective addresses A1 and A2 can conflict in my L1 cache if (A1 % 32768) / 64 == (A2 % 32768) / 64. In this case, the processor needs to choose which cache line to replace within a set of N = 8 cache lines. There are many cache replacement policies and none is perfect. Thus, some useful cache lines are sometimes evicted too early, resulting in additional cache misses later. In pathological cases, many DRAM locations can compete for the same cache lines, resulting in excessive cache misses. More information about this can be found in this post.

Regarding the nj stride, the number of cache lines that can be used effectively in the L1 cache is limited. For example, if all fetched values have the same address modulo the cache size, then only N cache lines (i.e. 8 on my processor) can actually be used to store them. Having fewer usable cache lines is a big problem, since the prefetcher needs a fairly large amount of space in the cache to store the many cache lines needed later. The smaller the number of concurrent fetches, the lower the memory throughput. This is especially true here since the latency of fetching one cache line from DRAM is several dozen nanoseconds (e.g. ~70 ns) while its bandwidth is several dozen GiB/s (e.g. ~40 GiB/s): dozens of cache lines (e.g. ~40) should be fetched concurrently to hide the latency and saturate the DRAM.
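
(A back-of-the-envelope check of that last number: by Little's law, the required concurrency is roughly latency times bandwidth, i.e. 70 ns * 40 GiB/s, which is about 3 KiB in flight, or about 45-50 cache lines of 64 bytes, using the example figures above.)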

Here is a simulation of the number of cache lines that can actually be used in my L1 cache, as a function of nj:

 nj  #cache-lines
  1      512
  2      512
  3      512
  4      512
  5      512
  6      512
  7      512
  8      512
  9      512
 10      512
 11      512
 12      512
 13      512
 14      512
 15      512
 16      256    <----
 17      512
 18      512
 19      512
 20      512
 21      512
 22      512
 23      512
 24      512
 25      512
 26      512
 27      512
 28      512
 29      512
 30      512
 31      512
 32      128    <----
 33      512
 34      512
 35      512
 36      512
 37      512
 38      512
 39      512
 40      512
 41      512
 42      512
 43      512
 44      512
 45      512
 46      512
 47      512
 48      256    <----
 49      512
 50      512
 51      512
 52      512
 53      512
 54      512
 55      512
 56      512
 57      512
 58      512
 59      512
 60      512
 61      512
 62      512
 63      512
 64       64    <----
==============
 80      256
 96      128
112      256
128       32
144      256
160      128
176      256
192       64
208      256
224      128
240      256
256       16
384       32
512        8
1024       4
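
For reference, here is a minimal sketch of one simple model that produces numbers of this kind (not necessarily the exact simulation used above): assume a virtually indexed 32 KiB L1 with 64 sets of 8 ways and 64-byte lines, count how many distinct sets the column accesses data[i][0] (byte stride nj * 8) can touch, and multiply by the associativity. It reproduces the 512 / 256 / 128 / ... pattern for most strides, while ignoring replacement policy and prefetching details.

#include <stdint.h>
#include <stdio.h>

// Rough model (an assumption, not the exact simulation above):
// with a 32 KiB, 8-way L1 and 64-byte lines there are 64 sets, and an
// address maps to set (addr / 64) % 64. Count the distinct sets hit by
// the column accesses (byte stride nj * 8) and multiply by the number
// of ways to estimate how many L1 lines remain usable.
static unsigned usable_lines(uint64_t nj)
{
    enum { LINE = 64, SETS = 64, WAYS = 8 };
    uint8_t hit[SETS] = { 0 };
    for (uint64_t i = 0; i < 100000; ++i) {
        uint64_t addr = i * nj * sizeof(double);
        hit[(addr / LINE) % SETS] = 1;
    }
    unsigned sets_hit = 0;
    for (unsigned s = 0; s < SETS; ++s)
        sets_hit += hit[s];
    return sets_hit * WAYS;
}

int main(void)
{
    for (uint64_t nj = 1; nj <= 64; ++nj)
        printf("%3llu %4u\n", (unsigned long long)nj, usable_lines(nj));
    return 0;
}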

We can see that the number of available cache lines is smaller when nj is a multiple of 16. In this case, the prefetcher will preload data into cache lines that are likely to be evicted early by subsequent fetches (done concurrently). Load instructions performed in the code are then more likely to result in cache misses when the number of available cache lines is small. When a cache miss happens, the value needs to be fetched again from the L2 or even the L3, resulting in a slower execution. Note that the L2 cache is also subject to the same effect, though it is less visible since it is larger. The L3 cache of modern x86 processors makes use of hashing to better distribute things and reduce collisions from fixed strides (at least on Intel processors, and certainly on AMD too, though AFAIK this is not documented).

Here are the timings on my machine for some peaks:

  32 4.63600000e-03 4.62298020e-03 4.06400000e-03 4.97300000e-03
  48 4.95800000e-03 4.96994059e-03 4.60400000e-03 5.59800000e-03
  64 5.01600000e-03 5.00479208e-03 4.26900000e-03 5.33100000e-03
  96 4.99300000e-03 5.02284158e-03 4.94700000e-03 5.29700000e-03
 128 5.23300000e-03 5.26405941e-03 4.93200000e-03 5.85100000e-03
 192 4.76900000e-03 4.78833663e-03 4.60100000e-03 5.01600000e-03
 256 5.78500000e-03 5.81666337e-03 5.77600000e-03 6.35300000e-03
 384 5.25900000e-03 5.32504950e-03 5.22800000e-03 6.75800000e-03
 512 5.02700000e-03 5.05165347e-03 5.02100000e-03 5.34400000e-03
1024 5.29200000e-03 5.33059406e-03 5.28700000e-03 5.65700000e-03

As expected, the timings are overall bigger in practice in the cases where the number of available cache lines is much smaller. However, when nj >= 512, the results are surprising since they are significantly faster than the others. This is the case where the number of available cache lines is equal to the number of ways of associativity (N). My guess is that Intel processors detect this pathological case and optimize the prefetching so as to reduce the number of cache misses (using line-fill buffers to bypass the L1 cache -- see below).

Finally, for large nj strides, a bigger nj should result in higher overheads, mainly due to the translation lookaside buffer (TLB): there are more page addresses to translate with a bigger nj, and the number of TLB entries is limited. In fact, this is what I can observe on my machine: timings tend to increase slowly and in a very stable way, unlike on your target platform.
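
(To put rough numbers on this: with 4 KiB pages, the byte stride between the accessed elements is nj * 8, so once nj >= 512 every data[i][0] lies on a different page; the one million rows then touch on the order of a million distinct pages, far more than the couple of thousand 4 KiB entries a typical second-level TLB holds.)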

I cannot really explain this very strange behavior yet.
Here are some wild guesses:

  • The OS could tend to use more huge pages when nj is large (so as to reduce the overhead of the TLB), since wider blocks are allocated. This could result in more concurrency for the prefetcher, as AFAIK it cannot cross page boundaries. You can try to check the number of allocated (transparent) huge pages (by looking at AnonHugePages in /proc/meminfo on Linux), force them to be used in this case (using an explicit mmap, see the sketch after this list), or possibly disable them. My system appears to make use of 2 MiB transparent huge pages independently of the nj value.
  • If the target architecture is a NUMA one (e.g. new AMD processors, or a server with multiple processors having their own memory), then the OS could allocate pages physically stored on another NUMA node because there is less space available on the current NUMA node. This could result in higher performance due to the bigger throughput (though the latency is higher). You can control this policy with numactl on Linux to force local allocations.
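
Regarding the first guess, here is a minimal sketch (an illustration, not the benchmark's actual allocation) of how one could ask Linux to back the matrix with transparent huge pages so their effect can be tested explicitly:

#define _GNU_SOURCE // for MAP_ANONYMOUS / MADV_HUGEPAGE
#include <stddef.h>
#include <stdint.h>
#include <sys/mman.h>

// Allocate the matrix with mmap and hint the kernel to use transparent
// huge pages. madvise is only a hint: check AnonHugePages in
// /proc/meminfo to see whether huge pages were actually used.
static double* alloc_matrix(uint64_t ni, uint64_t nj)
{
    size_t size = (size_t)ni * nj * sizeof(double);
    void* p = mmap(NULL, size, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED)
        return NULL;
    madvise(p, size, MADV_HUGEPAGE);
    return (double*)p;
}

Conversely, madvise with MADV_NOHUGEPAGE can be used to test the behavior with huge pages disabled for that region.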

For more information about this topic, please read the great document What Every Programmer Should Know About Memory. Moreover, a very good post about how x86 cache works in practice is available here.


Removing the peaks

To remove the peaks due to cache thrashing on x86 processors, you can use non-temporal software prefetching instructions, so that cache lines are fetched into a non-temporal cache structure close to the processor which should not cause cache thrashing in the L1 (if possible). Such a cache structure is typically the line-fill buffers (LFB) on Intel processors and the (equivalent) miss address buffers (MAB) on AMD Zen processors. For more information about non-temporal instructions and the LFB, please read this post and this one. Here is the modified code, which also includes a loop unrolling optimization to speed up the code when nj is small:

#include <immintrin.h> // for _mm_prefetch / _MM_HINT_NTA
#include <stdint.h>

double sum_column(uint64_t ni, uint64_t nj, double* const data)
{
    double sum0 = 0.0;
    double sum1 = 0.0;
    double sum2 = 0.0;
    double sum3 = 0.0;

    if(nj % 16 == 0)
    {
        // Cache-bypassing prefetch to avoid cache thrashing
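        // Note: prefetching up to `distance` rows past the end of the
        // array is harmless in practice, since prefetch instructions do
        // not fault on invalid addresses (they are only hints).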
        const size_t distance = 12;
        for (uint64_t i = 0; i < ni; ++i) {
            _mm_prefetch(&data[(i+distance)*nj+0], _MM_HINT_NTA);
            sum0 += data[i*nj+0];
        }
    }
    else
    {
        // Unrolling is much better for small strides
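        // Note: this assumes ni is a multiple of 4 (as in the question,
        // where ni = 1000000); otherwise a scalar tail loop is needed.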
        for (uint64_t i = 0; i < ni; i+=4) {
            sum0 += data[(i+0)*nj+0];
            sum1 += data[(i+1)*nj+0];
            sum2 += data[(i+2)*nj+0];
            sum3 += data[(i+3)*nj+0];
        }
    }
    
    return sum0 + sum1 + sum2 + sum3;
}
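
Note that since this version takes a flat double* instead of the VLA pointer used in the question, the call site would become something like sum_column(ni, nj, &data[0][0]) (an equivalent view of the question's data pointer).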

Here is the result of the modified code:

[figure: performance plot of the optimized code]

We can see that peaks no longer appear in the timings. We can also see that the values are much bigger due to dt0 being about 4 times smaller (due to the loop unrolling).

Note that cache thrashing in the L2 cache is not avoided by this method in practice (at least on Intel processors). This means that the effect is still present on my machine for huge nj strides that are multiples of 512 (4 KiB), and it is actually slower than before, especially when nj >= 2048. It may be a good idea to stop the prefetching when (nj%512) == 0 && nj >= 512 on x86 processors. AFAIK, there is no way to fully address this problem. That being said, it is a very bad idea to perform such big strided accesses on very large data structures anyway.

Note that distance should be chosen carefully, since prefetching too early can result in cache lines being evicted before they are actually used (so they need to be fetched again), while prefetching too late is not very useful. I think using a value close to the number of entries in the LFB/MAB is a good idea (e.g. 12 on Skylake/KabyLake/CannonLake, 22 on Zen-2).
