内存延迟是否会减慢大型数据集的执行速度?

发布于 2025-01-10 18:09:33 字数 9343 浏览 0 评论 0原文

我正在用下面这段代码(使用 GCC 以 -O3 编译)体验 Google Benchmark:

#include <iostream>
#include <thread>
#include <random>
#include <benchmark/benchmark.h>

using namespace std;


// Full per-sample record. The averaging code in this file reads only
// latitude and longitude; the remaining members just add to the record size.
struct PointRecord {
    // Position.
    double latitude;
    double longitude;
    // Speed components.
    double speed_x;
    double speed_y;
    double speed_z;
    // Acceleration components.
    double acceleration_x;
    double acceleration_y;
    double acceleration_z;
    // NOTE(review): unit/epoch of the timestamp is not specified here — confirm.
    long timestamp;
};


// Compact record carrying only the two coordinates that get averaged.
struct PointGpsRecord {
    double latitude;
    double longitude;
};


tuple<double, double> avg_pos_obj_oriented(vector<PointRecord>& data){
    double mean_lon = 0.0;
    double mean_lat = 0.0;
    auto size = data.size();
    for(auto& point : data){
        mean_lat += point.latitude;
        mean_lon += point.longitude;
    }
    return make_tuple(mean_lat / size, mean_lon / size);
}

// Benchmark body: builds `state.range(0)` PointRecord objects with random
// coordinates, then times avg_pos_obj_oriented over them.
//
// Reporting:
//  - SetBytesProcessed counts only the two doubles read per record; this
//    feeds the bytes_per_second column.
//  - The "BytesProcessed" counter holds the full footprint of the vector
//    (count * sizeof(PointRecord)), using 1024-based unit prefixes.
static void BM_object_oriented(benchmark::State &state) {
    // Seeded from random_device, so the generated data differs between runs.
    std::random_device seed_source;
    std::mt19937 engine{seed_source()};
    std::uniform_real_distribution<double> coord{-10, 10};

    int count = state.range(0);

    // Setup (not timed): fill the vector before entering the state loop.
    std::vector<PointRecord> points;
    points.reserve(count);
    for (int remaining = count; remaining > 0; --remaining) {
        PointRecord rec{};  // value-initialised: unused members stay zero
        rec.latitude = coord(engine);
        rec.longitude = coord(engine);
        rec.timestamp = 12345678;
        points.push_back(rec);
    }

    // Timed region.
    for (auto _ : state) {
        auto mean = avg_pos_obj_oriented(points);
        // Keep the result alive so the computation is not optimised away.
        benchmark::DoNotOptimize(mean);
    }

    const int64_t bytes_per_pass = int64_t(sizeof(double)*2*count);
    const int64_t passes = int64_t(state.iterations());
    state.SetBytesProcessed(passes * bytes_per_pass);

    state.counters["BytesProcessed"] = benchmark::Counter(
        count*sizeof(PointRecord),
        benchmark::Counter::kDefaults,
        benchmark::Counter::OneK::kIs1024);
}


tuple<double,double> avg_vec_data_oriented(vector<PointGpsRecord>& data, uint size){
    double sum_lat = 0.0;
    double sum_lon = 0.0;
    for(int i = 0; i < size; i++){
        sum_lat += data[i].latitude;
        sum_lon += data[i].longitude;
    }
    return make_tuple(sum_lat/size, sum_lon/size);
}

// Benchmark body: builds `state.range(0)` PointGpsRecord objects with random
// coordinates, then times avg_vec_data_oriented over them.
//
// Both SetBytesProcessed (per pass: two doubles per record) and the
// "BytesProcessed" counter (count * sizeof(PointGpsRecord), 1024-based
// prefixes) are derived from the element count.
static void BM_data_oriented(benchmark::State &state) {
    // Seeded from random_device, so the generated data differs between runs.
    std::random_device seed_source;
    std::mt19937 engine{seed_source()};
    std::uniform_real_distribution<double> coord{-10, 10};

    int count = state.range(0);

    // Setup (not timed): fill the vector before entering the state loop.
    std::vector<PointGpsRecord> records;
    records.reserve(count);
    for (int remaining = count; remaining > 0; --remaining) {
        PointGpsRecord rec{};
        rec.latitude = coord(engine);
        rec.longitude = coord(engine);
        records.push_back(rec);
    }

    // Timed region.
    for (auto _ : state) {
        std::tuple<double,double> mean = avg_vec_data_oriented(records, count);
        // Keep the result alive so the computation is not optimised away.
        benchmark::DoNotOptimize(mean);
    }

    int64_t bytes_per_pass = sizeof(double) * 2 * count;
    int64_t passes = state.iterations();
    state.SetBytesProcessed(passes * bytes_per_pass);

    state.counters["BytesProcessed"] = benchmark::Counter(
        count*sizeof(PointGpsRecord),
        benchmark::Counter::kDefaults,
        benchmark::Counter::OneK::kIs1024);
}


// Register both benchmarks with identical settings: times reported in
// microseconds, element count swept from 64 to 1 << 25 with multiplier 4.
BENCHMARK(BM_object_oriented)
    ->Unit(benchmark::kMicrosecond)
    ->RangeMultiplier(4)
    ->Range(64, 1 << 25);
BENCHMARK(BM_data_oriented)
    ->Unit(benchmark::kMicrosecond)
    ->RangeMultiplier(4)
    ->Range(64, 1 << 25);


// Supplies the program entry point via the benchmark library's macro.
BENCHMARK_MAIN();

结果如下:

Run on (16 X 5000 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x8)
  L1 Instruction 32 KiB (x8)
  L2 Unified 256 KiB (x8)
  L3 Unified 16384 KiB (x1)
Load Average: 1.29, 1.01, 0.84
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
--------------------------------------------------------------------------------------
Benchmark                            Time             CPU   Iterations UserCounters...
--------------------------------------------------------------------------------------
BM_object_oriented/64            0.039 us        0.039 us     17779654 BytesProcessed=4.5k bytes_per_second=24.5771G/s
BM_object_oriented/256           0.213 us        0.213 us      3300654 BytesProcessed=18k bytes_per_second=17.8812G/s
BM_object_oriented/1024          0.906 us        0.906 us       770645 BytesProcessed=72k bytes_per_second=16.8411G/s
BM_object_oriented/4096           3.84 us         3.84 us       180761 BytesProcessed=288k bytes_per_second=15.902G/s
BM_object_oriented/16384          18.8 us         18.8 us        37463 BytesProcessed=1.125M bytes_per_second=13.0112G/s
BM_object_oriented/65536          75.1 us         75.1 us         9406 BytesProcessed=4.5M bytes_per_second=13.008G/s
BM_object_oriented/262144          529 us          529 us         1355 BytesProcessed=18M bytes_per_second=7.37992G/s
BM_object_oriented/1048576        3072 us         3072 us          226 BytesProcessed=72M bytes_per_second=5.08572G/s
BM_object_oriented/4194304       12958 us        12957 us           55 BytesProcessed=288M bytes_per_second=4.82362G/s
BM_object_oriented/16777216      52470 us        52467 us           13 BytesProcessed=1.125G bytes_per_second=4.76492G/s
BM_object_oriented/33554432     104667 us       104662 us            7 BytesProcessed=2.25G bytes_per_second=4.77726G/s
BM_data_oriented/64              0.038 us        0.038 us     18196596 BytesProcessed=1024 bytes_per_second=25.0373G/s
BM_data_oriented/256             0.211 us        0.211 us      3312330 BytesProcessed=4k bytes_per_second=18.057G/s
BM_data_oriented/1024            0.898 us        0.898 us       776186 BytesProcessed=16k bytes_per_second=16.9891G/s
BM_data_oriented/4096             3.64 us         3.64 us       193013 BytesProcessed=64k bytes_per_second=16.7622G/s
BM_data_oriented/16384            14.6 us         14.6 us        47183 BytesProcessed=256k bytes_per_second=16.7451G/s
BM_data_oriented/65536            58.3 us         58.3 us        11894 BytesProcessed=1024k bytes_per_second=16.7614G/s
BM_data_oriented/262144            233 us          233 us         2970 BytesProcessed=4M bytes_per_second=16.7626G/s
BM_data_oriented/1048576          1131 us         1131 us          611 BytesProcessed=16M bytes_per_second=13.8116G/s
BM_data_oriented/4194304          4910 us         4910 us          145 BytesProcessed=64M bytes_per_second=12.7299G/s
BM_data_oriented/16777216        19468 us        19468 us           36 BytesProcessed=256M bytes_per_second=12.8415G/s
BM_data_oriented/33554432        39500 us        39497 us           17 BytesProcessed=512M bytes_per_second=12.6591G/s

因此,我们可以观察到,随着各级缓存被逐级填满,数据处理速率也随之下降。

我原以为,有了多级缓存,CPU 可以通过预取数据,在数据规模增大时维持更高的速率。所以我有几个问题:RAM 确实是这里的瓶颈吗?

有什么办法可以提高数据处理速率(仍然针对大规模数据)?

另外,既然各个核心仍然得不到足够的数据供给,使用某种 CPU 并行化是否就没有用了?

编辑:另外,这里是 OpenMP 并行版本及其结果:

tuple<double,double> avg_vec_data_oriented_parallel(vector<PointGpsRecord>& data, uint size){
    double sum_lat = 0.0;
    double sum_lon = 0.0;

#pragma omp parallel for reduction(+:sum_lat, sum_lon)
    for(auto& p: data){
        sum_lat += p.latitude;
        sum_lon += p.longitude;
    }
    return make_tuple(sum_lat/size, sum_lon/size);
}

// Benchmark body: builds `state.range(0)` PointGpsRecord objects with random
// coordinates, then times avg_vec_data_oriented_parallel over them.
//
// The vector's byte footprint (sizeof(PointGpsRecord) * count) is used both
// for SetBytesProcessed (multiplied by the iteration count) and for the
// "BytesProcessed" counter (1024-based prefixes).
static void BM_data_oriented_parallel(benchmark::State &state) {
    // Seeded from random_device, so the generated data differs between runs.
    std::random_device seed_source;
    std::mt19937 engine{seed_source()};
    std::uniform_real_distribution<double> coord{-10, 10};

    int count = state.range(0);

    // Setup (not timed): fill the vector before entering the state loop.
    std::vector<PointGpsRecord> records;
    records.reserve(count);
    for (int remaining = count; remaining > 0; --remaining) {
        PointGpsRecord rec{};
        rec.latitude = coord(engine);
        rec.longitude = coord(engine);
        records.push_back(rec);
    }

    // Timed region.
    for (auto _ : state) {
        std::tuple<double,double> mean = avg_vec_data_oriented_parallel(records, count);
        // Keep the result alive so the computation is not optimised away.
        benchmark::DoNotOptimize(mean);
    }

    int64_t footprint = sizeof(PointGpsRecord) * count;
    int64_t passes = state.iterations();
    state.SetBytesProcessed(passes * footprint);

    state.counters["BytesProcessed"] = benchmark::Counter(
        footprint,
        benchmark::Counter::kDefaults,
        benchmark::Counter::OneK::kIs1024);
}



BM_data_oriented_parallel/64             5.39 us         5.38 us       148050 BytesProcessed=1024 bytes_per_second=181.657M/s
BM_data_oriented_parallel/256            4.25 us         4.24 us       154501 BytesProcessed=4k bytes_per_second=920.244M/s
BM_data_oriented_parallel/1024           4.43 us         4.42 us       161006 BytesProcessed=16k bytes_per_second=3.44957G/s
BM_data_oriented_parallel/4096           5.60 us         5.38 us       100000 BytesProcessed=64k bytes_per_second=11.3523G/s
BM_data_oriented_parallel/16384          5.97 us         5.76 us       104429 BytesProcessed=256k bytes_per_second=42.3808G/s
BM_data_oriented_parallel/65536          11.2 us         11.0 us        65653 BytesProcessed=1024k bytes_per_second=89.0536G/s
BM_data_oriented_parallel/262144         31.9 us         31.7 us        18759 BytesProcessed=4M bytes_per_second=123.122G/s
BM_data_oriented_parallel/1048576         113 us          112 us         5247 BytesProcessed=16M bytes_per_second=139.253G/s
BM_data_oriented_parallel/4194304        1671 us         1661 us          433 BytesProcessed=64M bytes_per_second=37.6235G/s
BM_data_oriented_parallel/16777216       7411 us         7406 us           92 BytesProcessed=256M bytes_per_second=33.7556G/s
BM_data_oriented_parallel/33554432      15007 us        15005 us           47 BytesProcessed=512M bytes_per_second=33.3221G/s

请注意,当 L3 缓存已满时,它会达到峰值。

I was having fun with Google Benchmark using this code (compiled with GCC at -O3):

#include <iostream>
#include <thread>
#include <random>
#include <benchmark/benchmark.h>

using namespace std;


// Full per-sample record. The averaging code in this file reads only
// latitude and longitude; the remaining members just add to the record size.
struct PointRecord {
    // Position.
    double latitude;
    double longitude;
    // Speed components.
    double speed_x;
    double speed_y;
    double speed_z;
    // Acceleration components.
    double acceleration_x;
    double acceleration_y;
    double acceleration_z;
    // NOTE(review): unit/epoch of the timestamp is not specified here — confirm.
    long timestamp;
};


// Compact record carrying only the two coordinates that get averaged.
struct PointGpsRecord {
    double latitude;
    double longitude;
};


tuple<double, double> avg_pos_obj_oriented(vector<PointRecord>& data){
    double mean_lon = 0.0;
    double mean_lat = 0.0;
    auto size = data.size();
    for(auto& point : data){
        mean_lat += point.latitude;
        mean_lon += point.longitude;
    }
    return make_tuple(mean_lat / size, mean_lon / size);
}

// Benchmark body: builds `state.range(0)` PointRecord objects with random
// coordinates, then times avg_pos_obj_oriented over them.
//
// Reporting:
//  - SetBytesProcessed counts only the two doubles read per record; this
//    feeds the bytes_per_second column.
//  - The "BytesProcessed" counter holds the full footprint of the vector
//    (count * sizeof(PointRecord)), using 1024-based unit prefixes.
static void BM_object_oriented(benchmark::State &state) {
    // Seeded from random_device, so the generated data differs between runs.
    std::random_device seed_source;
    std::mt19937 engine{seed_source()};
    std::uniform_real_distribution<double> coord{-10, 10};

    int count = state.range(0);

    // Setup (not timed): fill the vector before entering the state loop.
    std::vector<PointRecord> points;
    points.reserve(count);
    for (int remaining = count; remaining > 0; --remaining) {
        PointRecord rec{};  // value-initialised: unused members stay zero
        rec.latitude = coord(engine);
        rec.longitude = coord(engine);
        rec.timestamp = 12345678;
        points.push_back(rec);
    }

    // Timed region.
    for (auto _ : state) {
        auto mean = avg_pos_obj_oriented(points);
        // Keep the result alive so the computation is not optimised away.
        benchmark::DoNotOptimize(mean);
    }

    const int64_t bytes_per_pass = int64_t(sizeof(double)*2*count);
    const int64_t passes = int64_t(state.iterations());
    state.SetBytesProcessed(passes * bytes_per_pass);

    state.counters["BytesProcessed"] = benchmark::Counter(
        count*sizeof(PointRecord),
        benchmark::Counter::kDefaults,
        benchmark::Counter::OneK::kIs1024);
}


tuple<double,double> avg_vec_data_oriented(vector<PointGpsRecord>& data, uint size){
    double sum_lat = 0.0;
    double sum_lon = 0.0;
    for(int i = 0; i < size; i++){
        sum_lat += data[i].latitude;
        sum_lon += data[i].longitude;
    }
    return make_tuple(sum_lat/size, sum_lon/size);
}

// Benchmark body: builds `state.range(0)` PointGpsRecord objects with random
// coordinates, then times avg_vec_data_oriented over them.
//
// Both SetBytesProcessed (per pass: two doubles per record) and the
// "BytesProcessed" counter (count * sizeof(PointGpsRecord), 1024-based
// prefixes) are derived from the element count.
static void BM_data_oriented(benchmark::State &state) {
    // Seeded from random_device, so the generated data differs between runs.
    std::random_device seed_source;
    std::mt19937 engine{seed_source()};
    std::uniform_real_distribution<double> coord{-10, 10};

    int count = state.range(0);

    // Setup (not timed): fill the vector before entering the state loop.
    std::vector<PointGpsRecord> records;
    records.reserve(count);
    for (int remaining = count; remaining > 0; --remaining) {
        PointGpsRecord rec{};
        rec.latitude = coord(engine);
        rec.longitude = coord(engine);
        records.push_back(rec);
    }

    // Timed region.
    for (auto _ : state) {
        std::tuple<double,double> mean = avg_vec_data_oriented(records, count);
        // Keep the result alive so the computation is not optimised away.
        benchmark::DoNotOptimize(mean);
    }

    int64_t bytes_per_pass = sizeof(double) * 2 * count;
    int64_t passes = state.iterations();
    state.SetBytesProcessed(passes * bytes_per_pass);

    state.counters["BytesProcessed"] = benchmark::Counter(
        count*sizeof(PointGpsRecord),
        benchmark::Counter::kDefaults,
        benchmark::Counter::OneK::kIs1024);
}


// Register both benchmarks with identical settings: times reported in
// microseconds, element count swept from 64 to 1 << 25 with multiplier 4.
BENCHMARK(BM_object_oriented)
    ->Unit(benchmark::kMicrosecond)
    ->RangeMultiplier(4)
    ->Range(64, 1 << 25);
BENCHMARK(BM_data_oriented)
    ->Unit(benchmark::kMicrosecond)
    ->RangeMultiplier(4)
    ->Range(64, 1 << 25);


// Supplies the program entry point via the benchmark library's macro.
BENCHMARK_MAIN();

With these results:

Run on (16 X 5000 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x8)
  L1 Instruction 32 KiB (x8)
  L2 Unified 256 KiB (x8)
  L3 Unified 16384 KiB (x1)
Load Average: 1.29, 1.01, 0.84
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
--------------------------------------------------------------------------------------
Benchmark                            Time             CPU   Iterations UserCounters...
--------------------------------------------------------------------------------------
BM_object_oriented/64            0.039 us        0.039 us     17779654 BytesProcessed=4.5k bytes_per_second=24.5771G/s
BM_object_oriented/256           0.213 us        0.213 us      3300654 BytesProcessed=18k bytes_per_second=17.8812G/s
BM_object_oriented/1024          0.906 us        0.906 us       770645 BytesProcessed=72k bytes_per_second=16.8411G/s
BM_object_oriented/4096           3.84 us         3.84 us       180761 BytesProcessed=288k bytes_per_second=15.902G/s
BM_object_oriented/16384          18.8 us         18.8 us        37463 BytesProcessed=1.125M bytes_per_second=13.0112G/s
BM_object_oriented/65536          75.1 us         75.1 us         9406 BytesProcessed=4.5M bytes_per_second=13.008G/s
BM_object_oriented/262144          529 us          529 us         1355 BytesProcessed=18M bytes_per_second=7.37992G/s
BM_object_oriented/1048576        3072 us         3072 us          226 BytesProcessed=72M bytes_per_second=5.08572G/s
BM_object_oriented/4194304       12958 us        12957 us           55 BytesProcessed=288M bytes_per_second=4.82362G/s
BM_object_oriented/16777216      52470 us        52467 us           13 BytesProcessed=1.125G bytes_per_second=4.76492G/s
BM_object_oriented/33554432     104667 us       104662 us            7 BytesProcessed=2.25G bytes_per_second=4.77726G/s
BM_data_oriented/64              0.038 us        0.038 us     18196596 BytesProcessed=1024 bytes_per_second=25.0373G/s
BM_data_oriented/256             0.211 us        0.211 us      3312330 BytesProcessed=4k bytes_per_second=18.057G/s
BM_data_oriented/1024            0.898 us        0.898 us       776186 BytesProcessed=16k bytes_per_second=16.9891G/s
BM_data_oriented/4096             3.64 us         3.64 us       193013 BytesProcessed=64k bytes_per_second=16.7622G/s
BM_data_oriented/16384            14.6 us         14.6 us        47183 BytesProcessed=256k bytes_per_second=16.7451G/s
BM_data_oriented/65536            58.3 us         58.3 us        11894 BytesProcessed=1024k bytes_per_second=16.7614G/s
BM_data_oriented/262144            233 us          233 us         2970 BytesProcessed=4M bytes_per_second=16.7626G/s
BM_data_oriented/1048576          1131 us         1131 us          611 BytesProcessed=16M bytes_per_second=13.8116G/s
BM_data_oriented/4194304          4910 us         4910 us          145 BytesProcessed=64M bytes_per_second=12.7299G/s
BM_data_oriented/16777216        19468 us        19468 us           36 BytesProcessed=256M bytes_per_second=12.8415G/s
BM_data_oriented/33554432        39500 us        39497 us           17 BytesProcessed=512M bytes_per_second=12.6591G/s

So, we can observe the data processing rate decreasing in step with the filling of the successive cache levels.

I thought that, with the various caches, the CPU could sustain a better rate as the size increases, by prefetching the data. So I have some questions: is RAM definitely the bottleneck here?

Is there any way to speed up the data rate (still on big data)?

Then, would using some kind of CPU parallelization be useless, since the cores would still not be fed with data fast enough?

EDIT: also, here is an OpenMP parallel version and its results:

tuple<double,double> avg_vec_data_oriented_parallel(vector<PointGpsRecord>& data, uint size){
    double sum_lat = 0.0;
    double sum_lon = 0.0;

#pragma omp parallel for reduction(+:sum_lat, sum_lon)
    for(auto& p: data){
        sum_lat += p.latitude;
        sum_lon += p.longitude;
    }
    return make_tuple(sum_lat/size, sum_lon/size);
}

// Benchmark body: builds `state.range(0)` PointGpsRecord objects with random
// coordinates, then times avg_vec_data_oriented_parallel over them.
//
// The vector's byte footprint (sizeof(PointGpsRecord) * count) is used both
// for SetBytesProcessed (multiplied by the iteration count) and for the
// "BytesProcessed" counter (1024-based prefixes).
static void BM_data_oriented_parallel(benchmark::State &state) {
    // Seeded from random_device, so the generated data differs between runs.
    std::random_device seed_source;
    std::mt19937 engine{seed_source()};
    std::uniform_real_distribution<double> coord{-10, 10};

    int count = state.range(0);

    // Setup (not timed): fill the vector before entering the state loop.
    std::vector<PointGpsRecord> records;
    records.reserve(count);
    for (int remaining = count; remaining > 0; --remaining) {
        PointGpsRecord rec{};
        rec.latitude = coord(engine);
        rec.longitude = coord(engine);
        records.push_back(rec);
    }

    // Timed region.
    for (auto _ : state) {
        std::tuple<double,double> mean = avg_vec_data_oriented_parallel(records, count);
        // Keep the result alive so the computation is not optimised away.
        benchmark::DoNotOptimize(mean);
    }

    int64_t footprint = sizeof(PointGpsRecord) * count;
    int64_t passes = state.iterations();
    state.SetBytesProcessed(passes * footprint);

    state.counters["BytesProcessed"] = benchmark::Counter(
        footprint,
        benchmark::Counter::kDefaults,
        benchmark::Counter::OneK::kIs1024);
}



BM_data_oriented_parallel/64             5.39 us         5.38 us       148050 BytesProcessed=1024 bytes_per_second=181.657M/s
BM_data_oriented_parallel/256            4.25 us         4.24 us       154501 BytesProcessed=4k bytes_per_second=920.244M/s
BM_data_oriented_parallel/1024           4.43 us         4.42 us       161006 BytesProcessed=16k bytes_per_second=3.44957G/s
BM_data_oriented_parallel/4096           5.60 us         5.38 us       100000 BytesProcessed=64k bytes_per_second=11.3523G/s
BM_data_oriented_parallel/16384          5.97 us         5.76 us       104429 BytesProcessed=256k bytes_per_second=42.3808G/s
BM_data_oriented_parallel/65536          11.2 us         11.0 us        65653 BytesProcessed=1024k bytes_per_second=89.0536G/s
BM_data_oriented_parallel/262144         31.9 us         31.7 us        18759 BytesProcessed=4M bytes_per_second=123.122G/s
BM_data_oriented_parallel/1048576         113 us          112 us         5247 BytesProcessed=16M bytes_per_second=139.253G/s
BM_data_oriented_parallel/4194304        1671 us         1661 us          433 BytesProcessed=64M bytes_per_second=37.6235G/s
BM_data_oriented_parallel/16777216       7411 us         7406 us           92 BytesProcessed=256M bytes_per_second=33.7556G/s
BM_data_oriented_parallel/33554432      15007 us        15005 us           47 BytesProcessed=512M bytes_per_second=33.3221G/s

Note that it peaks when the L3 cache is full.

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。
列表为空,暂无数据
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文