内存延迟是否会减慢大型数据集的执行速度?
我在试玩 Google Benchmark 时写了这段代码(使用 GCC 以 -O3 编译):
#include <iostream>
#include <thread>
#include <random>
#include <benchmark/benchmark.h>
using namespace std;
// Array-of-structs ("object oriented") record: 8 doubles + 1 long.
// The benchmark output shows 72 bytes per record (4.5k for 64 records),
// yet the averaging loop only reads latitude/longitude — the rest is
// wasted memory traffic.
struct PointRecord{
double latitude, longitude;
double speed_x, speed_y, speed_z;
double acceleration_x, acceleration_y, acceleration_z;
long timestamp; // NOTE(review): `long` is 4 bytes on LLP64 (Windows); int64_t would pin the layout — confirm target
};
// Data-oriented record: only the two fields the average actually needs.
// 16 bytes per record, so every byte loaded from memory is useful.
struct PointGpsRecord{
double latitude, longitude;
};
// Arithmetic mean of latitude/longitude over all records.
// Only 2 of the 9 fields of each PointRecord are read, so the memory
// system still streams entire structs through the caches.
// Returns {mean_latitude, mean_longitude}; {0.0, 0.0} for an empty
// input (the original divided by zero and returned NaN/NaN).
tuple<double, double> avg_pos_obj_oriented(vector<PointRecord>& data){
    auto size = data.size();
    if(size == 0){
        return make_tuple(0.0, 0.0);  // guard: avoid 0/0 -> NaN
    }
    // Accumulators hold running sums; divided once at the end.
    double sum_lat = 0.0;
    double sum_lon = 0.0;
    for(const auto& point : data){
        sum_lat += point.latitude;
        sum_lon += point.longitude;
    }
    return make_tuple(sum_lat / size, sum_lon / size);
}
// Benchmark fixture: averaging over the fat 72-byte AoS layout.
static void BM_object_oriented(benchmark::State &state) {
    random_device rnd_device;
    mt19937 rng{rnd_device()};
    uniform_real_distribution<double> dist{-10, 10};
    int size = state.range(0);

    // Build the input once, outside the timed region.
    vector<PointRecord> points;
    points.reserve(size);
    for(int i = 0; i < size; i++){
        PointRecord point{};
        point.latitude = dist(rng);
        point.longitude = dist(rng);
        point.timestamp = 12345678;
        points.push_back(point);
    }

    // Timed section: the averaging only.
    for (auto _: state) {
        auto p = avg_pos_obj_oriented(points);
        benchmark::DoNotOptimize(p);  // keep the result alive under -O3
    }

    // bytes_per_second counts only the "useful" bytes: the two doubles
    // actually summed per record.
    int64_t useful_bytes = int64_t(sizeof(double)) * 2 * size;
    state.SetBytesProcessed(int64_t(state.iterations()) * useful_bytes);
    // The user counter instead reports the full working-set footprint
    // (whole structs), printed with 1024-based units.
    state.counters["BytesProcessed"] = benchmark::Counter(size*sizeof(PointRecord), benchmark::Counter::kDefaults, benchmark::Counter::OneK::kIs1024);
}
// Mean latitude/longitude over the first `size` entries of `data`.
// Every byte of each 16-byte PointGpsRecord is consumed, so measured
// bandwidth equals useful bandwidth.
// Returns {mean_latitude, mean_longitude}; {0.0, 0.0} when size == 0
// (the original divided by zero and returned NaN/NaN).
tuple<double,double> avg_vec_data_oriented(vector<PointGpsRecord>& data, uint size){
    if(size == 0){
        return make_tuple(0.0, 0.0);  // guard: avoid 0/0 -> NaN
    }
    double sum_lat = 0.0;
    double sum_lon = 0.0;
    // uint index matches the parameter type: the original `int i` mixed
    // signed/unsigned in the comparison and would misbehave past INT_MAX.
    for(uint i = 0; i < size; i++){
        sum_lat += data[i].latitude;
        sum_lon += data[i].longitude;
    }
    return make_tuple(sum_lat/size, sum_lon/size);
}
// Benchmark fixture: averaging over the slim 16-byte data-oriented layout.
static void BM_data_oriented(benchmark::State &state) {
    random_device rnd_device;
    mt19937 rng{rnd_device()};
    uniform_real_distribution<double> dist{-10, 10};
    int size = state.range(0);

    // Input is generated before timing starts.
    vector<PointGpsRecord> records;
    records.reserve(size);
    for(int i = 0; i < size; i++){
        PointGpsRecord point{};
        point.latitude = dist(rng);
        point.longitude = dist(rng);
        records.push_back(point);
    }

    for (auto _: state) {
        auto avg = avg_vec_data_oriented(records, size);
        benchmark::DoNotOptimize(avg);  // defeat dead-code elimination
    }

    // Here useful bytes (2 doubles) and record footprint coincide.
    state.SetBytesProcessed(int64_t(state.iterations()) *
                            int64_t(sizeof(double) * 2 * size));
    state.counters["BytesProcessed"] = benchmark::Counter(size*sizeof(PointGpsRecord), benchmark::Counter::kDefaults, benchmark::Counter::OneK::kIs1024);
}
// Register both benchmarks over sizes 64 .. 1<<25 (x4 per step), sweeping
// the working set from L1-resident to far beyond the 16 MiB L3.
BENCHMARK(BM_object_oriented)->Unit(benchmark::kMicrosecond)->RangeMultiplier(4)->Range(64, 1 << 25);
BENCHMARK(BM_data_oriented)->Unit(benchmark::kMicrosecond)->RangeMultiplier(4)->Range(64, 1 << 25);
BENCHMARK_MAIN();
结果如下:
Run on (16 X 5000 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x8)
L1 Instruction 32 KiB (x8)
L2 Unified 256 KiB (x8)
L3 Unified 16384 KiB (x1)
Load Average: 1.29, 1.01, 0.84
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
--------------------------------------------------------------------------------------
Benchmark Time CPU Iterations UserCounters...
--------------------------------------------------------------------------------------
BM_object_oriented/64 0.039 us 0.039 us 17779654 BytesProcessed=4.5k bytes_per_second=24.5771G/s
BM_object_oriented/256 0.213 us 0.213 us 3300654 BytesProcessed=18k bytes_per_second=17.8812G/s
BM_object_oriented/1024 0.906 us 0.906 us 770645 BytesProcessed=72k bytes_per_second=16.8411G/s
BM_object_oriented/4096 3.84 us 3.84 us 180761 BytesProcessed=288k bytes_per_second=15.902G/s
BM_object_oriented/16384 18.8 us 18.8 us 37463 BytesProcessed=1.125M bytes_per_second=13.0112G/s
BM_object_oriented/65536 75.1 us 75.1 us 9406 BytesProcessed=4.5M bytes_per_second=13.008G/s
BM_object_oriented/262144 529 us 529 us 1355 BytesProcessed=18M bytes_per_second=7.37992G/s
BM_object_oriented/1048576 3072 us 3072 us 226 BytesProcessed=72M bytes_per_second=5.08572G/s
BM_object_oriented/4194304 12958 us 12957 us 55 BytesProcessed=288M bytes_per_second=4.82362G/s
BM_object_oriented/16777216 52470 us 52467 us 13 BytesProcessed=1.125G bytes_per_second=4.76492G/s
BM_object_oriented/33554432 104667 us 104662 us 7 BytesProcessed=2.25G bytes_per_second=4.77726G/s
BM_data_oriented/64 0.038 us 0.038 us 18196596 BytesProcessed=1024 bytes_per_second=25.0373G/s
BM_data_oriented/256 0.211 us 0.211 us 3312330 BytesProcessed=4k bytes_per_second=18.057G/s
BM_data_oriented/1024 0.898 us 0.898 us 776186 BytesProcessed=16k bytes_per_second=16.9891G/s
BM_data_oriented/4096 3.64 us 3.64 us 193013 BytesProcessed=64k bytes_per_second=16.7622G/s
BM_data_oriented/16384 14.6 us 14.6 us 47183 BytesProcessed=256k bytes_per_second=16.7451G/s
BM_data_oriented/65536 58.3 us 58.3 us 11894 BytesProcessed=1024k bytes_per_second=16.7614G/s
BM_data_oriented/262144 233 us 233 us 2970 BytesProcessed=4M bytes_per_second=16.7626G/s
BM_data_oriented/1048576 1131 us 1131 us 611 BytesProcessed=16M bytes_per_second=13.8116G/s
BM_data_oriented/4194304 4910 us 4910 us 145 BytesProcessed=64M bytes_per_second=12.7299G/s
BM_data_oriented/16777216 19468 us 19468 us 36 BytesProcessed=256M bytes_per_second=12.8415G/s
BM_data_oriented/33554432 39500 us 39497 us 17 BytesProcessed=512M bytes_per_second=12.6591G/s
因此,我们可以观察到:随着各级缓存逐渐被填满,数据处理速率也随之下降。
我原以为,借助多级缓存,CPU 可以在数据规模增大时通过预取数据来维持更好的速率。所以我有几个问题:RAM 在这里肯定是瓶颈吗?
有什么办法可以加快数据速率(还是大数据)?
那么,既然各个内核仍然得不到足够的数据供给(not fed),使用某种 CPU 并行化是否就没有用了?
编辑:另外,这里是 OpenMP 并行版本及其结果:
// OpenMP-parallel mean of latitude/longitude over the first `size` entries.
// Returns {mean_latitude, mean_longitude}; {0.0, 0.0} when size == 0
// (the original divided by zero and returned NaN/NaN).
tuple<double,double> avg_vec_data_oriented_parallel(vector<PointGpsRecord>& data, uint size){
    if(size == 0){
        return make_tuple(0.0, 0.0);  // guard: avoid 0/0 -> NaN
    }
    double sum_lat = 0.0;
    double sum_lon = 0.0;
    // Index loop instead of range-for: range-based for is only an OpenMP
    // "canonical" loop from OpenMP 5.0 (GCC 9+), so this form is portable
    // to older toolchains. It also sums exactly the `size` elements we
    // divide by — the original summed all of `data` but divided by the
    // separate `size` parameter, which disagrees if the two differ.
    long long n = (long long)size;
    #pragma omp parallel for reduction(+:sum_lat, sum_lon)
    for(long long i = 0; i < n; i++){
        sum_lat += data[i].latitude;
        sum_lon += data[i].longitude;
    }
    return make_tuple(sum_lat/size, sum_lon/size);
}
// Benchmark fixture: OpenMP-parallel averaging over PointGpsRecord.
static void BM_data_oriented_parallel(benchmark::State &state) {
    random_device rnd_device;
    mt19937 rng{rnd_device()};
    uniform_real_distribution<double> dist{-10, 10};
    int size = state.range(0);

    // Fill the input before the measured loop.
    vector<PointGpsRecord> records;
    records.reserve(size);
    for(int i = 0; i < size; i++){
        PointGpsRecord point{};
        point.latitude = dist(rng);
        point.longitude = dist(rng);
        records.push_back(point);
    }

    for (auto _: state) {
        auto avg = avg_vec_data_oriented_parallel(records, size);
        benchmark::DoNotOptimize(avg);  // keep the result observable under -O3
    }

    // Both throughput metrics use the full record footprint here.
    int64_t footprint = int64_t(sizeof(PointGpsRecord)) * size;
    state.SetBytesProcessed(int64_t(state.iterations()) * footprint);
    state.counters["BytesProcessed"] = benchmark::Counter(footprint, benchmark::Counter::kDefaults, benchmark::Counter::OneK::kIs1024);
}
BM_data_oriented_parallel/64 5.39 us 5.38 us 148050 BytesProcessed=1024 bytes_per_second=181.657M/s
BM_data_oriented_parallel/256 4.25 us 4.24 us 154501 BytesProcessed=4k bytes_per_second=920.244M/s
BM_data_oriented_parallel/1024 4.43 us 4.42 us 161006 BytesProcessed=16k bytes_per_second=3.44957G/s
BM_data_oriented_parallel/4096 5.60 us 5.38 us 100000 BytesProcessed=64k bytes_per_second=11.3523G/s
BM_data_oriented_parallel/16384 5.97 us 5.76 us 104429 BytesProcessed=256k bytes_per_second=42.3808G/s
BM_data_oriented_parallel/65536 11.2 us 11.0 us 65653 BytesProcessed=1024k bytes_per_second=89.0536G/s
BM_data_oriented_parallel/262144 31.9 us 31.7 us 18759 BytesProcessed=4M bytes_per_second=123.122G/s
BM_data_oriented_parallel/1048576 113 us 112 us 5247 BytesProcessed=16M bytes_per_second=139.253G/s
BM_data_oriented_parallel/4194304 1671 us 1661 us 433 BytesProcessed=64M bytes_per_second=37.6235G/s
BM_data_oriented_parallel/16777216 7411 us 7406 us 92 BytesProcessed=256M bytes_per_second=33.7556G/s
BM_data_oriented_parallel/33554432 15007 us 15005 us 47 BytesProcessed=512M bytes_per_second=33.3221G/s
请注意,当 L3 缓存已满时,它会达到峰值。
I was having fun with Google Benchmark with this code (compiled in -O3 with GCC):
#include <iostream>
#include <thread>
#include <random>
#include <benchmark/benchmark.h>
using namespace std;
// Array-of-structs ("object oriented") record: 8 doubles + 1 long.
// The benchmark output shows 72 bytes per record (4.5k for 64 records),
// yet the averaging loop only reads latitude/longitude — the rest is
// wasted memory traffic.
struct PointRecord{
double latitude, longitude;
double speed_x, speed_y, speed_z;
double acceleration_x, acceleration_y, acceleration_z;
long timestamp; // NOTE(review): `long` is 4 bytes on LLP64 (Windows); int64_t would pin the layout — confirm target
};
// Data-oriented record: only the two fields the average actually needs.
// 16 bytes per record, so every byte loaded from memory is useful.
struct PointGpsRecord{
double latitude, longitude;
};
// Arithmetic mean of latitude/longitude over all records.
// Only 2 of the 9 fields of each PointRecord are read, so the memory
// system still streams entire structs through the caches.
// Returns {mean_latitude, mean_longitude}; {0.0, 0.0} for an empty
// input (the original divided by zero and returned NaN/NaN).
tuple<double, double> avg_pos_obj_oriented(vector<PointRecord>& data){
    auto size = data.size();
    if(size == 0){
        return make_tuple(0.0, 0.0);  // guard: avoid 0/0 -> NaN
    }
    // Accumulators hold running sums; divided once at the end.
    double sum_lat = 0.0;
    double sum_lon = 0.0;
    for(const auto& point : data){
        sum_lat += point.latitude;
        sum_lon += point.longitude;
    }
    return make_tuple(sum_lat / size, sum_lon / size);
}
// Benchmark fixture: averaging over the fat 72-byte AoS layout.
static void BM_object_oriented(benchmark::State &state) {
    random_device rnd_device;
    mt19937 rng{rnd_device()};
    uniform_real_distribution<double> dist{-10, 10};
    int size = state.range(0);

    // Build the input once, outside the timed region.
    vector<PointRecord> points;
    points.reserve(size);
    for(int i = 0; i < size; i++){
        PointRecord point{};
        point.latitude = dist(rng);
        point.longitude = dist(rng);
        point.timestamp = 12345678;
        points.push_back(point);
    }

    // Timed section: the averaging only.
    for (auto _: state) {
        auto p = avg_pos_obj_oriented(points);
        benchmark::DoNotOptimize(p);  // keep the result alive under -O3
    }

    // bytes_per_second counts only the "useful" bytes: the two doubles
    // actually summed per record.
    int64_t useful_bytes = int64_t(sizeof(double)) * 2 * size;
    state.SetBytesProcessed(int64_t(state.iterations()) * useful_bytes);
    // The user counter instead reports the full working-set footprint
    // (whole structs), printed with 1024-based units.
    state.counters["BytesProcessed"] = benchmark::Counter(size*sizeof(PointRecord), benchmark::Counter::kDefaults, benchmark::Counter::OneK::kIs1024);
}
// Mean latitude/longitude over the first `size` entries of `data`.
// Every byte of each 16-byte PointGpsRecord is consumed, so measured
// bandwidth equals useful bandwidth.
// Returns {mean_latitude, mean_longitude}; {0.0, 0.0} when size == 0
// (the original divided by zero and returned NaN/NaN).
tuple<double,double> avg_vec_data_oriented(vector<PointGpsRecord>& data, uint size){
    if(size == 0){
        return make_tuple(0.0, 0.0);  // guard: avoid 0/0 -> NaN
    }
    double sum_lat = 0.0;
    double sum_lon = 0.0;
    // uint index matches the parameter type: the original `int i` mixed
    // signed/unsigned in the comparison and would misbehave past INT_MAX.
    for(uint i = 0; i < size; i++){
        sum_lat += data[i].latitude;
        sum_lon += data[i].longitude;
    }
    return make_tuple(sum_lat/size, sum_lon/size);
}
// Benchmark fixture: averaging over the slim 16-byte data-oriented layout.
static void BM_data_oriented(benchmark::State &state) {
    random_device rnd_device;
    mt19937 rng{rnd_device()};
    uniform_real_distribution<double> dist{-10, 10};
    int size = state.range(0);

    // Input is generated before timing starts.
    vector<PointGpsRecord> records;
    records.reserve(size);
    for(int i = 0; i < size; i++){
        PointGpsRecord point{};
        point.latitude = dist(rng);
        point.longitude = dist(rng);
        records.push_back(point);
    }

    for (auto _: state) {
        auto avg = avg_vec_data_oriented(records, size);
        benchmark::DoNotOptimize(avg);  // defeat dead-code elimination
    }

    // Here useful bytes (2 doubles) and record footprint coincide.
    state.SetBytesProcessed(int64_t(state.iterations()) *
                            int64_t(sizeof(double) * 2 * size));
    state.counters["BytesProcessed"] = benchmark::Counter(size*sizeof(PointGpsRecord), benchmark::Counter::kDefaults, benchmark::Counter::OneK::kIs1024);
}
// Register both benchmarks over sizes 64 .. 1<<25 (x4 per step), sweeping
// the working set from L1-resident to far beyond the 16 MiB L3.
BENCHMARK(BM_object_oriented)->Unit(benchmark::kMicrosecond)->RangeMultiplier(4)->Range(64, 1 << 25);
BENCHMARK(BM_data_oriented)->Unit(benchmark::kMicrosecond)->RangeMultiplier(4)->Range(64, 1 << 25);
BENCHMARK_MAIN();
With these results:
Run on (16 X 5000 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x8)
L1 Instruction 32 KiB (x8)
L2 Unified 256 KiB (x8)
L3 Unified 16384 KiB (x1)
Load Average: 1.29, 1.01, 0.84
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
--------------------------------------------------------------------------------------
Benchmark Time CPU Iterations UserCounters...
--------------------------------------------------------------------------------------
BM_object_oriented/64 0.039 us 0.039 us 17779654 BytesProcessed=4.5k bytes_per_second=24.5771G/s
BM_object_oriented/256 0.213 us 0.213 us 3300654 BytesProcessed=18k bytes_per_second=17.8812G/s
BM_object_oriented/1024 0.906 us 0.906 us 770645 BytesProcessed=72k bytes_per_second=16.8411G/s
BM_object_oriented/4096 3.84 us 3.84 us 180761 BytesProcessed=288k bytes_per_second=15.902G/s
BM_object_oriented/16384 18.8 us 18.8 us 37463 BytesProcessed=1.125M bytes_per_second=13.0112G/s
BM_object_oriented/65536 75.1 us 75.1 us 9406 BytesProcessed=4.5M bytes_per_second=13.008G/s
BM_object_oriented/262144 529 us 529 us 1355 BytesProcessed=18M bytes_per_second=7.37992G/s
BM_object_oriented/1048576 3072 us 3072 us 226 BytesProcessed=72M bytes_per_second=5.08572G/s
BM_object_oriented/4194304 12958 us 12957 us 55 BytesProcessed=288M bytes_per_second=4.82362G/s
BM_object_oriented/16777216 52470 us 52467 us 13 BytesProcessed=1.125G bytes_per_second=4.76492G/s
BM_object_oriented/33554432 104667 us 104662 us 7 BytesProcessed=2.25G bytes_per_second=4.77726G/s
BM_data_oriented/64 0.038 us 0.038 us 18196596 BytesProcessed=1024 bytes_per_second=25.0373G/s
BM_data_oriented/256 0.211 us 0.211 us 3312330 BytesProcessed=4k bytes_per_second=18.057G/s
BM_data_oriented/1024 0.898 us 0.898 us 776186 BytesProcessed=16k bytes_per_second=16.9891G/s
BM_data_oriented/4096 3.64 us 3.64 us 193013 BytesProcessed=64k bytes_per_second=16.7622G/s
BM_data_oriented/16384 14.6 us 14.6 us 47183 BytesProcessed=256k bytes_per_second=16.7451G/s
BM_data_oriented/65536 58.3 us 58.3 us 11894 BytesProcessed=1024k bytes_per_second=16.7614G/s
BM_data_oriented/262144 233 us 233 us 2970 BytesProcessed=4M bytes_per_second=16.7626G/s
BM_data_oriented/1048576 1131 us 1131 us 611 BytesProcessed=16M bytes_per_second=13.8116G/s
BM_data_oriented/4194304 4910 us 4910 us 145 BytesProcessed=64M bytes_per_second=12.7299G/s
BM_data_oriented/16777216 19468 us 19468 us 36 BytesProcessed=256M bytes_per_second=12.8415G/s
BM_data_oriented/33554432 39500 us 39497 us 17 BytesProcessed=512M bytes_per_second=12.6591G/s
So, we can observe the data processing rate decreasing in step with the filling of the various layers of cache.
I thought that, with the various caches, the CPU could sustain a better rate when increasing the size, by prefetching the data. So I have some questions: is the RAM definitely the bottleneck here?
Is there any way to speed-up the data rate (still on big data)?
Then, would using some kind of CPU parallelization not be useful, since the cores would still not be fed with data?
EDIT: also, here it is a OpenMP parallel version and its results:
// OpenMP-parallel mean of latitude/longitude over the first `size` entries.
// Returns {mean_latitude, mean_longitude}; {0.0, 0.0} when size == 0
// (the original divided by zero and returned NaN/NaN).
tuple<double,double> avg_vec_data_oriented_parallel(vector<PointGpsRecord>& data, uint size){
    if(size == 0){
        return make_tuple(0.0, 0.0);  // guard: avoid 0/0 -> NaN
    }
    double sum_lat = 0.0;
    double sum_lon = 0.0;
    // Index loop instead of range-for: range-based for is only an OpenMP
    // "canonical" loop from OpenMP 5.0 (GCC 9+), so this form is portable
    // to older toolchains. It also sums exactly the `size` elements we
    // divide by — the original summed all of `data` but divided by the
    // separate `size` parameter, which disagrees if the two differ.
    long long n = (long long)size;
    #pragma omp parallel for reduction(+:sum_lat, sum_lon)
    for(long long i = 0; i < n; i++){
        sum_lat += data[i].latitude;
        sum_lon += data[i].longitude;
    }
    return make_tuple(sum_lat/size, sum_lon/size);
}
// Benchmark fixture: OpenMP-parallel averaging over PointGpsRecord.
static void BM_data_oriented_parallel(benchmark::State &state) {
    random_device rnd_device;
    mt19937 rng{rnd_device()};
    uniform_real_distribution<double> dist{-10, 10};
    int size = state.range(0);

    // Fill the input before the measured loop.
    vector<PointGpsRecord> records;
    records.reserve(size);
    for(int i = 0; i < size; i++){
        PointGpsRecord point{};
        point.latitude = dist(rng);
        point.longitude = dist(rng);
        records.push_back(point);
    }

    for (auto _: state) {
        auto avg = avg_vec_data_oriented_parallel(records, size);
        benchmark::DoNotOptimize(avg);  // keep the result observable under -O3
    }

    // Both throughput metrics use the full record footprint here.
    int64_t footprint = int64_t(sizeof(PointGpsRecord)) * size;
    state.SetBytesProcessed(int64_t(state.iterations()) * footprint);
    state.counters["BytesProcessed"] = benchmark::Counter(footprint, benchmark::Counter::kDefaults, benchmark::Counter::OneK::kIs1024);
}
BM_data_oriented_parallel/64 5.39 us 5.38 us 148050 BytesProcessed=1024 bytes_per_second=181.657M/s
BM_data_oriented_parallel/256 4.25 us 4.24 us 154501 BytesProcessed=4k bytes_per_second=920.244M/s
BM_data_oriented_parallel/1024 4.43 us 4.42 us 161006 BytesProcessed=16k bytes_per_second=3.44957G/s
BM_data_oriented_parallel/4096 5.60 us 5.38 us 100000 BytesProcessed=64k bytes_per_second=11.3523G/s
BM_data_oriented_parallel/16384 5.97 us 5.76 us 104429 BytesProcessed=256k bytes_per_second=42.3808G/s
BM_data_oriented_parallel/65536 11.2 us 11.0 us 65653 BytesProcessed=1024k bytes_per_second=89.0536G/s
BM_data_oriented_parallel/262144 31.9 us 31.7 us 18759 BytesProcessed=4M bytes_per_second=123.122G/s
BM_data_oriented_parallel/1048576 113 us 112 us 5247 BytesProcessed=16M bytes_per_second=139.253G/s
BM_data_oriented_parallel/4194304 1671 us 1661 us 433 BytesProcessed=64M bytes_per_second=37.6235G/s
BM_data_oriented_parallel/16777216 7411 us 7406 us 92 BytesProcessed=256M bytes_per_second=33.7556G/s
BM_data_oriented_parallel/33554432 15007 us 15005 us 47 BytesProcessed=512M bytes_per_second=33.3221G/s
Note that it peaks when the L3 cache is full.
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论