C++对于大型数据集(数百万行)来说,最好的排序容器和方法是什么
我正在处理一个练习,该练习应该准确地对此类代码的时间复杂度进行基准测试。
我正在处理的数据由像 hbFvMF,PZLmRb
这样的字符串对组成,每个字符串在数据集中出现两次,一次在位置 1 上,一次在位置 1 上位置2。因此第一个字符串将指向 zvEcqe,hbFvMF
例如,列表还在继续......
我已经能够生成代码,对这些最多 50k 对的数据集进行排序没有太大问题,大约需要 4-5 分钟。 10k 只需几秒钟即可完成排序。
问题是我的代码应该处理最多 500 万对的数据集。所以我想看看我还能做些什么。我将发布我的两次最佳尝试,第一个是向量,我认为我可以通过用 unsorted_map
替换 vector
来升级它,因为搜索时的时间复杂度更好,但对我来说令人惊讶的是,当我测试时,两个容器之间几乎没有区别。我不确定我解决问题的方法或我选择的容器是否导致排序时间过长...
尝试使用向量:
#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <map>
#include <unordered_map>
#include <stdio.h>
#include <vector>
#include <iterator>
#include <utility>
#include <functional>
#include <algorithm>
using namespace std;
template<typename T>
void search_bricks_backwards(string resume, vector<T>& vec, vector<string>& vec2) {
int index = 0;
int temp_index = 0;
while (true) {
if (index == vec.size()) {
vec2.insert(vec2.begin(), vec[temp_index].first);
cout << "end of backward search, exitting..." << endl;
break;
}
if (vec[index].second == resume) {
vec2.insert(vec2.begin(), resume);
resume = vec[index].first;
//vec.erase(vec.begin() + index);
temp_index = index;
index = 0;
}
index++;
}
}
template<typename T>
void search_bricks(string start, vector<T>& vec, vector<string>& vec2) {
int index = 0;
int temp_index = 0;
while (true) {
//cout << "iteration " << index << endl;
if (index == vec.size()) {
vec2.push_back(vec[temp_index].second);
cout << "all forward bricks sorted" << endl;
break;
}
if (vec[index].first == start) {
vec2.push_back(vec[index].first);
start = vec[index].second;
//vec.erase(vec.begin() + index);
temp_index = index;
index = 0;
}
index++;
}
search_bricks_backwards(vec[0].first, vec, vec2);
}
template<typename T>
void search_bricks_recursion(string start, vector<T>& vec, vector<string>& vec2) {
int index = 0;
for (const auto& pair : vec) {
//cout << "iteration " << index << endl;
if (pair.first == start) {
vec2.push_back(start);
cout << "found " << start << " and " << pair.first << endl;
search_bricks(pair.second, vec, vec2);
}
if (index + 1 == vec.size()) {
search_bricks_backwards(start, vec, vec2);
}
index++;
}
}
template<typename T>
void printVectorElements(vector<T>& vec)
{
for (auto i = 0; i < vec.size(); ++i) {
cout << "(" << vec.at(i).first << ","
<< vec.at(i).second << ")" << " ";
}
cout << endl;
}
vector<string> split(string s, string delimiter) {
size_t pos_start = 0, pos_end, delim_len = delimiter.length();
string token;
vector<string> res;
while ((pos_end = s.find(delimiter, pos_start)) != string::npos) {
token = s.substr(pos_start, pos_end - pos_start);
pos_start = pos_end + delim_len;
res.push_back(token);
}
res.push_back(s.substr(pos_start));
return res;
}
unordered_map<string, string> brick_to_map(string const& s)
{
unordered_map<string, string> m;
string key, val;
istringstream iss(s);
while (getline(getline(iss, key, ',') >> ws, val))
m[key] = val;
return m;
}
int main()
{
vector<pair<string, string>> bricks;
vector<string> sorted_bricks;
ifstream inFile;
inFile.open("input-pairs-50K.txt"); //open the input file
stringstream strStream;
strStream << inFile.rdbuf(); //read the file
string str = strStream.str(); //str holds the content of the file
//cout << str << endl;
istringstream iss(str);
for (string line; getline(iss, line); )
{
string delimiter = ",";
string s = line;
vector<string> v = split(s, delimiter);
string s1 = v.at(0);
string s2 = v.at(1);
bricks.push_back(make_pair(s1, s2));
}
search_bricks(bricks[0].second, bricks, sorted_bricks);
//display the results
for (auto i = sorted_bricks.begin(); i != sorted_bricks.end(); ++i)
cout << *i << " ";
}
尝试使用未排序的地图:
#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <map>
#include <unordered_map>
#include <stdio.h>
#include <vector>
#include <iterator>
#include <utility>
#include <functional>
#include <algorithm>
using namespace std;
void search_bricks_backwards(string resume, unordered_map<string, string> brick_map, vector<string>& vec2) {
typedef unordered_map<string, string>::value_type map_value_type;
while (true) {
unordered_map<string, string>::const_iterator got = find_if(brick_map.begin(), brick_map.end(), [&resume](const map_value_type& vt)
{ return vt.second == resume; }
);
if (got == brick_map.end()) {
vec2.insert(vec2.begin(), resume);
cout << "end of backward search, exitting..." << endl;
break;
}
//cout << "iteration " << index << endl;
else if (got->second == resume) {
vec2.insert(vec2.begin(), resume);
resume = got->first;
}
}
}
void search_bricks(string start, unordered_map<string, string> brick_map, vector<string>& vec2) {
typedef unordered_map<string, string>::value_type map_value_type;
while (true) {
unordered_map<string, string>::const_iterator got = find_if(brick_map.begin(), brick_map.end(), [&start](const map_value_type& vt)
{ return vt.first == start; }
);
if (got == brick_map.end()) {
vec2.push_back(start);
cout << "all forward bricks sorted" << endl;
break;
}
else if (got->first == start) {
vec2.push_back(start);
//cout << "found " << start << " and " << vec[index].first << endl;
start = got->second;
}
}
auto it = brick_map.begin();
search_bricks_backwards(it->first, brick_map, vec2);
}
template<typename T>
void printVectorElements(vector<T>& vec)
{
for (auto i = 0; i < vec.size(); ++i) {
cout << "(" << vec.at(i).first << ","
<< vec.at(i).second << ")" << " ";
}
cout << endl;
}
vector<string> split(string s, string delimiter) {
size_t pos_start = 0, pos_end, delim_len = delimiter.length();
string token;
vector<string> res;
while ((pos_end = s.find(delimiter, pos_start)) != string::npos) {
token = s.substr(pos_start, pos_end - pos_start);
pos_start = pos_end + delim_len;
res.push_back(token);
}
res.push_back(s.substr(pos_start));
return res;
}
int main()
{
unordered_map<string, string> bricks;
vector<string> sorted_bricks;
ifstream inFile;
inFile.open("input-pairs-50K.txt"); //open the input file
for (string line; getline(inFile, line); )
{
string delimiter = ",";
string s = line;
vector<string> v = split(s, delimiter);
string s1 = v.at(0);
string s2 = v.at(1);
bricks.insert(make_pair(s1, s2));
}
/*for (auto& x : bricks)
std::cout << x.first << "," << x.second << " ";*/
auto it = bricks.begin();
search_bricks(it->second, bricks, sorted_bricks);
// display results
for (auto i = sorted_bricks.begin(); i != sorted_bricks.end(); ++i)
cout << *i << " ";
}
我希望提高代码的时间复杂度,以便能够更有效地处理数据,如果有人可以建议我的代码或容器方面需要改进什么,我将非常感激。
I'm tackling a exercise which is supposed to exactly benchmark the time complexity of such code.
The data I'm handling is made up of pairs of strings like this hbFvMF,PZLmRb
, each string is present two times in the dataset, once on position 1 and once on position 2 . so the first string would point to zvEcqe,hbFvMF
for example and the list goes on....
I've been able to produce code which doesn't have much problem sorting these datasets up to 50k pairs, where it takes about 4-5 minutes. 10k gets sorted in a matter of seconds.
The problem is that my code is supposed to handle datasets of up to 5 million pairs. So I'm trying to see what more I can do. I will post my two best attempts, initial one with vectors, which I thought I could upgrade by replacing vector
with unsorted_map
because of the better time complexity when searching, but to my surprise, there was almost no difference between the two containers when I tested it. I'm not sure if my approach to the problem or the containers I'm choosing are causing the steep sorting times...
Attempt with vectors:
#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <map>
#include <unordered_map>
#include <stdio.h>
#include <vector>
#include <iterator>
#include <utility>
#include <functional>
#include <algorithm>
using namespace std;
template<typename T>
void search_bricks_backwards(string resume, vector<T>& vec, vector<string>& vec2) {
int index = 0;
int temp_index = 0;
while (true) {
if (index == vec.size()) {
vec2.insert(vec2.begin(), vec[temp_index].first);
cout << "end of backward search, exitting..." << endl;
break;
}
if (vec[index].second == resume) {
vec2.insert(vec2.begin(), resume);
resume = vec[index].first;
//vec.erase(vec.begin() + index);
temp_index = index;
index = 0;
}
index++;
}
}
template<typename T>
void search_bricks(string start, vector<T>& vec, vector<string>& vec2) {
int index = 0;
int temp_index = 0;
while (true) {
//cout << "iteration " << index << endl;
if (index == vec.size()) {
vec2.push_back(vec[temp_index].second);
cout << "all forward bricks sorted" << endl;
break;
}
if (vec[index].first == start) {
vec2.push_back(vec[index].first);
start = vec[index].second;
//vec.erase(vec.begin() + index);
temp_index = index;
index = 0;
}
index++;
}
search_bricks_backwards(vec[0].first, vec, vec2);
}
template<typename T>
void search_bricks_recursion(string start, vector<T>& vec, vector<string>& vec2) {
int index = 0;
for (const auto& pair : vec) {
//cout << "iteration " << index << endl;
if (pair.first == start) {
vec2.push_back(start);
cout << "found " << start << " and " << pair.first << endl;
search_bricks(pair.second, vec, vec2);
}
if (index + 1 == vec.size()) {
search_bricks_backwards(start, vec, vec2);
}
index++;
}
}
template<typename T>
void printVectorElements(vector<T>& vec)
{
for (auto i = 0; i < vec.size(); ++i) {
cout << "(" << vec.at(i).first << ","
<< vec.at(i).second << ")" << " ";
}
cout << endl;
}
vector<string> split(string s, string delimiter) {
size_t pos_start = 0, pos_end, delim_len = delimiter.length();
string token;
vector<string> res;
while ((pos_end = s.find(delimiter, pos_start)) != string::npos) {
token = s.substr(pos_start, pos_end - pos_start);
pos_start = pos_end + delim_len;
res.push_back(token);
}
res.push_back(s.substr(pos_start));
return res;
}
unordered_map<string, string> brick_to_map(string const& s)
{
unordered_map<string, string> m;
string key, val;
istringstream iss(s);
while (getline(getline(iss, key, ',') >> ws, val))
m[key] = val;
return m;
}
int main()
{
vector<pair<string, string>> bricks;
vector<string> sorted_bricks;
ifstream inFile;
inFile.open("input-pairs-50K.txt"); //open the input file
stringstream strStream;
strStream << inFile.rdbuf(); //read the file
string str = strStream.str(); //str holds the content of the file
//cout << str << endl;
istringstream iss(str);
for (string line; getline(iss, line); )
{
string delimiter = ",";
string s = line;
vector<string> v = split(s, delimiter);
string s1 = v.at(0);
string s2 = v.at(1);
bricks.push_back(make_pair(s1, s2));
}
search_bricks(bricks[0].second, bricks, sorted_bricks);
//display the results
for (auto i = sorted_bricks.begin(); i != sorted_bricks.end(); ++i)
cout << *i << " ";
}
Attempt with unsorted map
:
#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <map>
#include <unordered_map>
#include <stdio.h>
#include <vector>
#include <iterator>
#include <utility>
#include <functional>
#include <algorithm>
using namespace std;
void search_bricks_backwards(string resume, unordered_map<string, string> brick_map, vector<string>& vec2) {
typedef unordered_map<string, string>::value_type map_value_type;
while (true) {
unordered_map<string, string>::const_iterator got = find_if(brick_map.begin(), brick_map.end(), [&resume](const map_value_type& vt)
{ return vt.second == resume; }
);
if (got == brick_map.end()) {
vec2.insert(vec2.begin(), resume);
cout << "end of backward search, exitting..." << endl;
break;
}
//cout << "iteration " << index << endl;
else if (got->second == resume) {
vec2.insert(vec2.begin(), resume);
resume = got->first;
}
}
}
void search_bricks(string start, unordered_map<string, string> brick_map, vector<string>& vec2) {
typedef unordered_map<string, string>::value_type map_value_type;
while (true) {
unordered_map<string, string>::const_iterator got = find_if(brick_map.begin(), brick_map.end(), [&start](const map_value_type& vt)
{ return vt.first == start; }
);
if (got == brick_map.end()) {
vec2.push_back(start);
cout << "all forward bricks sorted" << endl;
break;
}
else if (got->first == start) {
vec2.push_back(start);
//cout << "found " << start << " and " << vec[index].first << endl;
start = got->second;
}
}
auto it = brick_map.begin();
search_bricks_backwards(it->first, brick_map, vec2);
}
template<typename T>
void printVectorElements(vector<T>& vec)
{
for (auto i = 0; i < vec.size(); ++i) {
cout << "(" << vec.at(i).first << ","
<< vec.at(i).second << ")" << " ";
}
cout << endl;
}
vector<string> split(string s, string delimiter) {
size_t pos_start = 0, pos_end, delim_len = delimiter.length();
string token;
vector<string> res;
while ((pos_end = s.find(delimiter, pos_start)) != string::npos) {
token = s.substr(pos_start, pos_end - pos_start);
pos_start = pos_end + delim_len;
res.push_back(token);
}
res.push_back(s.substr(pos_start));
return res;
}
int main()
{
unordered_map<string, string> bricks;
vector<string> sorted_bricks;
ifstream inFile;
inFile.open("input-pairs-50K.txt"); //open the input file
for (string line; getline(inFile, line); )
{
string delimiter = ",";
string s = line;
vector<string> v = split(s, delimiter);
string s1 = v.at(0);
string s2 = v.at(1);
bricks.insert(make_pair(s1, s2));
}
/*for (auto& x : bricks)
std::cout << x.first << "," << x.second << " ";*/
auto it = bricks.begin();
search_bricks(it->second, bricks, sorted_bricks);
// display results
for (auto i = sorted_bricks.begin(); i != sorted_bricks.end(); ++i)
cout << *i << " ";
}
I'm looking to improve the time complexity of my code to be able to process the data more eficiently, if anyone can suggest what to improve in my code or container wise I'd be very thankful.
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(3)
首先,以目的为导向,类比一下这里真正所做的事情。这个问题有时会出现在面试问题中。它通常被表述为一组巴士票:
例如,其中
Sn
是 s 公交车站 ID,Sn->Sm
表示从车站Sn
到车站Sm 的行程
。给定以下四张票,覆盖五个车站,没有特定的顺序:正确的顺序可以这样想:
因此,正确的“排序”顺序是
算法分析
find
成员是让它们勾选(并发光)的原因。如果你想发出这种尖叫声,你需要使用键控来做到这一点。std::vector
的高昂费用,这可能会变得非常昂贵。向量是连续的连续存储。附加到它们并没有那么糟糕,因为它们的子分配器往往会在扩展时过度分配,以便在必须重新分配之前为更多的推迟留出空间。但是前置它们是可怕的。我扫描了您提供的源测试。数据中实际上有 49999 行,而不是 50000 行。经过一番尝试、错误、绞尽脑汁等之后,我发现:
bWyUVV
仅出现一次,作为左侧。EZkYGM
在文件中仅出现一次,作为右侧这些必须是排序列表的终结符。如果一切顺利并且数据是原始的,最终列表将以
bWyUVV
开头,并以EZkYGM
结尾 对于编写算法没有帮助,但对于验证我们所做的绝对有帮助正确的事情。全面的性能改进
在保留基本前提的同时,删除很多代码,请考虑以下几点:
最显着的差异:
使用
std ::deque
作为排序结果。您为这些矢量展示位置付出了高昂的代价。所有向量操作要么在后面推动(对于保留向量来说可以),要么在前面推动(向量中的性能很糟糕)。std::deque
专门用于非常快速的前后插入和修剪。我们不做后者,而是大量做前者。两个映射,分别键控 s1==>s2 和 s2==>s1。有第三方容器可以为您执行此操作(例如: boost::bimap)。然而,为此,考虑到密钥大小,仅存储两个映射很容易。
在适当的情况下使用移动语义(尽管不多)
在搜索过程中,每个发现的键都会从刚刚搜索的地图中删除。这确保我们在应该停止的时候停止,并相当快地恢复内存占用。
适用时参考参数。这些容器在构建结果集时被有效地销毁,但没关系(实际上可以保证正确检测一端或另一端的终端)。
双键操作带来了天壤之别。使用您的测试数据集在一台四年前的小型 MacBook Pro 笔记本电脑上构建的发布版本会产生以下结果(您可以通过已知答案测试来验证)。
所有代码都是使用 -O2 优化构建的,运行 Apple clang 版本 13.0.0 (clang-1300.0.29.30)
std::unordered_map
该时序图相当一致,尽管有在我的测试中,异常值高达 42 毫秒,低至 28 毫秒。使用常规
std::map
的相同线束会产生:std::map
因此,您肯定使用了带有无序容器的正确容器。键控,您实际上并没有将键控用于算法的重要部分,使得容器基本上不比顺序扫描更好。这与您的评估相符,它确实比矢量顺序扫描解决方案好不了多少;不管怎样,你基本上都是这么做的。
我怀疑上面显示的性能应该能够运行您预期的 500 万对,只要您可以将其全部保留在内存中(两次,抱歉,但结果非常坦率地表明它值得这个价格)。
内存更友好(至少有一点)
以下代码执行相同的操作,但没有添加所有输出,并使用单个被调用函数以及飞行中转置的地图。这从一开始就减少了对两张地图的需求。整体性能与上面的代码进行比较;只是一个不同的、更内存友好的实现。
First a purpose-driven analogy to what is really being done here. This problem sometimes comes up in interview questions. It is often phrased as a cluster of bus tickets:
An example, where
Sn
is s bus station ID, andSn->Sm
denotes a trip from stationSn
to stationSm
. Given the following four tickets covering five stations in no particular order:the proper order can be thought of like this:
And therefore, the correct "sorted" order is
Algorithm Analysis
find
member is what makes them tick (and shine). If you want to make this scream, you need to use keying to do so.std::vector
, which can get ridiculously expensive fast. Vectors a sequential contiguous storage. Appending to them is not quite as bad because their sub-allocators tend over-allocate on expansion to leave room for a few more push-backs before having to reallocate. But prepending to them is dreadful.I scanned the source test you provided. There are actually 49999 rows in the data, not 50000. After some trial, error, head scratching, etc., I discovered this:
bWyUVV
appears only once, as a left side.EZkYGM
appears in the file only once, as a right sideThese must be the terminals of the sorted list. If everything plays out and the data is pristine, the final list will start with
bWyUVV
, and end withEZkYGM
Not helpful for writing the algorithm, but definitely helpful for validating we did something right.Performance Improvements All Around
Stripping a lot of code out of this while still keeping the basic premise, consider the following:
The most notable differences:
Using
std::deque<std::string>
as the sorted results. You were paying a dear price for those vector placements. All of your vector operations were either pushing on the back (ok for reserved vectors) or pushing on the front (dreadful performance in vectors). Thestd::deque
is specialized for very fast front and back insertion and pruning. We're not doing the latter, but are heavily doing the former.Two maps, keying s1==>s2 and s2==>s1 respectively. there are third party containers that do this for you (ex: boost::bimap). For this, however, considering the key sizes, just storing two maps is easy.
Move semantics are used (though not much) where appropriate
During searches, each discovered key is deleted from the map just-searched. This ensures we stop when we're supposed to, and recovers memory footprint fairly quickly.
Reference parameters where applicable. These containers are effectively destroyed while building the result set, but that's ok (and in fact warranted to properly detect a terminal on one end or the other).
The double-keying makes a world of difference. A release build using your test data set on a puny little four-year-old macbook pro laptop produces the following (which you can verify with your know-answer tests).
All code is built with -O2 optimization running Apple clang version 13.0.0 (clang-1300.0.29.30)
std::unordered_map
That timing figure is fairly consistent, though there were outliers as high as 42ms and as low as 28ms in my testing. The same harness using a regular
std::map
resulted in:std::map
So you're definitely using the right container with an unordered keying, you just weren't actually using the keying for a non-trivial portion of your algorithm, making the container basically no better than a sequential scan. That jives with your assessment it really wasn't any better than a vector sequential scan solution; you were basically doing that regardless.
I suspect performance shown above should be able to run sets of your expected 5-million pairs, so long as you can keep it all in memory (twice, sorry about that, but the results show pretty candidly that it is worth that price).
More Memory Friendly (at least a little)
The following does the same thing, but without all the added output, and uses a single called function with the map transposed mid-flight. This alleviates the need for two maps from inception. The overall performance is comparative to the code above; just a different, more memory friendly, implementation.
您可以使用 trie 数据结构,这里有一篇论文解释了执行此操作的算法: https://people.eng.unimelb.edu.au/jzobel/fulltext/acsc03sz.pdf
但是你必须从头开始实现 trie,因为据我所知知道 C++ 中没有默认的 trie 实现。
You can use a trie data structure, here's a paper that explains an algorithm to do that: https://people.eng.unimelb.edu.au/jzobel/fulltext/acsc03sz.pdf
But you have to implement the trie from scratch because as far as I know there is no default trie implementation in c++.
回复较晚,但通过微小的改进,我们可以将处理时间加快 100%。或者将运行时间减少一半。
基本上 WhozCraig 选择的方法已经接近最佳。
通过使用 std::string_view 并选择不同的字符串分割方法,我可以将性能提高近 100%。大部分时间消耗在读取源文件和分割数据上。通过我选择的不同方法,我可以节省最多的时间。我添加了一些额外的小改进。
我用一个包含 5.000.000 对的大文件检查了这两种算法。我创建了一个模块来生成这样的测试文件。请查看下面的源代码。
我将展示我的源代码,给出的代码是“WhozCraig”,针对基准测试稍作修改。
但首先,请以屏幕截图形式查看基准测试结果。
使用问题中给出的原始小文件进行测试。我的解决方案:
使用小测试文件的“WhozCraig”解决方案
然后我创建了一个 5M 对测试文件并运行相同的测试。我的方法
使用大测试文件的“WhozCraig”解决方案
请注意。使用“WhozCraig”的方法对大数据进行排序甚至更快。
但阅读速度较慢。
我没有调查原因。
当然,所有这些都取决于机器和编译器。我在一台已有 10 年历史的 Windows 7 机器上运行了测试。编译器是Visual Studio 2019社区版。以发布模式编译。
请参阅下面的代码示例:
“WhozCraig”的解决方案,用于附加运行时间测量并用于测试。
在用户“subspring”的附加回答中,提出了一个特里树。
我认为这是不可行的。查看潜在的内存消耗。
我们有大写和小写字符。所以,总共 52 个字符。对于每个我们都需要一个 4 字节指针 + 1 个相关字符串的指针。那是
53*4 用于特里树的一层。字符串长度为 6。我们需要尝试 2 次。因此,经典树的总体内存消耗为:
((52+1)*4) ^ 6 * 2 = 181'570'446'368'768 ~182 TB。
与这个数字相比,我电脑中的 128GB RAM 小得离谱。
因此,我使用
std:unordered_map
作为链接元素实现了一个具有空间优化设计的 trie。这样我就可以将 RAM 内存消耗减少到 25GB。
但这里的结果也令人失望。
就算是有好意,也至少慢了三倍。 。 。
请参阅示例代码:
Late answer, but with minor improvements we can speed up the processing time by 100%. Or reduce the runtime by half.
Basically the approach chosen by WhozCraig is already nearly optimal.
I could improve the performance by nearly 100%, by using
std::string_view
and selecting a different string split approach. Most the time is consumed with reading the source file and splitting the data. With my different selected approach, I could save the most time. I added some additional small improvements.I checked both algorithms with a big file with 5.000.000 pairs. I created a module, to generate such a test file. Please have a look in the below source code.
I will show my source code and the code given be "WhozCraig", slighty modified for benchmark measurements.
But first, please see the benchmark results as screenshots.
Test with the original small file given in the quesion. My solution:
Solution of "WhozCraig" with small test file
Then I created a 5M pairs test file and run the same test. My approach
Solution of "WhozCraig" with big test file
Please note. Sorting big data with "WhozCraig"'s approach is even faster.
But reading is slower.
I did not investigate why.
Of course, all this is machine and compiler dependent. I ran the test on a 10 years old Windows 7 machine. The compiler is Visual Studio 2019 community editions. Compiled in release mode.
Please see code examples below:
Solution of "WhozCraig", instrumented for aditional runtime measurement and used for the tests.
In an additional answer from user "subspring", a trie was proposed.
I think it is not feasible. Look at the potential memory consumption.
We have upper- and lowercase characters. So, overall 52 characters. For each we would need a 4 byte pointer + 1 pointer for related string. That is
53*4 for one level of the trie. The string length is 6. And we need 2 tries. So, the overall memory consumption for a classical tree would be:
((52+1)*4) ^ 6 * 2 = 181'570'446'368'768 ~182 TB.
The 128GB RAM that is in my computer is rediculous small compared to this number.
For that reason I implemented a trie with a space optimized design using
std:unordered_map
s as link element.With that I could reduce the RAM memory consumption to 25GB.
But also here the result was disappointing.
Even with good will, it was at least 3 times slower . . .
See the example code: