C++对于大型数据集(数百万行)来说,最好的排序容器和方法是什么

发布于 2025-01-09 05:25:15 字数 8963 浏览 0 评论 0原文

我正在处理一个练习,该练习应该准确地对此类代码的时间复杂度进行基准测试。

我正在处理的数据由像 hbFvMF,PZLmRb 这样的字符串对组成,每个字符串在数据集中出现两次,一次在位置 1 上,一次在位置 1 上位置2。因此第一个字符串将指向 zvEcqe,hbFvMF 例如,列表还在继续......

示例50k 对的数据集

我已经能够生成代码,对这些最多 50k 对的数据集进行排序没有太大问题,大约需要 4-5 分钟。 10k 只需几秒钟即可完成排序。

问题是我的代码应该处理最多 500 万对的数据集。所以我想看看我还能做些什么。我将发布我的两次最佳尝试,第一个是向量,我认为我可以通过用 unsorted_map 替换 vector 来升级它,因为搜索时的时间复杂度更好,但对我来说令人惊讶的是,当我测试时,两个容器之间几乎没有区别。我不确定我解决问题的方法或我选择的容器是否导致排序时间过长...

尝试使用向量:

#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <map>
#include <unordered_map>
#include <stdio.h>
#include <vector>
#include <iterator>
#include <utility>
#include <functional>
#include <algorithm>

using namespace std;


template<typename T>
void search_bricks_backwards(string resume, vector<T>& vec, vector<string>& vec2) {
    int index = 0;
    int temp_index = 0;
    while (true) {
        if (index == vec.size()) {
            vec2.insert(vec2.begin(), vec[temp_index].first); 
            cout << "end of backward search, exitting..." << endl;
            break;


        }
        
        if (vec[index].second == resume) {
            vec2.insert(vec2.begin(), resume);
            

            resume = vec[index].first;
            //vec.erase(vec.begin() + index);
            temp_index = index;

            index = 0;
        }
        
        index++;
    }

}


template<typename T>
void search_bricks(string start, vector<T>& vec, vector<string>& vec2) {
    int index = 0;
    int temp_index = 0;
    while (true) {
        //cout << "iteration " << index << endl;
        if (index == vec.size()) {
            vec2.push_back(vec[temp_index].second);
            
            cout << "all forward bricks sorted" << endl;
            break;


        }
        if (vec[index].first == start) {
            vec2.push_back(vec[index].first);
            
            
            start = vec[index].second;
            //vec.erase(vec.begin() + index);
            temp_index = index;
            index = 0;
            
        }
        
        index++;
    }

    search_bricks_backwards(vec[0].first, vec, vec2);

}

template<typename T>
void search_bricks_recursion(string start, vector<T>& vec, vector<string>& vec2) {
    int index = 0;
    for (const auto& pair : vec) {
        //cout << "iteration " << index << endl;
        if (pair.first == start) {
            vec2.push_back(start);
            cout << "found " << start << " and " << pair.first << endl;
            search_bricks(pair.second, vec, vec2);
        }
        if (index + 1 == vec.size()) {
            search_bricks_backwards(start, vec, vec2);
            

        }
        index++;
    }
    
}

template<typename T>
void printVectorElements(vector<T>& vec)
{
    for (auto i = 0; i < vec.size(); ++i) {
        cout << "(" << vec.at(i).first << ","
            << vec.at(i).second << ")" << " ";
    }
    cout << endl;
}

vector<string> split(string s, string delimiter) {
    size_t pos_start = 0, pos_end, delim_len = delimiter.length();
    string token;
    vector<string> res;

    while ((pos_end = s.find(delimiter, pos_start)) != string::npos) {
        token = s.substr(pos_start, pos_end - pos_start);
        pos_start = pos_end + delim_len;
        res.push_back(token);
    }

    res.push_back(s.substr(pos_start));
    return res;
}

unordered_map<string, string> brick_to_map(string const& s)
{
    unordered_map<string, string> m;

    string key, val;
    istringstream iss(s);

    while (getline(getline(iss, key, ',') >> ws, val))
        m[key] = val;

    return m;
}

int main()
{
    vector<pair<string, string>> bricks;
    
    vector<string> sorted_bricks;

    ifstream inFile;
    inFile.open("input-pairs-50K.txt"); //open the input file

    stringstream strStream;
    strStream << inFile.rdbuf(); //read the file
    string str = strStream.str(); //str holds the content of the file

    //cout << str << endl;
    
    istringstream iss(str);
    
    for (string line; getline(iss, line); )
    {
     
        string delimiter = ",";
        string s = line;
        vector<string> v = split(s, delimiter);
        string s1 = v.at(0);
        string s2 = v.at(1);
        

        bricks.push_back(make_pair(s1, s2));
    }

   
    search_bricks(bricks[0].second, bricks, sorted_bricks);
    
    
    //display the results
    for (auto i = sorted_bricks.begin(); i != sorted_bricks.end(); ++i)
        cout << *i << " ";


    
 
}

尝试使用未排序的地图:

#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <map>
#include <unordered_map>
#include <stdio.h>
#include <vector>
#include <iterator>
#include <utility>
#include <functional>
#include <algorithm>

using namespace std;


void search_bricks_backwards(string resume, unordered_map<string, string> brick_map, vector<string>& vec2) {
    
    typedef unordered_map<string, string>::value_type map_value_type;
    while (true) {

        unordered_map<string, string>::const_iterator got = find_if(brick_map.begin(), brick_map.end(), [&resume](const map_value_type& vt)
            { return vt.second == resume; }
        );
        if (got == brick_map.end()) {
            vec2.insert(vec2.begin(), resume); 
            cout << "end of backward search, exitting..." << endl;
            break;


        }
        //cout << "iteration " << index << endl;
        else if (got->second == resume) {
            vec2.insert(vec2.begin(), resume);

            
            resume = got->first;
        
        }

       
    }

}


void search_bricks(string start, unordered_map<string, string> brick_map, vector<string>& vec2) {
    
    typedef unordered_map<string, string>::value_type map_value_type;
    while (true) {
        

        unordered_map<string, string>::const_iterator got = find_if(brick_map.begin(), brick_map.end(), [&start](const map_value_type& vt)
            { return vt.first == start; }
        );
        if (got == brick_map.end()) {
            vec2.push_back(start);

            cout << "all forward bricks sorted" << endl;
            
            break;
        }
        else if (got->first == start) {
            vec2.push_back(start);

            //cout << "found " << start << " and " << vec[index].first << endl;
            start = got->second;
            
        }
    }
    auto it = brick_map.begin();
    search_bricks_backwards(it->first, brick_map, vec2);
    


    

    

}


template<typename T>
void printVectorElements(vector<T>& vec)
{
    for (auto i = 0; i < vec.size(); ++i) {
        cout << "(" << vec.at(i).first << ","
            << vec.at(i).second << ")" << " ";
    }
    cout << endl;
}

vector<string> split(string s, string delimiter) {
    size_t pos_start = 0, pos_end, delim_len = delimiter.length();
    string token;
    vector<string> res;

    while ((pos_end = s.find(delimiter, pos_start)) != string::npos) {
        token = s.substr(pos_start, pos_end - pos_start);
        pos_start = pos_end + delim_len;
        res.push_back(token);
    }

    res.push_back(s.substr(pos_start));
    return res;
}


int main()
{
    unordered_map<string, string> bricks;
    
    vector<string> sorted_bricks;

    ifstream inFile;
    inFile.open("input-pairs-50K.txt"); //open the input file

    for (string line; getline(inFile, line); )
    {

        string delimiter = ",";
        string s = line;
        vector<string> v = split(s, delimiter);
        string s1 = v.at(0);
        string s2 = v.at(1);


        bricks.insert(make_pair(s1, s2));
    }


    /*for (auto& x : bricks)
        std::cout << x.first << "," << x.second << " ";*/


    auto it = bricks.begin();
    search_bricks(it->second, bricks, sorted_bricks);


    // display results
    for (auto i = sorted_bricks.begin(); i != sorted_bricks.end(); ++i)
        cout << *i << " ";




}

我希望提高代码的时间复杂度,以便能够更有效地处理数据,如果有人可以建议我的代码或容器方面需要改进什么,我将非常感激。

I'm tackling a exercise which is supposed to exactly benchmark the time complexity of such code.

The data I'm handling is made up of pairs of strings like this hbFvMF,PZLmRb, each string is present two times in the dataset, once on position 1 and once on position 2 . so the first string would point to zvEcqe,hbFvMF for example and the list goes on....

example dataset of 50k pairs

I've been able to produce code which doesn't have much problem sorting these datasets up to 50k pairs, where it takes about 4-5 minutes. 10k gets sorted in a matter of seconds.

The problem is that my code is supposed to handle datasets of up to 5 million pairs. So I'm trying to see what more I can do. I will post my two best attempts, initial one with vectors, which I thought I could upgrade by replacing vector with unsorted_map because of the better time complexity when searching, but to my surprise, there was almost no difference between the two containers when I tested it. I'm not sure if my approach to the problem or the containers I'm choosing are causing the steep sorting times...

Attempt with vectors:

#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <map>
#include <unordered_map>
#include <stdio.h>
#include <vector>
#include <iterator>
#include <utility>
#include <functional>
#include <algorithm>

using namespace std;


template<typename T>
void search_bricks_backwards(string resume, vector<T>& vec, vector<string>& vec2) {
    int index = 0;
    int temp_index = 0;
    while (true) {
        if (index == vec.size()) {
            vec2.insert(vec2.begin(), vec[temp_index].first); 
            cout << "end of backward search, exitting..." << endl;
            break;


        }
        
        if (vec[index].second == resume) {
            vec2.insert(vec2.begin(), resume);
            

            resume = vec[index].first;
            //vec.erase(vec.begin() + index);
            temp_index = index;

            index = 0;
        }
        
        index++;
    }

}


template<typename T>
void search_bricks(string start, vector<T>& vec, vector<string>& vec2) {
    int index = 0;
    int temp_index = 0;
    while (true) {
        //cout << "iteration " << index << endl;
        if (index == vec.size()) {
            vec2.push_back(vec[temp_index].second);
            
            cout << "all forward bricks sorted" << endl;
            break;


        }
        if (vec[index].first == start) {
            vec2.push_back(vec[index].first);
            
            
            start = vec[index].second;
            //vec.erase(vec.begin() + index);
            temp_index = index;
            index = 0;
            
        }
        
        index++;
    }

    search_bricks_backwards(vec[0].first, vec, vec2);

}

template<typename T>
void search_bricks_recursion(string start, vector<T>& vec, vector<string>& vec2) {
    int index = 0;
    for (const auto& pair : vec) {
        //cout << "iteration " << index << endl;
        if (pair.first == start) {
            vec2.push_back(start);
            cout << "found " << start << " and " << pair.first << endl;
            search_bricks(pair.second, vec, vec2);
        }
        if (index + 1 == vec.size()) {
            search_bricks_backwards(start, vec, vec2);
            

        }
        index++;
    }
    
}

template<typename T>
void printVectorElements(vector<T>& vec)
{
    for (auto i = 0; i < vec.size(); ++i) {
        cout << "(" << vec.at(i).first << ","
            << vec.at(i).second << ")" << " ";
    }
    cout << endl;
}

vector<string> split(string s, string delimiter) {
    size_t pos_start = 0, pos_end, delim_len = delimiter.length();
    string token;
    vector<string> res;

    while ((pos_end = s.find(delimiter, pos_start)) != string::npos) {
        token = s.substr(pos_start, pos_end - pos_start);
        pos_start = pos_end + delim_len;
        res.push_back(token);
    }

    res.push_back(s.substr(pos_start));
    return res;
}

unordered_map<string, string> brick_to_map(string const& s)
{
    unordered_map<string, string> m;

    string key, val;
    istringstream iss(s);

    while (getline(getline(iss, key, ',') >> ws, val))
        m[key] = val;

    return m;
}

int main()
{
    vector<pair<string, string>> bricks;
    
    vector<string> sorted_bricks;

    ifstream inFile;
    inFile.open("input-pairs-50K.txt"); //open the input file

    stringstream strStream;
    strStream << inFile.rdbuf(); //read the file
    string str = strStream.str(); //str holds the content of the file

    //cout << str << endl;
    
    istringstream iss(str);
    
    for (string line; getline(iss, line); )
    {
     
        string delimiter = ",";
        string s = line;
        vector<string> v = split(s, delimiter);
        string s1 = v.at(0);
        string s2 = v.at(1);
        

        bricks.push_back(make_pair(s1, s2));
    }

   
    search_bricks(bricks[0].second, bricks, sorted_bricks);
    
    
    //display the results
    for (auto i = sorted_bricks.begin(); i != sorted_bricks.end(); ++i)
        cout << *i << " ";


    
 
}

Attempt with unsorted map:

#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <map>
#include <unordered_map>
#include <stdio.h>
#include <vector>
#include <iterator>
#include <utility>
#include <functional>
#include <algorithm>

using namespace std;


void search_bricks_backwards(string resume, unordered_map<string, string> brick_map, vector<string>& vec2) {
    
    typedef unordered_map<string, string>::value_type map_value_type;
    while (true) {

        unordered_map<string, string>::const_iterator got = find_if(brick_map.begin(), brick_map.end(), [&resume](const map_value_type& vt)
            { return vt.second == resume; }
        );
        if (got == brick_map.end()) {
            vec2.insert(vec2.begin(), resume); 
            cout << "end of backward search, exitting..." << endl;
            break;


        }
        //cout << "iteration " << index << endl;
        else if (got->second == resume) {
            vec2.insert(vec2.begin(), resume);

            
            resume = got->first;
        
        }

       
    }

}


void search_bricks(string start, unordered_map<string, string> brick_map, vector<string>& vec2) {
    
    typedef unordered_map<string, string>::value_type map_value_type;
    while (true) {
        

        unordered_map<string, string>::const_iterator got = find_if(brick_map.begin(), brick_map.end(), [&start](const map_value_type& vt)
            { return vt.first == start; }
        );
        if (got == brick_map.end()) {
            vec2.push_back(start);

            cout << "all forward bricks sorted" << endl;
            
            break;
        }
        else if (got->first == start) {
            vec2.push_back(start);

            //cout << "found " << start << " and " << vec[index].first << endl;
            start = got->second;
            
        }
    }
    auto it = brick_map.begin();
    search_bricks_backwards(it->first, brick_map, vec2);
    


    

    

}


template<typename T>
void printVectorElements(vector<T>& vec)
{
    for (auto i = 0; i < vec.size(); ++i) {
        cout << "(" << vec.at(i).first << ","
            << vec.at(i).second << ")" << " ";
    }
    cout << endl;
}

vector<string> split(string s, string delimiter) {
    size_t pos_start = 0, pos_end, delim_len = delimiter.length();
    string token;
    vector<string> res;

    while ((pos_end = s.find(delimiter, pos_start)) != string::npos) {
        token = s.substr(pos_start, pos_end - pos_start);
        pos_start = pos_end + delim_len;
        res.push_back(token);
    }

    res.push_back(s.substr(pos_start));
    return res;
}


int main()
{
    unordered_map<string, string> bricks;
    
    vector<string> sorted_bricks;

    ifstream inFile;
    inFile.open("input-pairs-50K.txt"); //open the input file

    for (string line; getline(inFile, line); )
    {

        string delimiter = ",";
        string s = line;
        vector<string> v = split(s, delimiter);
        string s1 = v.at(0);
        string s2 = v.at(1);


        bricks.insert(make_pair(s1, s2));
    }


    /*for (auto& x : bricks)
        std::cout << x.first << "," << x.second << " ";*/


    auto it = bricks.begin();
    search_bricks(it->second, bricks, sorted_bricks);


    // display results
    for (auto i = sorted_bricks.begin(); i != sorted_bricks.end(); ++i)
        cout << *i << " ";




}

I'm looking to improve the time complexity of my code to be able to process the data more eficiently, if anyone can suggest what to improve in my code or container wise I'd be very thankful.

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(3

累赘 2025-01-16 05:25:15

首先,以目的为导向,类比一下这里真正所做的事情。这个问题有时会出现在面试问题中。它通常被表述为一组巴士票:

您正走在街上,手里拿着厚厚一叠巴士票,准备旋风般地游览周边国家。每张票都有一个出发城市和一个结束城市。你不小心把票掉在地上,它们就被吹得到处都是。你拿起它们,但随后发现它们坏了。您的任务是将它们按顺序放回原处,这样您就不必在每次需要使用下一张票时都搜索堆栈。

例如,其中 Sn 是 s 公交车站 ID,Sn->Sm 表示从车站 Sn 到车站 Sm 的行程。给定以下四张票,覆盖五个车站,没有特定的顺序:

S4->S2
S1->S5
S3->S4
S5->S3

正确的顺序可以这样想:

S1->S5
    S5->S3
        S3->S4
            S4->S2

因此,正确的“排序”顺序是

S1
S5
S3
S4
S2

算法分析

  • 算法中最大的杀手是那些序列扫描。地图提供关键查找操作。地图 find 成员是让它们勾选(并发光)的原因。如果你想发出这种尖叫声,你需要使用键控来做到这一点。
  • 紧随其后的是惩罚性的顺序扫描,即预先挂起的 std::vector 的高昂费用,这可能会变得非常昂贵。向量是连续的连续存储。附加到它们并没有那么糟糕,因为它们的子分配器往往会在扩展时过度分配,以便在必须重新分配之前为更多的推迟留出空间。但是前置它们是可怕的。

我扫描了您提供的源测试。数据中实际上有 49999 行,而不是 50000 行。经过一番尝试、错误、绞尽脑汁等之后,我发现:

  • bWyUVV 仅出现一次,作为左侧。
  • EZkYGM 在文件中仅出现一次,作为右侧

这些必须是排序列表的终结符。如果一切顺利并且数据是原始的,最终列表将以 bWyUVV 开头,并以 EZkYGM 结尾 对于编写算法没有帮助,但对于验证我们所做的绝对有帮助正确的事情。


全面的性能改进

在保留基本前提的同时,删除很多代码,请考虑以下几点:

#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <unordered_map>
#include <deque>
#include <algorithm>
#include <chrono>

void search_bricks_backwards
(
    std::string resume, 
    std::unordered_map<std::string, std::string>& rbricks, 
    std::deque<std::string>& dst
) 
{
    while (true) 
    {
        auto it = rbricks.find(resume);
        dst.emplace_front(std::move(resume));

        if (it == rbricks.end()) 
            break;

        resume = it->second;
        rbricks.erase(it);
    }
}

void search_bricks
(
    std::unordered_map<std::string, std::string>& bricks, 
    std::unordered_map<std::string, std::string>& rbricks, 
    std::deque<std::string>& dst
) 
{
    // remember these
    std::string start = bricks.begin()->second;
    std::string resume = bricks.begin()->first;

    while (true) 
    {
        auto it = bricks.find(start);
        dst.emplace_back(std::move(start));

        if (it == bricks.end()) 
            break;

        start = it->second;
        bricks.erase(it);
    }

    // same search, but different keyed map
    search_bricks_backwards(resume, rbricks, dst);
}


int main()
{
    using namespace std::chrono;

    std::unordered_map<std::string, std::string> bricks, rbricks;
    std::deque<std::string> sorted_bricks;

    std::ifstream inFile("input-pairs-50K.txt");
    if (inFile.is_open())
    {
        for (std::string line; std::getline(inFile, line);)
        {
            // make damn sure we get two keys
            std::istringstream iss(line);
            std::string s1, s2;
            if (std::getline(iss, s1, ',') && 
                std::getline(iss, s2) &&
                !s1.empty() &&
                !s2.empty())
            {
                bricks.emplace(std::make_pair(s1, s2));
                rbricks.emplace(std::make_pair(s2, s1));
            }
        }

        auto tp0 = high_resolution_clock::now();
        search_bricks(bricks, rbricks, sorted_bricks);
        auto tp1 = high_resolution_clock::now();

        std::cout << duration_cast<milliseconds>(tp1-tp0).count() << "ms\n";

        // display results
        int n = 0;
        for (auto i = sorted_bricks.begin(); i != sorted_bricks.end(); ++i)
            std::cout << ++n << ". " << *i << '\n';
    }
}

最显着的差异:

  • 使用 std ::deque 作为排序结果。您为这些矢量展示位置付出了高昂的代价。所有向量操作要么在后面推动(对于保留向量来说可以),要么在前面推动(向量中的性能很糟糕)。 std::deque 专门用于非常快速的前后插入和修剪。我们不做后者,而是大量做前者。

  • 两个映射,分别键控 s1==>s2 和 s2==>s1。有第三方容器可以为您执行此操作(例如: boost::bimap)。然而,为此,考虑到密钥大小,仅存储两个映射很容易。

  • 在适当的情况下使用移动语义(尽管不多)

  • 在搜索过程中,每个发现的键都会从刚刚搜索的地图中删除。这确保我们在应该停止的时候停止,并相当快地恢复内存占用。

  • 适用时参考参数。这些容器在构建结果集时被有效地销毁,但没关系(实际上可以保证正确检测一端或另一端的终端)。

双键操作带来了天壤之别。使用您的测试数据集在一台四年前的小型 MacBook Pro 笔记本电脑上构建的发布版本会产生以下结果(您可以通过已知答案测试来验证)。


所有代码都是使用 -O2 优化构建的,运行 Apple clang 版本 13.0.0 (clang-1300.0.29.30)

std::unordered_map

34ms
1. bWyUVV
2. mPBGRC
3. WfkvWy
4. vjWNHY
5. HudtyD
6. DhxjdV
7. kdWhGX
8. tIsDXh
9. eMVeMX
10. fVQoeG

... 49980 lines omitted ...

49990. YfBDnP
49991. sHKVrT
49992. ZzhoZV
49993. Dyunmj
49994. KCQbpj
49995. rbMSgD
49996. WKOksU
49997. qqMTnq
49998. llrqUI
49999. XYpBnk
50000. EZkYGM

该时序图相当一致,尽管有在我的测试中,异常值高达 42 毫秒,低至 28 毫秒。使用常规 std::map 的相同线束会产生:

std::map

92ms
1. bWyUVV
2. mPBGRC
3. WfkvWy
4. vjWNHY
5. HudtyD
6. DhxjdV
7. kdWhGX
8. tIsDXh
9. eMVeMX
10. fVQoeG

... 49980 lines omitted ...

49990. YfBDnP
49991. sHKVrT
49992. ZzhoZV
49993. Dyunmj
49994. KCQbpj
49995. rbMSgD
49996. WKOksU
49997. qqMTnq
49998. llrqUI
49999. XYpBnk
50000. EZkYGM

因此,您肯定使用了带有无序容器的正确容器。键控,您实际上并没有将键控用于算法的重要部分,使得容器基本上不比顺序扫描更好。这与您的评估相符,它确实比矢量顺序扫描解决方案好不了多少;不管怎样,你基本上都是这么做的。

我怀疑上面显示的性能应该能够运行您预期的 500 万对,只要您可以将其全部保留在内存中(两次,抱歉,但结果非常坦率地表明它值得这个价格)。


内存更友好(至少有一点)

以下代码执行相同的操作,但没有添加所有输出,并使用单个被调用函数以及飞行中转置的地图。这从一开始就减少了对两张地图的需求。整体性能与上面的代码进行比较;只是一个不同的、更内存友好的实现。

#include <iostream>
#include <fstream>
#include <algorithm>
#include <string>
#include <sstream>
#include <deque>
#include <unordered_map>
#include <chrono>

using map_type = std::unordered_map<std::string, std::string>;

std::deque<std::string> sort_bricks( map_type& bricks )
{
    std::deque<std::string> dst;

    // remember these
    std::string start = bricks.begin()->second;
    std::string resume = bricks.begin()->first;

    // process by append
    while (true) 
    {
        auto it = bricks.find(start);
        dst.emplace_back(std::move(start));

        if (it == bricks.end()) 
            break;

        start = std::move(it->second);
        bricks.erase(it);
    }

    // invert the remaining map
    {
        map_type mres;
        for (auto& pr : bricks)
            mres.emplace(std::move(pr.second), pr.first);
        std::swap(bricks, mres);
    }
    
    // process by prepend
    while (true) 
    {
        auto it = bricks.find(resume);
        dst.emplace_front(std::move(resume));

        if (it == bricks.end()) 
            break;

        resume = std::move(it->second);
        bricks.erase(it);
    }

    return dst;
}

int main()
{
    using namespace std::chrono;

    std::ifstream inFile("input-pairs-50K.txt");
    if (inFile.is_open())
    {
        map_type bricks;
        for (std::string line; std::getline(inFile, line);)
        {
            std::istringstream iss(line);
            std::string s1, s2;
            if (std::getline(iss, s1, ',') && 
                std::getline(iss, s2) &&
                !s1.empty() &&
                !s2.empty())
            {
                bricks.emplace(std::make_pair(s1, s2));
            }
        }

        auto tp0 = high_resolution_clock::now();
        auto sorted_bricks = sort_bricks(bricks);
        auto tp1 = high_resolution_clock::now();

        std::cout << "Size: " << sorted_bricks.size() << '\n';
        std::cout << "Time: " << duration_cast<milliseconds>(tp1-tp0).count() << "ms\n";
    }
}

First a purpose-driven analogy to what is really being done here. This problem sometimes comes up in interview questions. It is often phrased as a cluster of bus tickets:

You're walking down the street with a thick stack of bus tickets for your whirlwind tour of the surrounding countries. Each ticket has a starting city, and an ending city. You accidentally drop the tickets on the ground and they blow all over the place. You pick them up, but then realize they're out of order. Your task is to put them back in order so you don't have to search the stack each time you need to use the next ticket.

An example, where Sn is s bus station ID, and Sn->Sm denotes a trip from station Sn to station Sm. Given the following four tickets covering five stations in no particular order:

S4->S2
S1->S5
S3->S4
S5->S3

the proper order can be thought of like this:

S1->S5
    S5->S3
        S3->S4
            S4->S2

And therefore, the correct "sorted" order is

S1
S5
S3
S4
S2

Algorithm Analysis

  • The biggest killer in your algorithm are those sequence scans. Maps offer key lookup operations. A map find member is what makes them tick (and shine). If you want to make this scream, you need to use keying to do so.
  • A close runner up to that punishing sequential scan that is the exorbitant expense of pre-pending to a std::vector, which can get ridiculously expensive fast. Vectors a sequential contiguous storage. Appending to them is not quite as bad because their sub-allocators tend over-allocate on expansion to leave room for a few more push-backs before having to reallocate. But prepending to them is dreadful.

I scanned the source test you provided. There are actually 49999 rows in the data, not 50000. After some trial, error, head scratching, etc., I discovered this:

  • bWyUVV appears only once, as a left side.
  • EZkYGM appears in the file only once, as a right side

These must be the terminals of the sorted list. If everything plays out and the data is pristine, the final list will start with bWyUVV, and end with EZkYGM Not helpful for writing the algorithm, but definitely helpful for validating we did something right.


Performance Improvements All Around

Stripping a lot of code out of this while still keeping the basic premise, consider the following:

#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <unordered_map>
#include <deque>
#include <algorithm>
#include <chrono>

void search_bricks_backwards
(
    std::string resume, 
    std::unordered_map<std::string, std::string>& rbricks, 
    std::deque<std::string>& dst
) 
{
    while (true) 
    {
        auto it = rbricks.find(resume);
        dst.emplace_front(std::move(resume));

        if (it == rbricks.end()) 
            break;

        resume = it->second;
        rbricks.erase(it);
    }
}

void search_bricks
(
    std::unordered_map<std::string, std::string>& bricks, 
    std::unordered_map<std::string, std::string>& rbricks, 
    std::deque<std::string>& dst
) 
{
    // remember these
    std::string start = bricks.begin()->second;
    std::string resume = bricks.begin()->first;

    while (true) 
    {
        auto it = bricks.find(start);
        dst.emplace_back(std::move(start));

        if (it == bricks.end()) 
            break;

        start = it->second;
        bricks.erase(it);
    }

    // same search, but different keyed map
    search_bricks_backwards(resume, rbricks, dst);
}


int main()
{
    using namespace std::chrono;

    std::unordered_map<std::string, std::string> bricks, rbricks;
    std::deque<std::string> sorted_bricks;

    std::ifstream inFile("input-pairs-50K.txt");
    if (inFile.is_open())
    {
        for (std::string line; std::getline(inFile, line);)
        {
            // make damn sure we get two keys
            std::istringstream iss(line);
            std::string s1, s2;
            if (std::getline(iss, s1, ',') && 
                std::getline(iss, s2) &&
                !s1.empty() &&
                !s2.empty())
            {
                bricks.emplace(std::make_pair(s1, s2));
                rbricks.emplace(std::make_pair(s2, s1));
            }
        }

        auto tp0 = high_resolution_clock::now();
        search_bricks(bricks, rbricks, sorted_bricks);
        auto tp1 = high_resolution_clock::now();

        std::cout << duration_cast<milliseconds>(tp1-tp0).count() << "ms\n";

        // display results
        int n = 0;
        for (auto i = sorted_bricks.begin(); i != sorted_bricks.end(); ++i)
            std::cout << ++n << ". " << *i << '\n';
    }
}

The most notable differences:

  • Using std::deque<std::string> as the sorted results. You were paying a dear price for those vector placements. All of your vector operations were either pushing on the back (ok for reserved vectors) or pushing on the front (dreadful performance in vectors). The std::deque is specialized for very fast front and back insertion and pruning. We're not doing the latter, but are heavily doing the former.

  • Two maps, keying s1==>s2 and s2==>s1 respectively. there are third party containers that do this for you (ex: boost::bimap). For this, however, considering the key sizes, just storing two maps is easy.

  • Move semantics are used (though not much) where appropriate

  • During searches, each discovered key is deleted from the map just-searched. This ensures we stop when we're supposed to, and recovers memory footprint fairly quickly.

  • Reference parameters where applicable. These containers are effectively destroyed while building the result set, but that's ok (and in fact warranted to properly detect a terminal on one end or the other).

The double-keying makes a world of difference. A release build using your test data set on a puny little four-year-old macbook pro laptop produces the following (which you can verify with your know-answer tests).


All code is built with -O2 optimization running Apple clang version 13.0.0 (clang-1300.0.29.30)

std::unordered_map

34ms
1. bWyUVV
2. mPBGRC
3. WfkvWy
4. vjWNHY
5. HudtyD
6. DhxjdV
7. kdWhGX
8. tIsDXh
9. eMVeMX
10. fVQoeG

... 49980 lines omitted ...

49990. YfBDnP
49991. sHKVrT
49992. ZzhoZV
49993. Dyunmj
49994. KCQbpj
49995. rbMSgD
49996. WKOksU
49997. qqMTnq
49998. llrqUI
49999. XYpBnk
50000. EZkYGM

That timing figure is fairly consistent, though there were outliers as high as 42ms and as low as 28ms in my testing. The same harness using a regular std::map resulted in:

std::map

92ms
1. bWyUVV
2. mPBGRC
3. WfkvWy
4. vjWNHY
5. HudtyD
6. DhxjdV
7. kdWhGX
8. tIsDXh
9. eMVeMX
10. fVQoeG

... 49980 lines omitted ...

49990. YfBDnP
49991. sHKVrT
49992. ZzhoZV
49993. Dyunmj
49994. KCQbpj
49995. rbMSgD
49996. WKOksU
49997. qqMTnq
49998. llrqUI
49999. XYpBnk
50000. EZkYGM

So you're definitely using the right container with an unordered keying, you just weren't actually using the keying for a non-trivial portion of your algorithm, making the container basically no better than a sequential scan. That jives with your assessment it really wasn't any better than a vector sequential scan solution; you were basically doing that regardless.

I suspect performance shown above should be able to run sets of your expected 5-million pairs, so long as you can keep it all in memory (twice, sorry about that, but the results show pretty candidly that it is worth that price).


More Memory Friendly (at least a little)

The following does the same thing, but without all the added output, and uses a single called function with the map transposed mid-flight. This alleviates the need for two maps from inception. The overall performance is comparative to the code above; just a different, more memory friendly, implementation.

#include <iostream>
#include <fstream>
#include <algorithm>
#include <string>
#include <sstream>
#include <deque>
#include <unordered_map>
#include <chrono>

using map_type = std::unordered_map<std::string, std::string>;

std::deque<std::string> sort_bricks( map_type& bricks )
{
    std::deque<std::string> dst;

    // remember these
    std::string start = bricks.begin()->second;
    std::string resume = bricks.begin()->first;

    // process by append
    while (true) 
    {
        auto it = bricks.find(start);
        dst.emplace_back(std::move(start));

        if (it == bricks.end()) 
            break;

        start = std::move(it->second);
        bricks.erase(it);
    }

    // invert the remaining map
    {
        map_type mres;
        for (auto& pr : bricks)
            mres.emplace(std::move(pr.second), pr.first);
        std::swap(bricks, mres);
    }
    
    // process by prepend
    while (true) 
    {
        auto it = bricks.find(resume);
        dst.emplace_front(std::move(resume));

        if (it == bricks.end()) 
            break;

        resume = std::move(it->second);
        bricks.erase(it);
    }

    return dst;
}

int main()
{
    using namespace std::chrono;

    std::ifstream inFile("input-pairs-50K.txt");
    if (inFile.is_open())
    {
        map_type bricks;
        for (std::string line; std::getline(inFile, line);)
        {
            std::istringstream iss(line);
            std::string s1, s2;
            if (std::getline(iss, s1, ',') && 
                std::getline(iss, s2) &&
                !s1.empty() &&
                !s2.empty())
            {
                bricks.emplace(std::make_pair(s1, s2));
            }
        }

        auto tp0 = high_resolution_clock::now();
        auto sorted_bricks = sort_bricks(bricks);
        auto tp1 = high_resolution_clock::now();

        std::cout << "Size: " << sorted_bricks.size() << '\n';
        std::cout << "Time: " << duration_cast<milliseconds>(tp1-tp0).count() << "ms\n";
    }
}
寻找我们的幸福 2025-01-16 05:25:15

您可以使用 trie 数据结构,这里有一篇论文解释了执行此操作的算法: https://people.eng.unimelb.edu.au/jzobel/fulltext/acsc03sz.pdf

但是你必须从头开始实现 trie,因为据我所知知道 C++ 中没有默认的 trie 实现。

You can use a trie data structure, here's a paper that explains an algorithm to do that: https://people.eng.unimelb.edu.au/jzobel/fulltext/acsc03sz.pdf

But you have to implement the trie from scratch because as far as I know there is no default trie implementation in c++.

红焚 2025-01-16 05:25:15

回复较晚,但通过微小的改进,我们可以将处理时间加快 100%。或者将运行时间减少一半。

基本上 WhozCraig 选择的方法已经接近最佳。

通过使用 std::string_view 并选择不同的字符串分割方法,我可以将性能提高近 100%。大部分时间消耗在读取源文件和分割数据上。通过我选择的不同方法,我可以节省最多的时间。我添加了一些额外的小改进。

我用一个包含 5.000.000 对的大文件检查了这两种算法。我创建了一个模块来生成这样的测试文件。请查看下面的源代码。

我将展示我的源代码,给出的代码是“WhozCraig”,针对基准测试稍作修改。

但首先,请以屏幕截图形式查看基准测试结果。

使用问题中给出的原始小文件进行测试。我的解决方案:
输入图片此处描述

使用小测试文件的“WhozCraig”解决方案

在此处输入图像描述


然后我创建了一个 5M 对测试文件并运行相同的测试。我的方法

在此处输入图像描述

使用大测试文件的“WhozCraig”解决方案

在此处输入图像描述

请注意。使用“WhozCraig”的方法对大数据进行排序甚至更快。

但阅读速度较慢。

我没有调查原因。


当然,所有这些都取决于机器和编译器。我在一台已有 10 年历史的 Windows 7 机器上运行了测试。编译器是Visual Studio 2019社区版。以发布模式编译。


请参阅下面的代码示例:

#include <iostream>
#include <fstream>
#include <string>
#include <chrono>
#include <unordered_map>
#include <random>
#include <string_view>
#include <deque>
#include <iterator>
#include <unordered_set>
#include <algorithm>
#include <array>
#include <utility>

// ----------------------------------------------------------------------------------------------------------------
// Generate test data
// We will generate that many strings
constexpr size_t MaxPairs = 5'000'000u;

// The strings will all be 6 characters long and consist of upper- and lowercase letters
constexpr size_t StringLength = 6u;
constexpr std::array<char, 52> Letter{ 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
                                       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z' };

// Filename for test data
const std::string testFileName{"test.txt"};

// Create a file with test date following the given pattern
void createTestData() {

    // We need random numbers to select a letter from above string
    std::random_device randomDevice;
    std::mt19937 randomGenerator(randomDevice());
    std::uniform_int_distribution<> randomIntDistribution(0, Letter.size() - 1);

    // Function to create a random string
    auto generateString = [&]() -> std::string
    {
        std::string s(6, 0);
        for (size_t i{}; i < StringLength; ++i) 
            s[i] = Letter[randomIntDistribution(randomGenerator)]; 
        return s; };

    // We want to have unique, not sorted strings. So, use a std::unordered_set
    std::unordered_set<std::string> uniqueStrings{};

    // Now generate the requested numbers of unique strings
    while (uniqueStrings.size() < MaxPairs + 1) 
        uniqueStrings.insert(generateString());
    
    // Move them to a std::vector for random access possibility
    std::vector<std::string> sss(std::make_move_iterator(uniqueStrings.begin()), std::make_move_iterator(uniqueStrings.end()));

    // Now we create a vectore of pair of indices, like 0->1, 1->2, 2->3 and so on
    std::vector<std::pair<int, int>> index(MaxPairs);
    for (int i = 0; i < MaxPairs; ++i) {
        index[i].first = i;
        index[i].second = i + 1;
    }
    // This we will shuffle
    std::shuffle(index.begin(), index.end(), randomGenerator);

    // And then store the data in a text file
    if (std::ofstream ofs{ testFileName }; ofs) {
        for (size_t i = 0; i < MaxPairs; ++i)
            ofs << sss[index[i].first] << ',' << sss[index[i].second] << '\n';
    }
}

// We make the bucket size 3 times bigger then the number of elements.
// Maybe, but actually I do not know, this will reduce collisions
constexpr int MaxBucket = 16'777'216u;

// A very small speed boost by a bigger input buffer
constexpr size_t ifStreamBufferSize = 1'000'000u;
static char buffer[ifStreamBufferSize];

// And a more simple and faster hash funcion
struct MyHash {
    size_t operator()(const std::string_view& s) const {
        size_t hash = 0;
        for (const char c : s) hash = hash * 101 + c;
        return hash;
    }
};

int main() {
    // Please uncomment, if you want to create test Data
    //createTestData();

    // Start runtime measurement
    auto tp1 = std::chrono::high_resolution_clock::now();

    // Open source file and check if it could be opened
    if (std::ifstream ifs{ testFileName }; ifs) {

        // Set bigger file read buffer. Not sure, if it has really an effect.
        ifs.rdbuf()->pubsetbuf(buffer, ifStreamBufferSize);

        // Read complete source file into a string
        std::string text(std::istreambuf_iterator<char>(ifs), {});

        // We ant to identify the pair. A pair of a relation consists of a left and right part. Start of course with left
        bool readLeftSideOfRelation = true;

        // Start value for data analysis
        char* currentCharPtr = text.data();
        // End of string
        const char* const textEndPtr = currentCharPtr + text.length();

        // Define string views for the left and the right side of the relation 
        std::string_view firstView(currentCharPtr, StringLength);
        std::string_view secondView;

        // This is the simulation of a bimap
        std::unordered_map<std::string_view, std::string_view, MyHash> map1{};
        std::unordered_map<std::string_view, std::string_view, MyHash> map2{};
        // A liitle bit speed gain by reservin buffer space
        map1.reserve(MaxBucket);
        map2.reserve(MaxBucket);

        // Split the strings, create string_views and store in maps
        size_t count = 0;
        for (; currentCharPtr < textEndPtr; ++currentCharPtr, ++count) {

            // Split citerium
            if (*currentCharPtr < 'A') {

                // Set pointer to one after the delimiting character
                ++currentCharPtr;

                // Either we are just reading the left or the right side
                if (readLeftSideOfRelation)
                    secondView = { currentCharPtr , count };
                else
                {
                    // Now we read the right side of the relation. And with that a pair
                    // Store relation in each direction
                    map1[firstView] = secondView;
                    map2[secondView] = firstView;

                    // Start start of next left side
                    firstView = { currentCharPtr , count };
                }
                // Next, we want to read the other part of the relation
                readLeftSideOfRelation = not readLeftSideOfRelation;
                count = 0;
            }
        }
        // Here we will store the sorted result
        std::deque<std::string_view> result{};

        auto tp3 = std::chrono::high_resolution_clock::now();

        // Set parts of the relation
        std::string_view related = map1.begin()->second;
        std::string_view relatedSave = map1.begin()->first;

        // Go through the sequence
        for (;;) {
            auto it = map1.find(related);
            result.push_back(related);
            if (it == map1.end())
                break;
            related = it->second;
            map1.erase(it);
        }
        // We reached end, search in other direction
        related = relatedSave;
        for (;;) {
            auto it = map2.find(related);
            result.push_front(related);

            if (it == map2.end())
                break;
            related = it->second;
            map2.erase(it);
        }

        // Show timing result
        auto tp4 = std::chrono::high_resolution_clock::now();
        std::cout << "Sort: " << duration_cast<std::chrono::milliseconds>(tp4 - tp3).count() << "ms\n";

        auto tp2 = std::chrono::high_resolution_clock::now();
        std::cout << "Overall: " << duration_cast<std::chrono::milliseconds>(tp2 - tp1).count() << "ms\n";
    
        std::cout << "Count:  " << result.size() << '\n';
    }
}


“WhozCraig”的解决方案,用于附加运行时间测量并用于测试。

#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <unordered_map>
#include <deque>
#include <algorithm>
#include <chrono>

void search_bricks_backwards
(
    std::string resume,
    std::unordered_map<std::string, std::string>& rbricks,
    std::deque<std::string>& dst
)
{
    while (true)
    {
        auto it = rbricks.find(resume);
        dst.emplace_front(std::move(resume));

        if (it == rbricks.end())
            break;

        resume = it->second;
        rbricks.erase(it);
    }
}

void search_bricks
(
    std::unordered_map<std::string, std::string>& bricks,
    std::unordered_map<std::string, std::string>& rbricks,
    std::deque<std::string>& dst
)
{
    // remember these
    std::string start = bricks.begin()->second;
    std::string resume = bricks.begin()->first;

    while (true)
    {
        auto it = bricks.find(start);
        dst.emplace_back(std::move(start));

        if (it == bricks.end())
            break;

        start = it->second;
        bricks.erase(it);
    }
    // same search, but different keyed map
    search_bricks_backwards(resume, rbricks, dst);
}


int main()
{
    using namespace std::chrono;
    //createTestData();
    auto tp0 = high_resolution_clock::now();

    std::unordered_map<std::string, std::string> bricks, rbricks;
    std::deque<std::string> sorted_bricks;

    std::ifstream inFile("test.txt");
    if (inFile.is_open())
    {
        for (std::string line; std::getline(inFile, line);)
        {
            // make damn sure we get two keys
            std::istringstream iss(line);
            std::string s1, s2;
            if (std::getline(iss, s1, ',') &&
                std::getline(iss, s2) &&
                !s1.empty() &&
                !s2.empty())
            {
                bricks.emplace(std::make_pair(s1, s2));
                rbricks.emplace(std::make_pair(s2, s1));
            }
        }
        auto tp3 = high_resolution_clock::now();

        search_bricks(bricks, rbricks, sorted_bricks);

        auto tp4 = high_resolution_clock::now();
        auto tp1 = high_resolution_clock::now();
        std::cout << "\nSort:  "<< duration_cast<milliseconds>(tp4 - tp3).count() << "ms\n";
        std::cout << "Overall:  " << duration_cast<milliseconds>(tp1 - tp0).count() << "ms\n";

        std::cout << "Count:  " << sorted_bricks.size() << '\n';
    }
}


在用户“subspring”的附加回答中,提出了一个特里树。

我认为这是不可行的。查看潜在的内存消耗。

我们有大写和小写字符。所以,总共 52 个字符。对于每个我们都需要一个 4 字节指针 + 1 个相关字符串的指针。那是
53*4 用于特里树的一层。字符串长度为 6。我们需要尝试 2 次。因此,经典树的总体内存消耗为:

((52+1)*4) ^ 6 * 2 = 181'570'446'368'768 ~182 TB。

与这个数字相比,我电脑中的 128GB RAM 小得离谱。

因此,我使用 std:unordered_map 作为链接元素实现了一个具有空间优化设计的 trie。

这样我就可以将 RAM 内存消耗减少到 25GB。

但这里的结果也令人失望。

就算是有好意,也至少慢了三倍。 。 。

请参阅示例代码:

#include <iostream>
#include <fstream>
#include <string>
#include <chrono>
#include <unordered_map>
#include <vector>
#include <random>
#include <unordered_set>
#include <array>
#include <deque>
#include <list>
// ----------------------------------------------------------------------------------------------------------------
// Generate test data
// We will generate that many strings
constexpr size_t MaxPairs = 1'000'000u;
//constexpr size_t MaxPairs = 500000u;

// The strings will all be 6 characters long and consist of upper- and lowercase letters
constexpr size_t StringLength = 6u;
constexpr std::array<char, 52> Letter{ 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
                                       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z' };

// Filename for test data
const std::string testFileName{ "r:\\text1m.txt" };

// Create a file with test date following the given pattern
void createTestData() {

    // We need random numbers to select a letter from above string
    std::random_device randomDevice;
    std::mt19937 randomGenerator(randomDevice());
    std::uniform_int_distribution<> randomIntDistribution(0, (int)Letter.size() - 1);

    // Function to create a random string
    auto generateString = [&]() -> std::string
    {
        std::string s(6, 0);
        for (size_t i{}; i < StringLength; ++i)
            s[i] = Letter[randomIntDistribution(randomGenerator)];
        return s; };

    // We want to have unique, not sorted strings. So, use a std::unordered_set
    std::unordered_set<std::string> uniqueStrings{};

    // Now generate the requested numbers of unique strings
    while (uniqueStrings.size() < MaxPairs + 1)
        uniqueStrings.insert(generateString());

    // Move them to a std::vector for random access possibility
    std::vector<std::string> sss(std::make_move_iterator(uniqueStrings.begin()), std::make_move_iterator(uniqueStrings.end()));

    // Now we create a vectore of pair of indices, like 0->1, 1->2, 2->3 and so on
    std::vector<std::pair<int, int>> index(MaxPairs);
    for (int i = 0; i < MaxPairs; ++i) {
        index[i].first = i;
        index[i].second = i + 1;
    }
    // This we will shuffle
    std::shuffle(index.begin(), index.end(), randomGenerator);

    // And then store the data in a text file
    if (std::ofstream ofs{ testFileName }; ofs) {
        for (size_t i = 0; i < MaxPairs; ++i)
            ofs << sss[index[i].first] << ',' << sss[index[i].second] << '\n';
    }
}


struct TrieNode;
using CharacterWithLink = std::unordered_map<char, TrieNode*>;
using LinkIter = CharacterWithLink::iterator;

struct TrieNode {
    const char* related{};
    CharacterWithLink link{};
};

struct Trie {
    TrieNode root{};
    const char* beginOfWord{};
    LinkIter iter{};
    TrieNode* tnp{};
    std::vector< TrieNode> memory{};
    size_t index{};

    static constexpr size_t MaxMemory = static_cast<size_t>((float)MaxPairs * 3.6);
    //static constexpr size_t MaxMemory = static_cast<size_t>((float)MaxPairs * 10);
    Trie() { memory.resize(MaxMemory); }
    ~Trie() {}

    TrieNode* addWord(const char *characterOfWord) {
        tnp = &root;
        beginOfWord = characterOfWord;

        while (*characterOfWord >= 'A') {
            iter = tnp->link.find(*characterOfWord);

            if (iter == tnp->link.end())
                tnp->link[*characterOfWord] = &memory[index++];
            
            tnp = tnp->link[*characterOfWord];
            ++characterOfWord;
        }
        //tnp->related = beginOfWord;
        return tnp;
    }
    const char* find(const char* characterOfWord) {
        tnp = &root;
        const char* result{};
        while (*characterOfWord >= 'A') {
            iter = tnp->link.find(*characterOfWord);

            if (iter == tnp->link.end()) break;

            tnp = tnp->link[*characterOfWord];
            ++characterOfWord;
        }
        result = tnp->related;
        return result;
    }
};

struct BiRelationalTrie {
    Trie left{};
    Trie right{};

    void fill(std::string& text);
};

void BiRelationalTrie::fill(std::string & text) {

    const char* currentCharPtr{ text.data() };
    const char* textEndPtr{ text.data() + text.length() };

    // We want to identify the pair. A pair of a relation consists of a left and right part. Start of course with left
    bool readLeftSideOfRelation = true;

    const char* leftWordPtr{ text.data() };
    const char* rightWordPtr{};

    for(; currentCharPtr < textEndPtr; ++currentCharPtr) {

        // Split citerium
        if (*currentCharPtr < 'A') {

            // Set pointer to one after the delimiting character
            ++currentCharPtr;

            // Either we are just reading the left or the right side
            if (readLeftSideOfRelation)

                // We were reading the left site, now, set the start pointer for the right side
                rightWordPtr = currentCharPtr;
            else
            {
                // Now we have read the right side of the relation. And with that a complete pair
                // Store relation in each direction
                left.addWord(leftWordPtr)->related = rightWordPtr;
                right.addWord(rightWordPtr)->related = leftWordPtr;

                // Set start of next left side
                leftWordPtr = currentCharPtr;
            }
            // Next, we want to read the other part of the relation
            readLeftSideOfRelation = not readLeftSideOfRelation;
        }
    }
    std::cout << left.index << '\n';
}
// A very small speed boost by a bigger input buffer
constexpr size_t ifStreamBufferSize = 1'000'000u;
static char buffer[ifStreamBufferSize];



int main() {
    //createTestData();
    // Start runtime measurement 
    
    
    BiRelationalTrie biRelationalTrie{};

    // Here we will store the sorted result
    std::deque<const char *> result{};

    auto tp1 = std::chrono::high_resolution_clock::now();

    // Open source file and check if it could be opened
    if (std::ifstream ifs{ testFileName }; ifs) {

        // Set bigger file read buffer. Not sure, if it has really an effect.
        ifs.rdbuf()->pubsetbuf(buffer, ifStreamBufferSize);

        // Read complete source file into a string
        std::string text(std::istreambuf_iterator<char>(ifs), {});

        biRelationalTrie.fill(text);



        auto tp3 = std::chrono::high_resolution_clock::now();

        // Set parts of the relation
        const char *related = text.data()+7;
        const char* relatedSave = text.data();

        // Go through the sequence
        for (;;) {
            related = biRelationalTrie.left.find(related);
            if (related == nullptr)
                break;
            result.push_back(related);
        }
        // We reached end, search in other direction
        related = relatedSave;
        for (;;) {
            related = biRelationalTrie.right.find(related);
            if (related == nullptr)
                break;
            result.push_front(related);
        }

        // Show timing result
        auto tp4 = std::chrono::high_resolution_clock::now();
        std::cout << "Sort: " << std::chrono::duration_cast<std::chrono::milliseconds>(tp4 - tp3).count() << "ms\n";
        std::cout << result.size() << '\n';

        auto tp2 = std::chrono::high_resolution_clock::now();
        std::cout << "Overall: " << std::chrono::duration_cast<std::chrono::milliseconds>(tp2 - tp1).count() << "ms\n";
    }
}

Late answer, but with minor improvements we can speed up the processing time by 100%. Or reduce the runtime by half.

Basically the approach chosen by WhozCraig is already nearly optimal.

I could improve the performance by nearly 100%, by using std::string_view and selecting a different string split approach. Most the time is consumed with reading the source file and splitting the data. With my different selected approach, I could save the most time. I added some additional small improvements.

I checked both algorithms with a big file with 5.000.000 pairs. I created a module, to generate such a test file. Please have a look in the below source code.

I will show my source code and the code given be "WhozCraig", slighty modified for benchmark measurements.

But first, please see the benchmark results as screenshots.

Test with the original small file given in the quesion. My solution:
enter image description here

Solution of "WhozCraig" with small test file

enter image description here


Then I created a 5M pairs test file and run the same test. My approach

enter image description here

Solution of "WhozCraig" with big test file

enter image description here

Please note. Sorting big data with "WhozCraig"'s approach is even faster.

But reading is slower.

I did not investigate why.


Of course, all this is machine and compiler dependent. I ran the test on a 10 years old Windows 7 machine. The compiler is Visual Studio 2019 community editions. Compiled in release mode.


Please see code examples below:

#include <iostream>
#include <fstream>
#include <string>
#include <chrono>
#include <unordered_map>
#include <random>
#include <string_view>
#include <deque>
#include <iterator>
#include <unordered_set>
#include <algorithm>
#include <array>
#include <utility>

// ----------------------------------------------------------------------------------------------------------------
// Generate test data
// We will generate that many strings
constexpr size_t MaxPairs = 5'000'000u;

// The strings will all be 6 characters long and consist of upper- and lowercase letters
constexpr size_t StringLength = 6u;
constexpr std::array<char, 52> Letter{ 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
                                       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z' };

// Filename for test data
const std::string testFileName{"test.txt"};

// Create a file with test date following the given pattern
void createTestData() {

    // We need random numbers to select a letter from above string
    std::random_device randomDevice;
    std::mt19937 randomGenerator(randomDevice());
    std::uniform_int_distribution<> randomIntDistribution(0, Letter.size() - 1);

    // Function to create a random string
    auto generateString = [&]() -> std::string
    {
        std::string s(6, 0);
        for (size_t i{}; i < StringLength; ++i) 
            s[i] = Letter[randomIntDistribution(randomGenerator)]; 
        return s; };

    // We want to have unique, not sorted strings. So, use a std::unordered_set
    std::unordered_set<std::string> uniqueStrings{};

    // Now generate the requested numbers of unique strings
    while (uniqueStrings.size() < MaxPairs + 1) 
        uniqueStrings.insert(generateString());
    
    // Move them to a std::vector for random access possibility
    std::vector<std::string> sss(std::make_move_iterator(uniqueStrings.begin()), std::make_move_iterator(uniqueStrings.end()));

    // Now we create a vectore of pair of indices, like 0->1, 1->2, 2->3 and so on
    std::vector<std::pair<int, int>> index(MaxPairs);
    for (int i = 0; i < MaxPairs; ++i) {
        index[i].first = i;
        index[i].second = i + 1;
    }
    // This we will shuffle
    std::shuffle(index.begin(), index.end(), randomGenerator);

    // And then store the data in a text file
    if (std::ofstream ofs{ testFileName }; ofs) {
        for (size_t i = 0; i < MaxPairs; ++i)
            ofs << sss[index[i].first] << ',' << sss[index[i].second] << '\n';
    }
}

// We make the bucket size 3 times bigger then the number of elements.
// Maybe, but actually I do not know, this will reduce collisions
constexpr int MaxBucket = 16'777'216u;

// A very small speed boost by a bigger input buffer
constexpr size_t ifStreamBufferSize = 1'000'000u;
static char buffer[ifStreamBufferSize];

// And a more simple and faster hash funcion
struct MyHash {
    size_t operator()(const std::string_view& s) const {
        size_t hash = 0;
        for (const char c : s) hash = hash * 101 + c;
        return hash;
    }
};

int main() {
    // Please uncomment, if you want to create test Data
    //createTestData();

    // Start runtime measurement
    auto tp1 = std::chrono::high_resolution_clock::now();

    // Open source file and check if it could be opened
    if (std::ifstream ifs{ testFileName }; ifs) {

        // Set bigger file read buffer. Not sure, if it has really an effect.
        ifs.rdbuf()->pubsetbuf(buffer, ifStreamBufferSize);

        // Read complete source file into a string
        std::string text(std::istreambuf_iterator<char>(ifs), {});

        // We ant to identify the pair. A pair of a relation consists of a left and right part. Start of course with left
        bool readLeftSideOfRelation = true;

        // Start value for data analysis
        char* currentCharPtr = text.data();
        // End of string
        const char* const textEndPtr = currentCharPtr + text.length();

        // Define string views for the left and the right side of the relation 
        std::string_view firstView(currentCharPtr, StringLength);
        std::string_view secondView;

        // This is the simulation of a bimap
        std::unordered_map<std::string_view, std::string_view, MyHash> map1{};
        std::unordered_map<std::string_view, std::string_view, MyHash> map2{};
        // A liitle bit speed gain by reservin buffer space
        map1.reserve(MaxBucket);
        map2.reserve(MaxBucket);

        // Split the strings, create string_views and store in maps
        size_t count = 0;
        for (; currentCharPtr < textEndPtr; ++currentCharPtr, ++count) {

            // Split citerium
            if (*currentCharPtr < 'A') {

                // Set pointer to one after the delimiting character
                ++currentCharPtr;

                // Either we are just reading the left or the right side
                if (readLeftSideOfRelation)
                    secondView = { currentCharPtr , count };
                else
                {
                    // Now we read the right side of the relation. And with that a pair
                    // Store relation in each direction
                    map1[firstView] = secondView;
                    map2[secondView] = firstView;

                    // Start start of next left side
                    firstView = { currentCharPtr , count };
                }
                // Next, we want to read the other part of the relation
                readLeftSideOfRelation = not readLeftSideOfRelation;
                count = 0;
            }
        }
        // Here we will store the sorted result
        std::deque<std::string_view> result{};

        auto tp3 = std::chrono::high_resolution_clock::now();

        // Set parts of the relation
        std::string_view related = map1.begin()->second;
        std::string_view relatedSave = map1.begin()->first;

        // Go through the sequence
        for (;;) {
            auto it = map1.find(related);
            result.push_back(related);
            if (it == map1.end())
                break;
            related = it->second;
            map1.erase(it);
        }
        // We reached end, search in other direction
        related = relatedSave;
        for (;;) {
            auto it = map2.find(related);
            result.push_front(related);

            if (it == map2.end())
                break;
            related = it->second;
            map2.erase(it);
        }

        // Show timing result
        auto tp4 = std::chrono::high_resolution_clock::now();
        std::cout << "Sort: " << duration_cast<std::chrono::milliseconds>(tp4 - tp3).count() << "ms\n";

        auto tp2 = std::chrono::high_resolution_clock::now();
        std::cout << "Overall: " << duration_cast<std::chrono::milliseconds>(tp2 - tp1).count() << "ms\n";
    
        std::cout << "Count:  " << result.size() << '\n';
    }
}


Solution of "WhozCraig", instrumented for aditional runtime measurement and used for the tests.

#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <unordered_map>
#include <deque>
#include <algorithm>
#include <chrono>

void search_bricks_backwards
(
    std::string resume,
    std::unordered_map<std::string, std::string>& rbricks,
    std::deque<std::string>& dst
)
{
    while (true)
    {
        auto it = rbricks.find(resume);
        dst.emplace_front(std::move(resume));

        if (it == rbricks.end())
            break;

        resume = it->second;
        rbricks.erase(it);
    }
}

void search_bricks
(
    std::unordered_map<std::string, std::string>& bricks,
    std::unordered_map<std::string, std::string>& rbricks,
    std::deque<std::string>& dst
)
{
    // remember these
    std::string start = bricks.begin()->second;
    std::string resume = bricks.begin()->first;

    while (true)
    {
        auto it = bricks.find(start);
        dst.emplace_back(std::move(start));

        if (it == bricks.end())
            break;

        start = it->second;
        bricks.erase(it);
    }
    // same search, but different keyed map
    search_bricks_backwards(resume, rbricks, dst);
}


int main()
{
    using namespace std::chrono;
    //createTestData();
    auto tp0 = high_resolution_clock::now();

    std::unordered_map<std::string, std::string> bricks, rbricks;
    std::deque<std::string> sorted_bricks;

    std::ifstream inFile("test.txt");
    if (inFile.is_open())
    {
        for (std::string line; std::getline(inFile, line);)
        {
            // make damn sure we get two keys
            std::istringstream iss(line);
            std::string s1, s2;
            if (std::getline(iss, s1, ',') &&
                std::getline(iss, s2) &&
                !s1.empty() &&
                !s2.empty())
            {
                bricks.emplace(std::make_pair(s1, s2));
                rbricks.emplace(std::make_pair(s2, s1));
            }
        }
        auto tp3 = high_resolution_clock::now();

        search_bricks(bricks, rbricks, sorted_bricks);

        auto tp4 = high_resolution_clock::now();
        auto tp1 = high_resolution_clock::now();
        std::cout << "\nSort:  "<< duration_cast<milliseconds>(tp4 - tp3).count() << "ms\n";
        std::cout << "Overall:  " << duration_cast<milliseconds>(tp1 - tp0).count() << "ms\n";

        std::cout << "Count:  " << sorted_bricks.size() << '\n';
    }
}


In an additional answer from user "subspring", a trie was proposed.

I think it is not feasible. Look at the potential memory consumption.

We have upper- and lowercase characters. So, overall 52 characters. For each we would need a 4 byte pointer + 1 pointer for related string. That is
53*4 for one level of the trie. The string length is 6. And we need 2 tries. So, the overall memory consumption for a classical tree would be:

((52+1)*4) ^ 6 * 2 = 181'570'446'368'768 ~182 TB.

The 128GB RAM that is in my computer is rediculous small compared to this number.

For that reason I implemented a trie with a space optimized design using std:unordered_maps as link element.

With that I could reduce the RAM memory consumption to 25GB.

But also here the result was disappointing.

Even with good will, it was at least 3 times slower . . .

See the example code:

#include <iostream>
#include <fstream>
#include <string>
#include <chrono>
#include <unordered_map>
#include <vector>
#include <random>
#include <unordered_set>
#include <array>
#include <deque>
#include <list>
// ----------------------------------------------------------------------------------------------------------------
// Generate test data
// We will generate that many strings
constexpr size_t MaxPairs = 1'000'000u;
//constexpr size_t MaxPairs = 500000u;

// The strings will all be 6 characters long and consist of upper- and lowercase letters
constexpr size_t StringLength = 6u;
constexpr std::array<char, 52> Letter{ 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
                                       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z' };

// Filename for test data
const std::string testFileName{ "r:\\text1m.txt" };

// Create a file with test date following the given pattern
void createTestData() {

    // We need random numbers to select a letter from above string
    std::random_device randomDevice;
    std::mt19937 randomGenerator(randomDevice());
    std::uniform_int_distribution<> randomIntDistribution(0, (int)Letter.size() - 1);

    // Function to create a random string
    auto generateString = [&]() -> std::string
    {
        std::string s(6, 0);
        for (size_t i{}; i < StringLength; ++i)
            s[i] = Letter[randomIntDistribution(randomGenerator)];
        return s; };

    // We want to have unique, not sorted strings. So, use a std::unordered_set
    std::unordered_set<std::string> uniqueStrings{};

    // Now generate the requested numbers of unique strings
    while (uniqueStrings.size() < MaxPairs + 1)
        uniqueStrings.insert(generateString());

    // Move them to a std::vector for random access possibility
    std::vector<std::string> sss(std::make_move_iterator(uniqueStrings.begin()), std::make_move_iterator(uniqueStrings.end()));

    // Now we create a vectore of pair of indices, like 0->1, 1->2, 2->3 and so on
    std::vector<std::pair<int, int>> index(MaxPairs);
    for (int i = 0; i < MaxPairs; ++i) {
        index[i].first = i;
        index[i].second = i + 1;
    }
    // This we will shuffle
    std::shuffle(index.begin(), index.end(), randomGenerator);

    // And then store the data in a text file
    if (std::ofstream ofs{ testFileName }; ofs) {
        for (size_t i = 0; i < MaxPairs; ++i)
            ofs << sss[index[i].first] << ',' << sss[index[i].second] << '\n';
    }
}


struct TrieNode;
using CharacterWithLink = std::unordered_map<char, TrieNode*>;
using LinkIter = CharacterWithLink::iterator;

struct TrieNode {
    const char* related{};
    CharacterWithLink link{};
};

struct Trie {
    TrieNode root{};
    const char* beginOfWord{};
    LinkIter iter{};
    TrieNode* tnp{};
    std::vector< TrieNode> memory{};
    size_t index{};

    static constexpr size_t MaxMemory = static_cast<size_t>((float)MaxPairs * 3.6);
    //static constexpr size_t MaxMemory = static_cast<size_t>((float)MaxPairs * 10);
    Trie() { memory.resize(MaxMemory); }
    ~Trie() {}

    TrieNode* addWord(const char *characterOfWord) {
        tnp = &root;
        beginOfWord = characterOfWord;

        while (*characterOfWord >= 'A') {
            iter = tnp->link.find(*characterOfWord);

            if (iter == tnp->link.end())
                tnp->link[*characterOfWord] = &memory[index++];
            
            tnp = tnp->link[*characterOfWord];
            ++characterOfWord;
        }
        //tnp->related = beginOfWord;
        return tnp;
    }
    const char* find(const char* characterOfWord) {
        tnp = &root;
        const char* result{};
        while (*characterOfWord >= 'A') {
            iter = tnp->link.find(*characterOfWord);

            if (iter == tnp->link.end()) break;

            tnp = tnp->link[*characterOfWord];
            ++characterOfWord;
        }
        result = tnp->related;
        return result;
    }
};

struct BiRelationalTrie {
    Trie left{};
    Trie right{};

    void fill(std::string& text);
};

void BiRelationalTrie::fill(std::string & text) {

    const char* currentCharPtr{ text.data() };
    const char* textEndPtr{ text.data() + text.length() };

    // We want to identify the pair. A pair of a relation consists of a left and right part. Start of course with left
    bool readLeftSideOfRelation = true;

    const char* leftWordPtr{ text.data() };
    const char* rightWordPtr{};

    for(; currentCharPtr < textEndPtr; ++currentCharPtr) {

        // Split citerium
        if (*currentCharPtr < 'A') {

            // Set pointer to one after the delimiting character
            ++currentCharPtr;

            // Either we are just reading the left or the right side
            if (readLeftSideOfRelation)

                // We were reading the left site, now, set the start pointer for the right side
                rightWordPtr = currentCharPtr;
            else
            {
                // Now we have read the right side of the relation. And with that a complete pair
                // Store relation in each direction
                left.addWord(leftWordPtr)->related = rightWordPtr;
                right.addWord(rightWordPtr)->related = leftWordPtr;

                // Set start of next left side
                leftWordPtr = currentCharPtr;
            }
            // Next, we want to read the other part of the relation
            readLeftSideOfRelation = not readLeftSideOfRelation;
        }
    }
    std::cout << left.index << '\n';
}
// A very small speed boost by a bigger input buffer
constexpr size_t ifStreamBufferSize = 1'000'000u;
static char buffer[ifStreamBufferSize];



int main() {
    //createTestData();
    // Start runtime measurement 
    
    
    BiRelationalTrie biRelationalTrie{};

    // Here we will store the sorted result
    std::deque<const char *> result{};

    auto tp1 = std::chrono::high_resolution_clock::now();

    // Open source file and check if it could be opened
    if (std::ifstream ifs{ testFileName }; ifs) {

        // Set bigger file read buffer. Not sure, if it has really an effect.
        ifs.rdbuf()->pubsetbuf(buffer, ifStreamBufferSize);

        // Read complete source file into a string
        std::string text(std::istreambuf_iterator<char>(ifs), {});

        biRelationalTrie.fill(text);



        auto tp3 = std::chrono::high_resolution_clock::now();

        // Set parts of the relation
        const char *related = text.data()+7;
        const char* relatedSave = text.data();

        // Go through the sequence
        for (;;) {
            related = biRelationalTrie.left.find(related);
            if (related == nullptr)
                break;
            result.push_back(related);
        }
        // We reached end, search in other direction
        related = relatedSave;
        for (;;) {
            related = biRelationalTrie.right.find(related);
            if (related == nullptr)
                break;
            result.push_front(related);
        }

        // Show timing result
        auto tp4 = std::chrono::high_resolution_clock::now();
        std::cout << "Sort: " << std::chrono::duration_cast<std::chrono::milliseconds>(tp4 - tp3).count() << "ms\n";
        std::cout << result.size() << '\n';

        auto tp2 = std::chrono::high_resolution_clock::now();
        std::cout << "Overall: " << std::chrono::duration_cast<std::chrono::milliseconds>(tp2 - tp1).count() << "ms\n";
    }
}

~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文