Why is AsParallel() slower than foreach in this case?

Posted 2024-12-01 08:01:56


I am extracting data from excel that is in this format

 product1   | unnamedcol2 | product2  | unnamedcol4 | product3  | unnamedcol6 |
-------------------------------------------------------------------------------
 @1foo      |        1.10 | @1foo     |         0.3 | @1foo     |         0.3
 @2foo      |        1.00 | @2foo     |           2 | @2foo     |
 @3foo      |        1.52 | @3foo     |        2.53 | @3foo     |
 @4foo      |        1.47 |           |             | @4foo     |        1.31
 @5foo      |        1.49 |           |             | @5foo     |        1.31

The file uses all 255 fields. Using dapper-dot-net I get the data through this code:

IEnumerable<IDictionary<string, object>> excelDataRaw =
                conn.Query(string.Format("select * from {0}", table)).Cast<IDictionary<string, object>>();

I pass this data to the test methods below. They should return the data as IDictionaries where each key is a product and each value is an IDictionary in which each key is a value from the product column and the corresponding value is the value from the unnamed column to the right of it, like this:

var excelDataRefined = new List<IDictionary<string, IDictionary<string, decimal>>>();
excelDataRefined.Add(new Dictionary<string, IDictionary<string, decimal>>());
excelDataRefined[0].Add( "product", new Dictionary<string, decimal>());
excelDataRefined[0]["product"].Add("@1foo", 1.1m);

The methods:

private static Dictionary<string, IDictionary<string, decimal>> Benchmark_foreach(IEnumerable<IDictionary<string, object>> excelDataRaw)
{
    Console.WriteLine("1. Using foreach");
    var watch = new Stopwatch();
    watch.Start();

    List<string> headers = excelDataRaw.Select(dictionary => dictionary.Keys).First().ToList();
    bool isEven = false;
    List<string> products = headers.Where(h => isEven = !isEven).ToList();
    var dates = new List<IEnumerable<object>>();
    var prices = new List<IEnumerable<object>>();

    foreach (string field in headers)
    {
        string product1 = field;
        if (headers.IndexOf(field) % 2 == 0)
        {
            dates.Add(
                excelDataRaw.AsParallel().AsOrdered().Select(col => col[product1]).Where(row => row != null));
        }

        if (headers.IndexOf(field) % 2 == 1)
        {
            prices.Add(
                excelDataRaw.AsParallel().AsOrdered().Select(col => col[product1] ?? 0m).Take(dates.Last().Count()));
        }
    }

    watch.Stop();
    Console.WriteLine("Rearange the data in: {0}s", watch.Elapsed.TotalSeconds);
    watch.Restart();

    var excelDataRefined = new Dictionary<string, IDictionary<string, decimal>>();
    foreach (IEnumerable<object> datelist in dates)
    {
        decimal num;
        IEnumerable<object> datelist1 = datelist;
        IEnumerable<object> pricelist =
            prices[dates.IndexOf(datelist1)].Select(value => value ?? 0m).Where(
                content => decimal.TryParse(content.ToString(), out num));
        Dictionary<string, decimal> dict =
            datelist1.Zip(pricelist, (k, v) => new { k, v }).ToDictionary(
                x => (string)x.k, x => decimal.Parse(x.v.ToString()));

        if (!excelDataRefined.ContainsKey(products[dates.IndexOf(datelist1)]))
        {
            excelDataRefined.Add(products[dates.IndexOf(datelist1)], dict);
        }
    }

    watch.Stop();
    Console.WriteLine("Zipped the data in: {0}s", watch.Elapsed.TotalSeconds);

    return excelDataRefined;
}

private static Dictionary<string, IDictionary<string, decimal>> Benchmark_AsParallel(IEnumerable<IDictionary<string, object>> excelDataRaw)
{
    Console.WriteLine("2. Using AsParallel().AsOrdered().ForAll");
    var watch = new Stopwatch();
    watch.Start();

    List<string> headers = excelDataRaw.Select(dictionary => dictionary.Keys).First().ToList();
    bool isEven = false;
    List<string> products = headers.Where(h => isEven = !isEven).ToList();
    var dates = new List<IEnumerable<object>>();
    var prices = new List<IEnumerable<object>>();

    headers.AsParallel().AsOrdered().ForAll(
        field =>
        dates.Add(
            excelDataRaw.AsParallel().AsOrdered().TakeWhile(x => headers.IndexOf(field) % 2 == 0).Select(
                col => col[field]).Where(row => row != null).ToList()));
    headers.AsParallel().AsOrdered().ForAll(
        field =>
        prices.Add(
            excelDataRaw.AsParallel().AsOrdered().TakeWhile(x => headers.IndexOf(field) % 2 == 1).Select(
                col => col[field] ?? 0m).Take(256).ToList()));
    dates.RemoveAll(x => x.Count() == 0);
    prices.RemoveAll(x => x.Count() == 0);

    watch.Stop();
    Console.WriteLine("Rearange the data in: {0}s", watch.Elapsed.TotalSeconds);
    watch.Restart();

    var excelDataRefined = new Dictionary<string, IDictionary<string, decimal>>();
    foreach (IEnumerable<object> datelist in dates)
    {
        decimal num;
        IEnumerable<object> datelist1 = datelist;
        IEnumerable<object> pricelist =
            prices[dates.IndexOf(datelist1)].Select(value => value ?? 0m).Where(
                content => decimal.TryParse(content.ToString(), out num));
        Dictionary<string, decimal> dict =
            datelist1.Zip(pricelist, (k, v) => new { k, v }).ToDictionary(
                x => (string)x.k, x => decimal.Parse(x.v.ToString()));

        if (!excelDataRefined.ContainsKey(products[dates.IndexOf(datelist1)]))
        {
            excelDataRefined.Add(products[dates.IndexOf(datelist1)], dict);
        }
    }

    watch.Stop();
    Console.WriteLine("Zipped the data in: {0}s", watch.Elapsed.TotalSeconds);

    return excelDataRefined;
}

private static Dictionary<string, IDictionary<string, decimal>> Benchmark_ForEach(IEnumerable<IDictionary<string, object>> excelDataRaw)
{
    Console.WriteLine("3. Using ForEach");
    var watch = new Stopwatch();
    watch.Start();

    List<string> headers = excelDataRaw.Select(dictionary => dictionary.Keys).First().ToList();
    bool isEven = false;
    List<string> products = headers.Where(h => isEven = !isEven).ToList();
    var dates = new List<IEnumerable<object>>();
    var prices = new List<IEnumerable<object>>();

    headers.ForEach(
        field =>
        dates.Add(
            excelDataRaw.TakeWhile(x => headers.IndexOf(field) % 2 == 0).Select(col => col[field]).Where(
                row => row != null).ToList()));
    headers.ForEach(
        field =>
        prices.Add(
            excelDataRaw.TakeWhile(x => headers.IndexOf(field) % 2 == 1).Select(col => col[field] ?? 0m).
            Take(256).ToList()));
    dates.RemoveAll(x => x.Count() == 0);
    prices.RemoveAll(x => x.Count() == 0);

    watch.Stop();
    Console.WriteLine("Rearange the data in: {0}s", watch.Elapsed.TotalSeconds);
    watch.Restart();

    var excelDataRefined = new Dictionary<string, IDictionary<string, decimal>>();
    foreach (IEnumerable<object> datelist in dates)
    {
        decimal num;
        IEnumerable<object> datelist1 = datelist;
        IEnumerable<object> pricelist =
            prices[dates.IndexOf(datelist1)].Select(value => value ?? 0m).Where(
                content => decimal.TryParse(content.ToString(), out num));
        Dictionary<string, decimal> dict =
            datelist1.Zip(pricelist, (k, v) => new { k, v }).ToDictionary(
                x => (string)x.k, x => decimal.Parse(x.v.ToString()));

        if (!excelDataRefined.ContainsKey(products[dates.IndexOf(datelist1)]))
        {
            excelDataRefined.Add(products[dates.IndexOf(datelist1)], dict);
        }
    }

    watch.Stop();
    Console.WriteLine("Zipped the data in: {0}s", watch.Elapsed.TotalSeconds);

    return excelDataRefined;
}
  • Benchmark_foreach needs approx. 3.5 s to rearrange and 3 s to zip the data.
  • Benchmark_AsParallel needs approx. 12 s to rearrange and 0.005 s to zip the data.
  • Benchmark_ForEach needs approx. 16 s to rearrange and 0.005 s to zip the data.

Why does it behave like this? I expected AsParallel to be the fastest because it executes in parallel instead of sequentially. How do I optimize this?


3 Answers

浅唱々樱花落 2024-12-08 08:01:56


In order for parallel computation to happen you have to have multiple processors or cores; otherwise you are just queueing up tasks in the thread pool, waiting for the CPU. In other words, AsParallel on a single-core machine is sequential execution plus the overhead of the thread pool and thread context switches. Even on a two-core machine you may not get both cores, since lots of other things are running on the same machine.

Really, .AsParallel() only becomes useful if you have long-running tasks with blocking operations (I/O), where the OS can suspend the blocked thread and let another one run.
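
Not part of the answer, but a quick way to sanity-check that point is the following minimal sketch (class and variable names are mine, not from the question). It prints how many logical processors PLINQ has to work with and caps the degree of parallelism explicitly; on a one-core machine the parallel query is expected to be no faster than a plain sequential Sum.

using System;
using System.Linq;

class ParallelismCheck
{
    static void Main()
    {
        // How many logical processors PLINQ can actually spread work across.
        Console.WriteLine("Logical processors: {0}", Environment.ProcessorCount);

        int[] numbers = Enumerable.Range(0, 1000000).ToArray();

        // Cap the degree of parallelism explicitly; on a single-core box this
        // is effectively sequential work plus thread-pool scheduling overhead.
        long sum = numbers
            .AsParallel()
            .WithDegreeOfParallelism(Math.Max(1, Environment.ProcessorCount))
            .Sum(n => (long)n);

        Console.WriteLine("Sum: {0}", sum);
    }
}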

魂牵梦绕锁你心扉 2024-12-08 08:01:56


There is an overhead to creating the additional threads and managing the workload for each of them. If you have a limited amount of work, the cost of creating the extra threads, task-switching between them, work stealing and redistributing between them, etc. may outweigh the gains you get from parallelizing the work in the first place. You may want to profile your application to find out whether you are really CPU bound when running on a single thread. If not, it is best to keep it single-threaded; your bottleneck is then I/O, which is not as easy to parallelize.

A couple of additional recommendations: you are going to see a performance penalty from using AsOrdered and TakeWhile, because both need to synchronize back to the originating thread. Consider profiling without requiring ordering and see whether that offers any performance improvement.

Also, consider using a ConcurrentDictionary rather than the standard generic dictionary to avoid concurrency issues when adding items.
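
A minimal sketch of how those two suggestions might look applied to the "rearrange" step, assuming the column order can be recovered from the header name (so AsOrdered is not needed) and that the raw rows have already been materialized into a list; the method and variable names are mine, not from the question:

using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;

static class RearrangeSketch
{
    // Collects the non-null cells of every even-indexed (product) column.
    // ForAll runs the lambda on worker threads, so the target collection must
    // be thread-safe, hence ConcurrentDictionary instead of a List<T>.
    public static ConcurrentDictionary<string, List<object>> CollectProductColumns(
        IList<IDictionary<string, object>> rows, IList<string> headers)
    {
        var columns = new ConcurrentDictionary<string, List<object>>();

        headers.Where((header, index) => index % 2 == 0)   // product columns only
               .AsParallel()                               // no AsOrdered, no TakeWhile
               .ForAll(field =>
                   columns[field] = rows.Select(row => row[field])
                                        .Where(cell => cell != null)
                                        .ToList());

        return columns;
    }
}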

土豪 2024-12-08 08:01:56


In Benchmark_AsParallel and Benchmark_ForEach you perform 2n operations; in Benchmark_foreach only n.
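
To illustrate that point, here is a sketch of a single pass over the rows that builds the final product -> (date -> price) dictionary directly, so the source is enumerated once instead of once or twice per column. It assumes the Dapper rows expose their keys in the original column order and that the product cells are strings, as the question's casts already imply; the names are mine:

using System.Collections.Generic;
using System.Linq;

static class SinglePassSketch
{
    // Builds product -> (date -> price) in one pass over the rows instead of
    // re-enumerating the source once per column.
    public static Dictionary<string, IDictionary<string, decimal>> Refine(
        IEnumerable<IDictionary<string, object>> rows)
    {
        var result = new Dictionary<string, IDictionary<string, decimal>>();

        foreach (IDictionary<string, object> row in rows)
        {
            // Columns come in pairs: a product column, then its unnamed price column.
            List<string> keys = row.Keys.ToList();
            for (int i = 0; i + 1 < keys.Count; i += 2)
            {
                object date = row[keys[i]];
                object price = row[keys[i + 1]];
                if (date == null)
                    continue;                      // this product's column ended early

                decimal parsed;
                if (!decimal.TryParse((price ?? 0m).ToString(), out parsed))
                    continue;                      // skip cells that are not numeric

                IDictionary<string, decimal> column;
                if (!result.TryGetValue(keys[i], out column))
                    result[keys[i]] = column = new Dictionary<string, decimal>();

                column[(string)date] = parsed;
            }
        }

        return result;
    }
}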
