Lossless compression of small blocks using a pre-computed dictionary

Posted 2024-07-26 03:24:03

I have an application where I am reading and writing small blocks of data (a few hundred bytes) hundreds of millions of times. I'd like to generate a compression dictionary based on an example data file and use that dictionary forever as I read and write the small blocks. I'm leaning toward the LZW compression algorithm. The Wikipedia page (http://en.wikipedia.org/wiki/Lempel-Ziv-Welch) lists pseudocode for compression and decompression. It looks fairly straightforward to modify it so that dictionary creation is a separate block of code. So I have two questions:

  1. Am I on the right track, or is there a better way?
  2. Why does the LZW algorithm add to the dictionary during the decompression step? Can I omit that, or would my dictionary lose efficiency?

Thanks.

Update: Now I'm thinking the ideal case would be to find a library that lets me store the dictionary separately from the compressed data. Does anything like that exist?

Update: I ended up taking the code at http://www.enusbaum.com/blog/2009/05/22/example-huffman-compression-routine-in-c and adapting it. I am Chris in the comments on that page. I emailed my modifications back to the blog author, but I haven't heard back yet. The compression rates I'm seeing with that code are not at all impressive. Maybe that is due to the 8-bit tree size.

Update: I converted it to 16 bits and the compression is better. It's also much faster than the original code.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;

namespace Book.Core
{
  public class Huffman16
  {
    private readonly double log2 = Math.Log(2);

    private List<Node> HuffmanTree = new List<Node>();

    internal class Node
    {
      public long Frequency { get; set; }
      public byte Uncoded0 { get; set; }
      public byte Uncoded1 { get; set; }
      public uint Coded { get; set; }
      public int CodeLength { get; set; }
      public Node Left { get; set; }
      public Node Right { get; set; }

      public bool IsLeaf
      {
        get { return Left == null; }
      }

      public override string ToString()
      {
        var coded = "00000000" + Convert.ToString(Coded, 2);
        return string.Format("Uncoded={0}, Coded={1}, Frequency={2}", (Uncoded1 << 8) | Uncoded0, coded.Substring(coded.Length - CodeLength), Frequency);
      }
    }

    public Huffman16(long[] frequencies)
    {
      if (frequencies.Length != ushort.MaxValue + 1)
      {
        throw new ArgumentException("frequencies.Length must equal " + (ushort.MaxValue + 1));
      }
      BuildTree(frequencies);
      EncodeTree(HuffmanTree[HuffmanTree.Count - 1], 0, 0);
    }

    public static long[] GetFrequencies(byte[] sampleData, bool safe)
    {
      if (sampleData.Length % 2 != 0)
      {
        throw new ArgumentException("sampleData.Length must be a multiple of 2.");
      }
      var histogram = new long[ushort.MaxValue + 1];
      if (safe)
      {
        //Seed every possible symbol with a count of 1 so it always receives a code,
        //even if it never appears in the sample data.
        for (int i = 0; i <= ushort.MaxValue; i++)
        {
          histogram[i] = 1;
        }
      }
      for (int i = 0; i < sampleData.Length; i += 2)
      {
        histogram[(sampleData[i] << 8) | sampleData[i + 1]] += 1000;
      }
      return histogram;
    }

    public byte[] Encode(byte[] plainData)
    {
      if (plainData.Length % 2 != 0)
      {
        throw new ArgumentException("plainData.Length must be a multiple of 2.");
      }

      Int64 iBuffer = 0;
      int iBufferCount = 0;

      using (MemoryStream msEncodedOutput = new MemoryStream())
      {
        //Write Final Output Size 1st
        msEncodedOutput.Write(BitConverter.GetBytes(plainData.Length), 0, 4);

        //Begin Writing Encoded Data Stream
        iBuffer = 0;
        iBufferCount = 0;
        for (int i = 0; i < plainData.Length; i += 2)
        {
          Node FoundLeaf = HuffmanTree[(plainData[i] << 8) | plainData[i + 1]];

          //How many bits are we adding?
          iBufferCount += FoundLeaf.CodeLength;

          //Shift the buffer
          iBuffer = (iBuffer << FoundLeaf.CodeLength) | FoundLeaf.Coded;

          //Are there at least 8 bits in the buffer?
          while (iBufferCount > 7)
          {
            //Write to output
            int iBufferOutput = (int)(iBuffer >> (iBufferCount - 8));
            msEncodedOutput.WriteByte((byte)iBufferOutput);
            iBufferCount = iBufferCount - 8;
            iBufferOutput <<= iBufferCount;
            iBuffer ^= iBufferOutput;
          }
        }

        //Write remaining bits in buffer
        if (iBufferCount > 0)
        {
          iBuffer = iBuffer << (8 - iBufferCount);
          msEncodedOutput.WriteByte((byte)iBuffer);
        }
        return msEncodedOutput.ToArray();
      }
    }

    public byte[] Decode(byte[] bInput)
    {
      long iInputBuffer = 0;
      int iBytesWritten = 0;

      //Establish Output Buffer to write unencoded data to
      byte[] bDecodedOutput = new byte[BitConverter.ToInt32(bInput, 0)];

      var current = HuffmanTree[HuffmanTree.Count - 1];

      //Begin Looping through Input and Decoding
      iInputBuffer = 0;
      for (int i = 4; i < bInput.Length; i++)
      {
        iInputBuffer = bInput[i];

        for (int bit = 0; bit < 8; bit++)
        {
          if ((iInputBuffer & 128) == 0)
          {
            current = current.Left;
          }
          else
          {
            current = current.Right;
          }
          if (current.IsLeaf)
          {
            bDecodedOutput[iBytesWritten++] = current.Uncoded1;
            bDecodedOutput[iBytesWritten++] = current.Uncoded0;
            if (iBytesWritten == bDecodedOutput.Length)
            {
              return bDecodedOutput;
            }
            current = HuffmanTree[HuffmanTree.Count - 1];
          }
          iInputBuffer <<= 1;
        }
      }
      throw new InvalidOperationException("Ran out of encoded input before the expected number of bytes was decoded.");
    }

    private static void EncodeTree(Node node, int depth, uint value)
    {
      if (node != null)
      {
        if (node.IsLeaf)
        {
          node.CodeLength = depth;
          node.Coded = value;
        }
        else
        {
          depth++;
          value <<= 1;
          EncodeTree(node.Left, depth, value);
          EncodeTree(node.Right, depth, value | 1);
        }
      }
    }

    private void BuildTree(long[] frequencies)
    {
      //SortedDictionary cannot hold duplicate keys, so each frequency gets a tiny,
      //strictly increasing fraction added to keep the keys unique without changing their order.
      var tiny = 0.1 / ushort.MaxValue;
      var fraction = 0.0;

      SortedDictionary<double, Node> trees = new SortedDictionary<double, Node>();
      for (int i = 0; i <= ushort.MaxValue; i++)
      {
        var leaf = new Node()
        {
          Uncoded1 = (byte)(i >> 8),
          Uncoded0 = (byte)(i & 255),
          Frequency = frequencies[i]
        };
        HuffmanTree.Add(leaf);
        if (leaf.Frequency > 0)
        {
          trees.Add(leaf.Frequency + (fraction += tiny), leaf);
        }
      }

      while (trees.Count > 1)
      {
        var e = trees.GetEnumerator();
        e.MoveNext();
        var first = e.Current;
        e.MoveNext();
        var second = e.Current;

        //Join smallest two nodes
        var NewParent = new Node();
        NewParent.Frequency = first.Value.Frequency + second.Value.Frequency;
        NewParent.Left = first.Value;
        NewParent.Right = second.Value;

        HuffmanTree.Add(NewParent);

        //Remove the two that just got joined into one
        trees.Remove(first.Key);
        trees.Remove(second.Key);

        trees.Add(NewParent.Frequency + (fraction += tiny), NewParent);
      }
    }

  }

}

Usage examples:

To create the dictionary from sample data:

var freqs = Huffman16.GetFrequencies(File.ReadAllBytes(@"D:\nodes"), true);

To initialize an encoder with a given dictionary:

var huff = new Huffman16(freqs);

And to do some compression:

var encoded = huff.Encode(raw);

And decompression:

var raw = huff.Decode(encoded);
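
Since the whole point is to keep the dictionary separate from the compressed blocks, the 65536-entry frequency table can be written to disk once and reloaded later to rebuild an identical encoder. Here is a minimal sketch of that, assuming a helper class of my own (FrequencyTableFile) and an arbitrary file path; it is not part of the code above, just one way to persist it.

using System.IO;
using Book.Core;

public static class FrequencyTableFile
{
  //Write the frequency table so it can ship separately from the compressed blocks.
  public static void Save(string path, long[] frequencies)
  {
    using (var writer = new BinaryWriter(File.Create(path)))
    {
      foreach (long f in frequencies)
      {
        writer.Write(f);
      }
    }
  }

  //Read the table back and rebuild an encoder that produces the exact same codes.
  public static Huffman16 Load(string path)
  {
    var frequencies = new long[ushort.MaxValue + 1];
    using (var reader = new BinaryReader(File.OpenRead(path)))
    {
      for (int i = 0; i < frequencies.Length; i++)
      {
        frequencies[i] = reader.ReadInt64();
      }
    }
    return new Huffman16(frequencies);
  }
}

With that in place, any reader or writer process can call FrequencyTableFile.Load at startup and then use Encode and Decode exactly as shown above.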



Comments (5)

彼岸花似海 2024-08-02 03:24:03


The hard part, in my mind, is how you build your static dictionary. You don't want to use the LZW dictionary built from your sample data. LZW wastes a bunch of time learning, since it can't build the dictionary faster than the decompressor can (a token will only be used the second time it's seen by the compressor, so the decompressor can add it to its dictionary the first time it's seen). The flip side of this is that it adds things to the dictionary that may never get used, just in case the string shows up again. (e.g., to have a token for 'stackoverflow' you'll also have entries for 'ac', 'ko', 've', 'rf', etc.)

However, looking at the raw token stream from an LZ77 algorithm could work well. You'll only see tokens for strings seen at least twice. You can then build a list of the most common tokens/strings to include in your dictionary.

Once you have a static dictionary, using LZW sans the dictionary update seems like an easy implementation, but to get the best compression I'd consider a static Huffman table instead of the traditional 12-bit fixed-size token (as George Phillips suggested). An LZW dictionary will burn tokens for all the sub-strings you may never actually encode (e.g., if you can encode 'stackoverflow', there will be tokens for 'st', 'sta', 'stac', 'stack', 'stacko', etc.).

At this point it really isn't LZW - what makes LZW clever is how the decompressor can build the same dictionary the compressor used while seeing only the compressed data stream. That's something you won't be using. But all LZW implementations have a state where the dictionary is full and no longer updated; that's how you'd use it with your static dictionary.
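
As a rough illustration of harvesting repeated strings, the sketch below (my own, not from this answer) counts short byte sequences that occur at least twice in a sample file and ranks them by a crude savings estimate of (occurrences - 1) * length; a real implementation would more likely use the LZ77 match stream or a suffix structure instead of this brute-force scan.

using System;
using System.Collections.Generic;
using System.Linq;

static class DictionarySeeder
{
  //Count every substring of length 2..maxLen and keep the ones seen at least twice,
  //ranked by roughly how many bytes a dictionary entry for them could save.
  //Brute force: fine for a modest sample file, too slow and memory-hungry for huge ones.
  public static List<byte[]> TopPhrases(byte[] sample, int maxLen, int take)
  {
    var counts = new Dictionary<string, int>();
    for (int len = 2; len <= maxLen; len++)
    {
      for (int i = 0; i + len <= sample.Length; i++)
      {
        //Base64 is just a cheap way to use raw bytes as a dictionary key.
        string key = Convert.ToBase64String(sample, i, len);
        counts.TryGetValue(key, out int c);
        counts[key] = c + 1;
      }
    }
    return counts
      .Where(kv => kv.Value >= 2)
      .OrderByDescending(kv => (long)(kv.Value - 1) * Convert.FromBase64String(kv.Key).Length)
      .Take(take)
      .Select(kv => Convert.FromBase64String(kv.Key))
      .ToList();
  }
}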

做个ˇ局外人 2024-08-02 03:24:03


LZW adds to the dictionary during decompression to ensure it has the same dictionary state as the compressor. Otherwise the decoding would not function properly.

However, if you were in a state where the dictionary was fixed, then yes, you would not need to add new codes.

Your approach will work reasonably well, and it's easy to use existing tools to prototype and measure the results. That is, compress the example file alone, then compress the example and test data together. The size of the latter minus the former will be the expected compressed size of a block.
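
One way to run that measurement without writing any compressor yourself is to lean on DeflateStream from System.IO.Compression: compress the sample alone, then the sample with one block appended, and treat the difference as the block's expected cost. A sketch, with helper names of my own choosing:

using System;
using System.IO;
using System.IO.Compression;

static class BlockCostEstimator
{
  //Deflate the given bytes into memory and return the compressed length.
  static long DeflatedLength(byte[] data)
  {
    using (var ms = new MemoryStream())
    {
      using (var deflate = new DeflateStream(ms, CompressionMode.Compress, true)) //true = leave ms open
      {
        deflate.Write(data, 0, data.Length);
      } //disposing the DeflateStream flushes the remaining compressed bytes into ms
      return ms.Length;
    }
  }

  //Expected compressed size of one block when the sample acts as shared context.
  public static long EstimateBlockCost(byte[] sample, byte[] block)
  {
    var combined = new byte[sample.Length + block.Length];
    Buffer.BlockCopy(sample, 0, combined, 0, sample.Length);
    Buffer.BlockCopy(block, 0, combined, sample.Length, block.Length);
    return DeflatedLength(combined) - DeflatedLength(sample);
  }
}

Keep in mind that deflate only looks back through a 32 KB window, so with a large sample only its tail actually acts as shared context for the appended block.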

LZW is a clever way to build up a dictionary on the fly and gives decent results. But a more thorough analysis of your typical data blocks is likely to generate a more efficient dictionary.

There's also room for improvement in how LZW represents compressed data. For instance, each dictionary reference could be Huffman-coded to a closer-to-optimal length based on the expected frequency of its use. To be truly optimal, the codes should be arithmetic-coded.

深府石板幽径 2024-08-02 03:24:03


I would look at your data to see if there's an obvious reason it's so easy to compress. You might be able to do something much simpler than LZ78. I've done both LZ77 (lookback) and LZ78 (dictionary).

Try running LZ77 on your data. There's no dictionary with LZ77, so you could use a library without alteration. Deflate is an implementation of LZ77.

Your idea of using a common dictionary is a good one, but it's hard to know whether the files are similar to each other or just self-similar without doing some tests.

无尽的现实 2024-08-02 03:24:03

  1. The right track is to use a library -- almost every modern language has a compression library. C#, Python, Perl, Java, VB.NET, whatever you use.

  2. LZW saves some space by making the dictionary depend on previous input. It has an initial dictionary, and as you decompress, you add entries to it, so the dictionary keeps growing. (I am omitting some details here, but this is the general idea.)

You can omit this step by supplying the whole (complete) dictionary as the initial one, but that would cost some space.
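
To see what supplying the complete dictionary up front looks like, here is a minimal sketch of coding against a fixed, preloaded phrase table that is never updated: the compressor greedily emits the code of the longest phrase it can match, and the decompressor is a plain table lookup. At that point it is really static-dictionary substitution rather than true LZW; the StaticDictionaryCoder name and phrase list are illustrative assumptions, not an existing library.

using System;
using System.Collections.Generic;
using System.Text;

static class StaticDictionaryCoder
{
  //Every single character must be present so the greedy match can never fail.
  //Assumes 8-bit character data for simplicity.
  public static Dictionary<string, ushort> BuildTable(IEnumerable<string> phrases)
  {
    var table = new Dictionary<string, ushort>();
    for (int c = 0; c < 256; c++)
    {
      table[((char)c).ToString()] = (ushort)c;
    }
    ushort next = 256;
    foreach (var p in phrases)
    {
      if (!table.ContainsKey(p)) table[p] = next++;
    }
    return table;
  }

  public static List<ushort> Encode(string input, Dictionary<string, ushort> table)
  {
    var output = new List<ushort>();
    int pos = 0;
    while (pos < input.Length)
    {
      //Greedy longest match against the fixed table; nothing is ever added to it.
      int len = Math.Min(input.Length - pos, 32); //cap phrase length for the sketch
      while (len > 1 && !table.ContainsKey(input.Substring(pos, len))) len--;
      output.Add(table[input.Substring(pos, len)]);
      pos += len;
    }
    return output;
  }

  public static string Decode(IEnumerable<ushort> codes, Dictionary<string, ushort> table)
  {
    //Invert the table once; decoding is then a straight lookup with no updates.
    var reverse = new Dictionary<ushort, string>();
    foreach (var kv in table) reverse[kv.Value] = kv.Key;
    var sb = new StringBuilder();
    foreach (var code in codes) sb.Append(reverse[code]);
    return sb.ToString();
  }
}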

野生奥特曼 2024-08-02 03:24:03


I find this approach quite interesting for repeated log entries, and it's something I would like to explore using.

Can you share the compression statistics for using this approach for your use case so I can compare it with other alternatives?

Have you considered having the common dictionary grow over time or is that not a valid option?
