哈夫曼编码与解码
这是我的第一篇博客,希望大神们批评指正。
首先介绍以下什么是哈夫曼树(来自百度百科)
哈夫曼树─即最优二叉树,带权路径长度最小的二叉树,经常应用于数据压缩。 在计算机信息处理中,“哈夫曼编码”是一种一致性编码法(又称“熵编码法”),用于数据的无损耗压缩。这一术语是指使用一张特殊的编码表将源字符(例如某文件中的一个符号)进行编码。这张编码表的特殊之处在于,它是根据每一个源字符出现的估算概率而建立起来的(出现概率高的字符使用较短的编码,反之出现概率低的则使用较长的编码,这便使编码之后的字符串的平均期望长度降低,从而达到无损压缩数据的目的)。
构造哈夫曼树的主要思想:
构造哈夫曼树非常简单,将所有的节点放到一个队列中,用一个节点替换两个频率最低的节点,新节点的频率就是这两个节点的频率之和。这样,新节点就是两个被替换节点的父节点了。如此循环,直到队列中只剩一个节点(树根)。
这里用到了最小优先队列。
我这里用STL来实现。(这里有优先队列的介绍)
- template<typename T>
- struct cmp
- {
- bool operator()(TreeNode<T>* t1, TreeNode<T>* t2)
- {
- return !(*t1 < *t2);
- }
- };
优先队列的定义:
- priority_queue<TreeNode*,vector<TreeNode* >,cmp > pri_que;
哈夫曼树节点结构
- template<typename T>
- class TreeNode
- {
- public:
- TreeNode():pfather(NULL),plchild(NULL),prchild(NULL)
- {
- }
- T data;
- TreeNode *pfather;
- TreeNode *plchild;
- TreeNode *prchild;
- bool operator < (const TreeNode& rhs)
- {
- return data < rhs.data;
- }
- };
构造哈夫曼树
每次从最小优先队列取头两个节点,合并后放回最小优先队列,如此重复。
- template<typename T>
- TreeNode<T>* MakeHuffmanTree(T* begin, T* end) //构造哈夫曼树
- {
- priority_queue<TreeNode<T>*,vector<TreeNode<T>* >,cmp<T> > pri_que;
- T *iter = begin;
- TreeNode<T>* pNode;
- TreeNode<T>* pf = NULL;
- while(iter != end)
- {
- pNode = new TreeNode<T>;
- pNode->data = *iter++;
- pNode->pfather = pf;
- pri_que.push(pNode);
- }
- TreeNode<T>* plchild;
- TreeNode<T>* prchild;
- while(pri_que.size() > 1)//取两个小的合并
- {
- plchild = pri_que.top();
- pri_que.pop();
- prchild = pri_que.top();
- pri_que.pop();
- pNode = new TreeNode<T>;
- pNode->plchild = plchild;
- pNode->prchild = prchild;
- pNode->data = plchild->data + prchild->data;
- pri_que.push(pNode);
- }
- pNode = pri_que.top();
- pri_que.pop();
- return pNode;
- }
构造哈夫曼树这个函数的参数是一个结构体,保存着对应字符,其频率,编码值。
重载它的+运算符,为了合并时的+运算(只是频率相加)。
到此为止,已经可以把哈夫曼树生成出来了。
- template<typename T>
- struct mydata
- {
- mydata(){}
- mydata(int i):freq(i)
- {
- }
- string coded;
- int freq;
- T data;
- bool operator<(const mydata& rhs)
- {
- return freq < rhs.freq;
- }
- mydata operator+(mydata& rhs)
- {
- return mydata(freq + rhs.freq);
- }
- };
我们可以通过DFS将每个叶子节点的路径记录下来(用一个全局变量数组path),然后得到它的编码。
当发现当前节点是叶子节点,就把当前的路径赋值至该叶子节点的编码属性(coded)。
- const int MAXLEN = 20;
- char path[MAXLEN] = {0};
- template<typename T>
- void DFS(T* root,int deep = -1, char a = \'-\') //DFS 得到叶子节点的编码
- {
- if(root == NULL)
- return;
- if(a != \'-\')
- path[deep] = a;
- if(root->plchild == NULL || root->prchild == NULL)//leaf
- (root->data).coded = string(path,path + deep + 1);
- if(root->plchild != NULL)
- DFS(root->plchild, deep + 1, \'0\');
- if(root->prchild != NULL)
- DFS(root->prchild, deep + 1, \'1\');
- }
这样整个哈夫曼编码工作已经完成,为了查看我们的编码结果,我们可以用BFS跟DFS来看到我们的结果。在这里我选择了BFS。
当遍历到叶子节点,就将其编码属性(coded)和其对应字符输出。
- template<typename T,typename U>
- void BFS(T* root, mydata<U>* data) //BFS 将叶子节点的编码,提到data指向的数据
- {
- queue<T*> que;
- que.push(root);
- T* pT = NULL;
- while(!que.empty())
- {
- pT = que.front();
- //cout<<pT->data.freq<<endl;
- que.pop();
- if(pT->plchild != NULL)
- que.push(pT->plchild);
- if(pT->prchild != NULL)
- que.push(pT->prchild);
- if(pT->plchild == NULL || pT->prchild == NULL)// leaf 提取叶子节点的编码
- {
- //cout<<(pT->data).data<<":"<<(pT->data).coded<<endl;
- mydata<U>* pd = data;
- while((pT->data).data != pd->data)
- pd++;
- assert(pd->data == (pT->data).data);
- pd->coded = (pT->data).coded;
- }
- }
- }
测试驱动代码
- mydata<char> *pdata = new mydata<char>[4];
- pdata[0].data = \'a\';
- pdata[0].freq = 7;
- pdata[1].data = \'b\';
- pdata[1].freq = 5;
- pdata[2].data = \'c\';
- pdata[2].freq = 2;
- pdata[3].data = \'d\';
- pdata[3].freq = 4;
- TreeNode<mydata<char> >* pihuffmanTree = MakeHuffmanTree(pdata, pdata + 4);
- DFS(pihuffmanTree);
- BFS(pihuffmanTree);
为了更方便的使用我将这些封装到一个类里面。
- template<typename T>
- class Huffman
- {
- public:
- void Coded(string& coded);//传入待输出的编码
- void DeCode(const string& codedstr,string& decodestr);//输入待解码字符串,输出解码字符串
- void InputData(T* begin,T* end);//传入数据
- private:
- string FindVal(char c);
- void m_CalcFreq(T* begin, T* end);//计算输入数据的频率
- TreeNode<mydata<T> > *root;//huffman根节点
- mydata<T>* data;
- int data_size;
- T* m_begin;//保存原始数据的开始与结束的位置
- T* m_end;
- //string codedstr;
- };
输入数据并计算其频率。
用map容器来统计输入字符每个出现的个数。
- template<typename T>
- void Huffman<T>::InputData(T* begin, T* end)
- {
- this->m_begin = begin;
- this->m_end = end;
- m_CalcFreq(begin, end);
- }
- template<typename T>
- void Huffman<T>::m_CalcFreq(T* begin, T* end)
- {
- int len = end - begin;
- //data_size = len;
- if(len == 0)
- return;
- map<T,int> countMap;
- map<T,int>::iterator mapIter = countMap.begin();
- T *pT = begin;
- while(pT != end)
- {
- mapIter = countMap.find(*pT);
- if(mapIter != countMap.end())//在map里有没有字符*iter
- ++mapIter->second;
- else
- {
- countMap.insert(make_pair(*pT,1));
- }
- pT++;
- }
- data_size = countMap.size();
- data = new mydata<T>[data_size];
- int i = 0;
- for (mapIter = countMap.begin(); mapIter != countMap.end(); ++mapIter)
- {
- data[i].data = mapIter->first;
- data[i].freq = mapIter->second;
- i++;
- }
- }
编码
- template<typename T>
- void Huffman<T>::Coded(string& coded)
- {
- root = MakeHuffmanTree(data,data + data_size);
- DFS(root);
- BFS(root,data);
- cout<<"code:"<<endl;
- for (int i = 0; i < data_size; ++i)
- {
- cout<<data[i].data<<":"<<data[i].coded<<endl;
- }
- T *begin = m_begin;
- while (begin != m_end)
- {
- coded += FindVal(*begin);
- begin++;
- }
- //string subcode =
- }
解码
- template<typename T>
- void Huffman<T>::DeCode(const string& codedstr,string& decodestr)
- {
- string::const_iterator iter = codedstr.begin();
- TreeNode<mydata<T> >* curNode = root;
- while (iter != codedstr.end())
- {
- if (curNode->plchild == NULL || curNode->prchild == NULL)
- {
- decodestr += (curNode->data).data;
- curNode = root;
- continue;
- }
- if (*iter == \'0\')
- curNode = curNode->plchild;
- if(*iter == \'1\')
- curNode = curNode->prchild;
- iter++;
- }
- }
测试驱动程序
- char *pmystr = "cbcddddbbbbaaaaaaa";
- Huffman<char> h;
- h.InputData(pmystr, pmystr + 18);
- cout<<"originstr: "<<pmystr<<endl;
- string coded;
- h.Coded(coded);
- cout<<"coded: "<<coded<<endl;
- string decode;
- h.DeCode(coded,decode);
- cout<<"decode: "<<decode<<endl;
完整程序(环境:VS2012)
- #include <iostream>
- //#include <algorithm>
- #include <queue>
- #include <string>
- #include <vector>
- #include <cassert>
- #include <map>
- using namespace std;
- template<typename T>
- class TreeNode
- {
- public:
- TreeNode():pfather(NULL),plchild(NULL),prchild(NULL)
- {
- }
- T data;
- TreeNode *pfather;
- TreeNode *plchild;
- TreeNode *prchild;
- bool operator < (const TreeNode& rhs)
- {
- return data < rhs.data;
- }
- };
- template<typename T>
- struct cmp
- {
- bool operator()(TreeNode<T>* t1, TreeNode<T>* t2)
- {
- return !(*t1 < *t2);
- }
- };
- template<typename T>
- TreeNode<T>* MakeHuffmanTree(T* begin, T* end) //构造哈夫曼树
- {
- priority_queue<TreeNode<T>*,vector<TreeNode<T>* >,cmp<T> > pri_que;
- T *iter = begin;
- TreeNode<T>* pNode;
- TreeNode<T>* pf = NULL;
- while(iter != end)
- {
- pNode = new TreeNode<T>;
- pNode->data = *iter++;
- pNode->pfather = pf;
- pri_que.push(pNode);
- }
- TreeNode<T>* plchild;
- TreeNode<T>* prchild;
- while(pri_que.size() > 1)//取两个小的合并
- {
- //cout<<static_cast<TreeNode<T>* >(pri_que.top())->data<<endl;
- //pri_que.pop();
- plchild = pri_que.top();
- pri_que.pop();
- prchild = pri_que.top();
- pri_que.pop();
- pNode = new TreeNode<T>;
- pNode->plchild = plchild;
- pNode->prchild = prchild;
- pNode->data = plchild->data + prchild->data;
- pri_que.push(pNode);
- }
- pNode = pri_que.top();
- pri_que.pop();
- return pNode;
- }
- template<typename T>
- struct mydata
- {
- mydata(){}
- mydata(int i):freq(i)
- {
- }
- string coded;
- int freq;
- T data;
- bool operator<(const mydata& rhs)
- {
- return freq < rhs.freq;
- }
- mydata operator+(mydata& rhs)
- {
- return mydata(freq + rhs.freq);
- }
- };
- template<typename T,typename U>
- void BFS(T* root, mydata<U>* data) //BFS 将叶子节点的编码,提到data指向的数据
- {
- queue<T*> que;
- que.push(root);
- T* pT = NULL;
- while(!que.empty())
- {
- pT = que.front();
- //cout<<pT->data.freq<<endl;
- que.pop();
- if(pT->plchild != NULL)
- que.push(pT->plchild);
- if(pT->prchild != NULL)
- que.push(pT->prchild);
- if(pT->plchild == NULL || pT->prchild == NULL)// leaf 提取叶子节点的编码
- {
- //cout<<(pT->data).data<<":"<<(pT->data).coded<<endl;
- mydata<U>* pd = data;
- while((pT->data).data != pd->data)
- pd++;
- assert(pd->data == (pT->data).data);
- pd->coded = (pT->data).coded;
- }
- }
- }
- const int MAXLEN = 20;
- char path[MAXLEN] = {0};
- template<typename T>
- void DFS(T* root,int deep = -1, char a = \'-\') //DFS 得到叶子节点的编码
- {
- if(root == NULL)
- return;
- if(a != \'-\')
- path[deep] = a;
- if(root->plchild == NULL || root->prchild == NULL)//leaf
- (root->data).coded = string(path,path + deep + 1);
- if(root->plchild != NULL)
- DFS(root->plchild, deep + 1, \'0\');
- if(root->prchild != NULL)
- DFS(root->prchild, deep + 1, \'1\');
- }
- template<typename T>
- class Huffman
- {
- public:
- void Coded(string& coded);
- void DeCode(const string& codedstr,string& decodestr);
- void InputData(T* begin,T* end);
- private:
- string FindVal(char c);
- void m_CalcFreq(T* begin, T* end);
- TreeNode<mydata<T> > *root;
- mydata<T>* data;
- int data_size;
- T* m_begin;
- T* m_end;
- //string codedstr;
- };
- template<typename T>
- void Huffman<T>::InputData(T* begin, T* end)
- {
- this->m_begin = begin;
- this->m_end = end;
- m_CalcFreq(begin, end);
- }
- template<typename T>
- void Huffman<T>::m_CalcFreq(T* begin, T* end)
- {
- int len = end - begin;
- //data_size = len;
- if(len == 0)
- return;
- map<T,int> countMap;
- map<T,int>::iterator mapIter = countMap.begin();
- T *pT = begin;
- while(pT != end)
- {
- mapIter = countMap.find(*pT);
- if(mapIter != countMap.end())
- ++mapIter->second;
- else
- {
- countMap.insert(make_pair(*pT,1));
- }
- pT++;
- }
- data_size = countMap.size();
- data = new mydata<T>[data_size];
- int i = 0;
- for (mapIter = countMap.begin(); mapIter != countMap.end(); ++mapIter)
- {
- data[i].data = mapIter->first;
- data[i].freq = mapIter->second;
- i++;
- }
- }
- template<typename T>
- void Huffman<T>::Coded(string& coded)
- {
- root = MakeHuffmanTree(data,data + data_size);
- DFS(root);
- BFS(root,data);
- cout<<"code:"<<endl;
- for (int i = 0; i < data_size; ++i)
- {
- cout<<data[i].data<<":"<<data[i].coded<<endl;
- }
- T *begin = m_begin;
- while (begin != m_end)
- {
- coded += FindVal(*begin);
- begin++;
- }
- //string subcode =
- }
- template<typename T>
- void Huffman<T>::DeCode(const string& codedstr,string& decodestr)
- {
- string::const_iterator iter = codedstr.begin();
- TreeNode<mydata<T> >* curNode = root;
- while (iter != codedstr.end())
- {
- if (curNode->plchild == NULL || curNode->prchild == NULL)
- {
- decodestr += (curNode->data).data;
- curNode = root;
- continue;
- }
- if (*iter == \'0\')
- curNode = curNode->plchild;
- if(*iter == \'1\')
- curNode = curNode->prchild;
- iter++;
- }
- }
- template<typename T>
- string Huffman<T>::FindVal(char c)
- {
- for (int i = 0; i < data_size ; ++i)
- {
- if (c != data[i].data)
- continue;
- return data[i].coded;
- }
- return string();
- }
- int main()
- {
- //mydata<char> *pdata = new mydata<char>[4];
- //pdata[0].data = \'a\';
- //pdata[0].freq = 7;
- //pdata[1].data = \'b\';
- //pdata[1].freq = 5;
- //pdata[2].data = \'c\';
- //pdata[2].freq = 2;
- //pdata[3].data = \'d\';
- //pdata[3].freq = 4;
- ////int a[12]={14,10,56,7,83,22,36,91,3,47,72,0};
- //TreeNode<mydata<char> >* pihuffmanTree = MakeHuffmanTree(pdata, pdata + 4);
- //DFS(pihuffmanTree);
- //BFS(pihuffmanTree);
- //string str = "cbcddddbbbbaaaaaaa";
- char *pmystr = "cbcddddbbbbaaaaaaa";
- Huffman<char> h;
- h.InputData(pmystr, pmystr + 18);
- cout<<"originstr: "<<pmystr<<endl;
- string coded;
- h.Coded(coded);
- cout<<"coded: "<<coded<<endl;
- string decode;
- h.DeCode(coded,decode);
- cout<<"decode: "<<decode<<endl;
- return 0;
- }