Huffman编码之文件压缩问题
1.实际问题
对于给定的.txt文本文件,用Huffman编码方法对.txt文件进行压缩,并计算压缩比。
2.程序实现
#include<time.h>
#include<algorithm>
#include<cstdlib>
#include<cstring>
#include<fstream>
#include<iomanip>
#include<iostream>
#include<queue>
#include<vector>
using namespace std;
// Shorthand for the 64-bit counters and sizes used throughout the program.
using LL = long long;
// Capacity of the file-path buffers declared in main.
constexpr int FILE_LENGTH = 1000;
// Maximum number of bytes read from the input file in one batch.
constexpr LL MAX_MEMORY = 3LL * 1024 * 1024;
// Number of distinct byte values (size of the frequency and code tables).
constexpr int KIND_OF_CHARACTER = 256;
// Capacity of one row of the Huffman code table, terminator included.
constexpr int HUFFMAN_CODE_LENGTH = 1000;
// Position of the original file size inside the compressed file.
constexpr int OFFSET = 20;
// Number of code bits packed into each byte of the compressed file.
constexpr int nBits = 8;
// One entry of the symbol table / Huffman tree.
// The global array `node` holds one entry per possible byte value.
struct Node {
    char c;       // symbol represented by this node
    int parent;   // index of the parent node
    int lChild;   // index of the left child
    int rChild;   // index of the right child
    int iNode;    // sequence number of this node in the tree array
    LL number;    // weight: number of occurrences of the symbol

    // Reversed comparison: std::priority_queue is a max-heap, so ordering
    // nodes by "larger weight is smaller" makes the lightest node surface first.
    friend bool operator<(Node a, Node b) {
        return b.number < a.number;
    }
} node[KIND_OF_CHARACTER];
// Huffman code of every byte value, stored as a NUL-terminated string of '0'/'1' characters.
char HuffmanCode[KIND_OF_CHARACTER][HUFFMAN_CODE_LENGTH];
// Prints every symbol that occurs in the input together with its count.
void CountKindOfCharacter();
// Builds the Huffman tree from the counts in `node`, fills HuffmanCode and
// returns the number of distinct symbols.
int BuildHuffmanTree();
// Encodes the file at filePath with HuffmanCode and writes the result to outPutFilePath.
void CompressFile(const char *filePath, const char *outPutFilePath, int numberOfNode);
// Packs the first `len` '0'/'1' characters of HTstr into bytes and writes them to Output.
void BitToInt(ofstream &Output, char *HTstr, LL len);
int main() {
char filePath[FILE_LENGTH] = "Aesop_Fables.txt";
char compressFilePath[FILE_LENGTH] = "Aesop_FablesFileCompressResult.txt";
// char filePath[FILE_LENGTH] = "graph.txt";
// char compressFilePath[FILE_LENGTH] = "GraphFileCompressResult.txt";
ifstream readIn;
readIn.open(filePath, ios::binary);
if (readIn.is_open() == 0) {
cout <<"Open failed!" << endl;
exit(0);
}
//计算所文件大小
readIn.seekg(0, ios::end);
LL fileSize = (LL)readIn.tellg(); //文件长度,字符总数量
readIn.seekg(0, ios::beg);
cout<<"fileSize=" <<fileSize<<endl;
//read data in batches, each time read MAX_MEMORY characters
int nTimes = (int)(fileSize / MAX_MEMORY);
if (fileSize % MAX_MEMORY != 0) nTimes++;
int kindsOfCharacter = 0;
cout<<"总共需要读取次数="<<nTimes<<endl;
for (int i = 1; i <= nTimes; i++) {
char *str = (char *)calloc(1, (MAX_MEMORY)*sizeof(char));
LL numberOfCharacter = MAX_MEMORY;
if (i == nTimes) {
numberOfCharacter = fileSize % MAX_MEMORY;
}
readIn.read(str, numberOfCharacter * sizeof(char));
str[numberOfCharacter] = '\0';
cout<<"第"<<i<<"次读取的字符串的长度="<<strlen(str)<<endl;
//统计每个字符频率
int lenStr = strlen(str);
for (LL j = 0; j < lenStr; j++) {
node[str[j]].number++;
node[str[j]].c = str[j];
}
free(str);
}
CountKindOfCharacter();
int numberOfNode = BuildHuffmanTree();//进行HuffmanTree建立
CompressFile(filePath, compressFilePath, numberOfNode); //进行文件压缩
readIn.close();
}
int BuildHuffmanTree(){
//apply 2 * KIND_OF_CHARACTER to store nodes of the Huffman tree
Node* HT = (Node *)malloc((2 * KIND_OF_CHARACTER) * sizeof(Node));
priority_queue<Node> q;
int numberOfNode = 0;
//把所有节点压人优先队列
for (int i = 0; i < KIND_OF_CHARACTER; i++) {
if (node[i].number != 0) {
node[i].iNode = numberOfNode;
node[i].c = i;
// node[i].parent = 0;
q.push(node[i]);
HT[numberOfNode] = node[i];
numberOfNode++;
}
}
cout<<"源文本文件中字符种类:"<<numberOfNode<<endl;
int jNode = numberOfNode;
while (q.size() > 1){
Node leftNode = q.top();//从优先队列中弹出两个概率最小的Node
q.pop();
Node rightNode = q.top();
q.pop();
int l = leftNode.iNode;
int r = rightNode.iNode;
HT[l].parent = jNode;
HT[r].parent = jNode;
//完善父亲节点信息
HT[jNode].c = ' ';
HT[jNode].iNode = jNode;
HT[jNode].lChild = l;
HT[jNode].rChild = r;
HT[jNode].number = leftNode.number + rightNode.number;
q.push(HT[jNode]);
jNode++;
}
HT[jNode-1].parent = -1;
cout<<"HT的信息:"<<endl;
cout<<"序号 字符 字符权重 左儿子 右儿子 父亲"<<endl;
for (int i = 0; i < jNode; i++){
cout <<setw(3)<<i<<setw(5)<<HT[i].c<<setw(12)<<HT[i].number<<setw(8)<<HT[i].lChild<<setw(8)<<HT[i].rChild<<setw(10)<< HT[i].parent<<endl;
}
//提取每个字符的Huffman code
cout<<"每种字符对应的Huffman码:"<<endl;
for (int i = 0; i < numberOfNode; i++) {
int k = 0;
int l = i;
char ch = HT[i].c;
for (int j = HT[i].parent; j != -1; j = HT[j].parent) {
if (HT[j].lChild == l) {//判断i是j的左儿子还是右儿子
HuffmanCode[ch][k] = '0';
}
else {
HuffmanCode[ch][k] = '1';
}
l = j;
k++;//当前符号的Huffman码长度
}
//码字翻转得到Huffman码
for (int j = 0; j < k / 2; j++) {
char temp = HuffmanCode[ch][j];
HuffmanCode[ch][j] = HuffmanCode[ch][k-1-j];
HuffmanCode[ch][k-1-j] = temp;
}
HuffmanCode[ch][k] = '\0';
cout << ch << " " <<HuffmanCode[ch] << endl;
}
cout<<"字符种类="<<numberOfNode<<endl;
free(HT);
return numberOfNode;
}
void CompressFile(const char *filePath, const char *outPutFilePath, int numberOfNode){
ifstream readIn;
readIn.open(filePath, ios::binary);
if (readIn.is_open() == 0) {
cout << "Open failed!" << endl;
exit(0);
}
//对应的huffman码到输出文件
//写附加信息 number of bits added, OFFSET, size of original file.
//the number of kinds of character
ofstream Output;
Output.open(outPutFilePath, ios::binary);
if (Output.is_open() == 0) {
cout << "Open failed!!" << endl;
exit(0);
}
//计算文件大小
readIn.seekg(0, ios::end);
LL fileSize = (LL)readIn.tellg();
readIn.seekg(0, ios::beg);
//写附加信息到压缩文件
Output.seekp(OFFSET, ios::beg);
Output.write((char *)&fileSize, sizeof(LL));
Output.write((char *)&numberOfNode, sizeof(int));
//record the character and its Huffman code
for (int i = 0; i < KIND_OF_CHARACTER; i++) {
if (node[i].number != 0) {
Output.write((char *)&i, sizeof(char));
int bits = strlen(HuffmanCode[i]);
Output.write((char *)&bits, sizeof(int));
Output.write((char *)&HuffmanCode[i], bits*sizeof(char));
}
}
//批量读取数据,每次最多读MAX_MEMORY个字符并编码
int nTimes = (int)(fileSize / MAX_MEMORY);
if (fileSize % MAX_MEMORY != 0)
nTimes+=1;
int kindsOfCharacter = 0;
char *HTstr = (char *)calloc(1, (MAX_MEMORY+HUFFMAN_CODE_LENGTH)*sizeof(char));
int len = 0;
LL lenT = 0;
for (int i = 1; i <= nTimes; i++) {
char *str = (char *)calloc(1, (MAX_MEMORY+10)*sizeof(char));
LL numberOfCharacter = MAX_MEMORY;
if (i == nTimes) {
numberOfCharacter = fileSize % MAX_MEMORY;
}
readIn.read(str, numberOfCharacter * sizeof(char));
str[numberOfCharacter] = '\0';
for (LL j = 0; j < numberOfCharacter; j++) {
char ch = str[j];
lenT += strlen(HuffmanCode[ch]);
strcpy(HTstr+len, HuffmanCode[ch]);
len += strlen(HuffmanCode[ch]);
//write compressed file in batches
//when the length of encode string is greater than limited memory
if (len > MAX_MEMORY) {
// cout<<"****"<<endl;
LL leftBits = len % nBits;
LL changeLength = len - leftBits;
BitToInt(Output, HTstr, changeLength);
//if no left bits, no need to keep it.
strcpy(HTstr, HTstr+changeLength);
len = strlen(HTstr);
}
}
free(str);
}
if (len != 0) {
BitToInt(Output, HTstr, len);
}
free(HTstr);
readIn.close();
Output.close();
}
void BitToInt(ofstream &Output, char* HTstr, LL len) {
//add 0 to make the length of HTstr can be divide by 7
int k = 0;
if (len % nBits != 0) {
int bitsToAdd = nBits - (len % nBits);
streampos pos = Output.tellp();
Output.seekp(0, ios::beg);
Output.write((char *)&bitsToAdd, sizeof(int));
Output.write((char *)&OFFSET, sizeof(int));
Output.seekp(pos, ios::beg);
for (; k < bitsToAdd; k++){
HTstr[len+k] = '0';
}
HTstr[len+k] = '\0';
}
//char *buf = (char *)calloc(1, MAX_MEMORY * sizeof(char));
//convert bit to char
int pow = 1<<(nBits - 1);
int sum = 0;
for (LL i = 0, j = 0; i < len+k && HTstr[i]; i++) {
if (j == nBits){
Output.write((char *)&sum, sizeof(char));
j = 0;
sum = 0;
}
sum = sum + (HTstr[i]-'0') * (pow >> j);
j++;
}
// Output.write(buf, strlen(buf) * sizeof(char));
Output.write((char *)&sum, sizeof(char));
}
void CountKindOfCharacter(){
int kinds = 0;
for (int i = 0; i < KIND_OF_CHARACTER; i++) {
if (node[i].number != 0) {
cout << node[i].c << " " << node[i].number<<endl;
kinds++;
}
}
cout<<"源文本文件中字符种类:"<<kinds << endl;
}
程序运行结果:
对于graph.txt:
文件压缩前大小是2046KB,压缩之后大小为910KB;
压缩率=910/2046=44.5%
对于Aesop_Fables.txt:
文件压缩前大小是186KB,压缩之后大小为107KB;
压缩率=107/186=57.5%
本文介绍了一种使用Huffman编码对文本文件进行压缩的方法,并通过具体实例展示了如何实现该算法及压缩效果。

3587

被折叠的 条评论
为什么被折叠?



