Huffman编码之文件压缩问题

本文介绍了一种使用Huffman编码对文本文件进行压缩的方法,并通过具体实例展示了如何实现该算法及压缩效果。

Huffman编码之文件压缩问题

1.实际问题

对于给定的.txt文本文件,用Huffman编码方法对.txt文件进行压缩,并计算压缩比。

2.程序实现

#include<time.h>
#include<algorithm>
#include<cstdlib>
#include<cstring>
#include<fstream>
#include<iomanip>
#include<iostream>
#include<queue>
#include<vector>
using namespace std;
// 64-bit integer used for file sizes and symbol frequencies.
using LL = long long;

// Capacity of the fixed char buffers that hold file paths.
constexpr int FILE_LENGTH = 1000;
// Maximum number of bytes read from the input file per batch.
constexpr LL MAX_MEMORY = 3 * 1024 * 1024;
// Number of distinct byte values (size of the symbol alphabet).
constexpr int KIND_OF_CHARACTER = 256;
// Capacity (including the terminating NUL) of one textual Huffman code.
constexpr int HUFFMAN_CODE_LENGTH = 1000;
// Position of the original file size inside the compressed file.
constexpr int OFFSET = 20;
// Number of code bits packed into each byte of the compressed file.
constexpr int nBits = 8;
struct Node {
    char c; //字符
    int parent, lChild, rChild;//父亲节点,左儿子,右儿子 
    int iNode; //节点序列号 
    LL number; //对应权重,出现次数 
    friend bool operator < (Node a, Node b) {
        return a.number > b.number;
    }
}node[KIND_OF_CHARACTER];

// Textual Huffman code ('0'/'1' characters, NUL-terminated) for each byte
// value; filled in by BuildHuffmanTree() and read by CompressFile().
char HuffmanCode[KIND_OF_CHARACTER][HUFFMAN_CODE_LENGTH];
// Prints every symbol with a non-zero count and the number of such symbols.
void CountKindOfCharacter(); 
// Builds the Huffman tree from node[], fills HuffmanCode[], and returns the
// number of distinct symbols.
int BuildHuffmanTree();
// Encodes the file at filePath with HuffmanCode[] and writes the result,
// preceded by a header and the code table, to outPutFilePath.
void CompressFile(const char *filePath, const char *outPutFilePath, int numberOfNode);
// Packs the first len characters of the '0'/'1' string HTstr into bytes and
// writes them to Output.
void BitToInt(ofstream &Output, char *HTstr, LL len);
int main() {
    char filePath[FILE_LENGTH] = "Aesop_Fables.txt";  
    char compressFilePath[FILE_LENGTH] = "Aesop_FablesFileCompressResult.txt";
   // char filePath[FILE_LENGTH] = "graph.txt";  
   // char compressFilePath[FILE_LENGTH] = "GraphFileCompressResult.txt";
    ifstream readIn;
    readIn.open(filePath, ios::binary);
    if (readIn.is_open() == 0) {
        cout <<"Open failed!" << endl;
        exit(0);
    }
     //计算所文件大小 
    readIn.seekg(0, ios::end);
    LL fileSize = (LL)readIn.tellg();  //文件长度,字符总数量 
    readIn.seekg(0, ios::beg);
    cout<<"fileSize=" <<fileSize<<endl;
    //read data in batches, each time read MAX_MEMORY characters
    int nTimes = (int)(fileSize / MAX_MEMORY);
    if (fileSize % MAX_MEMORY != 0) nTimes++;
    int kindsOfCharacter = 0;

    cout<<"总共需要读取次数="<<nTimes<<endl;
    for (int i = 1; i <= nTimes; i++) {
        char *str = (char *)calloc(1, (MAX_MEMORY)*sizeof(char));

        LL numberOfCharacter = MAX_MEMORY;
        if (i == nTimes) {
            numberOfCharacter = fileSize % MAX_MEMORY;
        }
        readIn.read(str, numberOfCharacter * sizeof(char));
        str[numberOfCharacter] = '\0';
        cout<<"第"<<i<<"次读取的字符串的长度="<<strlen(str)<<endl;

        //统计每个字符频率 
        int lenStr = strlen(str);
        for (LL j = 0; j < lenStr; j++) {
            node[str[j]].number++;
            node[str[j]].c = str[j];
        }
        free(str);
    }
    CountKindOfCharacter();

    int numberOfNode = BuildHuffmanTree();//进行HuffmanTree建立
    CompressFile(filePath, compressFilePath, numberOfNode); //进行文件压缩 
    readIn.close();
}

int BuildHuffmanTree(){
    //apply 2 * KIND_OF_CHARACTER to store nodes of the Huffman tree
    Node* HT = (Node *)malloc((2 * KIND_OF_CHARACTER) * sizeof(Node));

    priority_queue<Node> q;
    int  numberOfNode = 0;
    //把所有节点压人优先队列 
    for (int i = 0; i < KIND_OF_CHARACTER; i++) {
        if (node[i].number != 0) {
            node[i].iNode = numberOfNode;
            node[i].c = i;
           // node[i].parent = 0;
            q.push(node[i]);
            HT[numberOfNode] = node[i];
            numberOfNode++;
        }
    }
    cout<<"源文本文件中字符种类:"<<numberOfNode<<endl;
    int jNode = numberOfNode;
    while (q.size() > 1){
        Node leftNode = q.top();//从优先队列中弹出两个概率最小的Node 
        q.pop();
        Node rightNode = q.top();
        q.pop();
        int l = leftNode.iNode;
        int r = rightNode.iNode;
        HT[l].parent = jNode;
        HT[r].parent = jNode; 
        //完善父亲节点信息 
        HT[jNode].c = ' ';
        HT[jNode].iNode = jNode;
        HT[jNode].lChild = l;
        HT[jNode].rChild = r;
        HT[jNode].number = leftNode.number + rightNode.number;
        q.push(HT[jNode]);
        jNode++;
    }
    HT[jNode-1].parent = -1;
    cout<<"HT的信息:"<<endl; 
    cout<<"序号  字符    字符权重   左儿子   右儿子   父亲"<<endl;
    for (int i = 0; i < jNode; i++){
        cout <<setw(3)<<i<<setw(5)<<HT[i].c<<setw(12)<<HT[i].number<<setw(8)<<HT[i].lChild<<setw(8)<<HT[i].rChild<<setw(10)<< HT[i].parent<<endl;

    }
    //提取每个字符的Huffman code 
    cout<<"每种字符对应的Huffman码:"<<endl;
    for (int i = 0; i < numberOfNode; i++) {
        int k = 0;
        int l = i;
        char ch = HT[i].c;
        for (int j = HT[i].parent; j != -1; j = HT[j].parent) {

            if (HT[j].lChild == l) {//判断i是j的左儿子还是右儿子 
                HuffmanCode[ch][k] = '0';
            }
            else {
                HuffmanCode[ch][k] = '1';
            }
            l = j;
            k++;//当前符号的Huffman码长度 
        }
        //码字翻转得到Huffman码 
        for (int j = 0; j < k / 2; j++) {
            char temp = HuffmanCode[ch][j];
            HuffmanCode[ch][j] = HuffmanCode[ch][k-1-j];
            HuffmanCode[ch][k-1-j] = temp;
        }
        HuffmanCode[ch][k] = '\0';
        cout << ch << " " <<HuffmanCode[ch] << endl;

    }
    cout<<"字符种类="<<numberOfNode<<endl;
    free(HT);
    return numberOfNode;
}

void CompressFile(const char *filePath, const char *outPutFilePath, int numberOfNode){
    ifstream readIn;
    readIn.open(filePath, ios::binary);
    if (readIn.is_open() == 0) {
        cout << "Open failed!" << endl;
        exit(0);
    }
    //对应的huffman码到输出文件 
    //写附加信息 number of bits added, OFFSET, size of original file. 
    //the number of kinds of character
    ofstream Output;
    Output.open(outPutFilePath, ios::binary);
    if (Output.is_open() == 0) {
        cout << "Open failed!!" << endl;
        exit(0);
    }

    //计算文件大小 
    readIn.seekg(0, ios::end);
    LL fileSize = (LL)readIn.tellg();
    readIn.seekg(0, ios::beg);
    //写附加信息到压缩文件 
    Output.seekp(OFFSET, ios::beg);
    Output.write((char *)&fileSize, sizeof(LL));
    Output.write((char *)&numberOfNode, sizeof(int));
    //record the character and its Huffman code
    for (int i = 0; i < KIND_OF_CHARACTER; i++) {
        if (node[i].number != 0) {
            Output.write((char *)&i, sizeof(char));
            int bits = strlen(HuffmanCode[i]);
            Output.write((char *)&bits, sizeof(int));
            Output.write((char *)&HuffmanCode[i], bits*sizeof(char));
        }
    }
    //批量读取数据,每次最多读MAX_MEMORY个字符并编码
    int nTimes = (int)(fileSize / MAX_MEMORY);
    if (fileSize % MAX_MEMORY != 0) 
        nTimes+=1;
    int kindsOfCharacter = 0;
    char *HTstr = (char *)calloc(1, (MAX_MEMORY+HUFFMAN_CODE_LENGTH)*sizeof(char));
    int len = 0;
    LL lenT = 0;
    for (int i = 1; i <= nTimes; i++) {
        char *str = (char *)calloc(1, (MAX_MEMORY+10)*sizeof(char));
        LL numberOfCharacter = MAX_MEMORY;
        if (i == nTimes) {
            numberOfCharacter = fileSize % MAX_MEMORY;
        }
        readIn.read(str, numberOfCharacter * sizeof(char));
        str[numberOfCharacter] = '\0';
        for (LL j = 0; j < numberOfCharacter; j++) {
            char ch = str[j];
            lenT += strlen(HuffmanCode[ch]);
            strcpy(HTstr+len, HuffmanCode[ch]);
            len += strlen(HuffmanCode[ch]);
            //write compressed file in batches
            //when the length of encode string is greater than limited memory
            if (len > MAX_MEMORY) {
               // cout<<"****"<<endl;
                LL leftBits = len % nBits;
                LL changeLength = len - leftBits;
                BitToInt(Output, HTstr, changeLength);
                //if no left bits, no need to keep it.
                strcpy(HTstr,  HTstr+changeLength);
                len = strlen(HTstr);
            }
        }
        free(str);
    }

    if (len != 0) {
        BitToInt(Output, HTstr, len);
    }
    free(HTstr);
    readIn.close();
    Output.close();
}

/**
 * Packs a string of '0'/'1' characters into bytes, most significant bit
 * first, and appends them to Output.
 *
 * If len is not a multiple of nBits, the last byte is completed with zero
 * bits, and the number of padded bits plus OFFSET are recorded at the start
 * of the file. The stream's write position is restored afterwards.
 *
 * @param Output  stream the packed bytes are appended to
 * @param HTstr   characters '0' and '1'; only the first len are read, and the
 *                buffer is not modified
 * @param len     number of characters to pack; nothing is written for
 *                len <= 0
 */
void BitToInt(ofstream &Output, char* HTstr, LL len) {
    if (len <= 0) {
        return;  // no bits, no bytes
    }

    // Record how many zero bits complete the final byte.
    const int usedInLastByte = static_cast<int>(len % nBits);
    if (usedInLastByte != 0) {
        const int bitsToAdd = nBits - usedInLastByte;
        const streampos pos = Output.tellp();
        Output.seekp(0, ios::beg);
        Output.write(reinterpret_cast<const char *>(&bitsToAdd), sizeof(int));
        Output.write(reinterpret_cast<const char *>(&OFFSET), sizeof(int));
        Output.seekp(pos, ios::beg);
    }

    // Pack into a byte buffer and write it once rather than byte by byte.
    // Storing real bytes, not the first byte of an int, keeps the output
    // independent of the host's byte order.
    vector<unsigned char> packed;
    packed.reserve(static_cast<size_t>((len + nBits - 1) / nBits));
    for (LL start = 0; start < len; start += nBits) {
        unsigned int value = 0;
        for (int b = 0; b < nBits; b++) {
            const LL idx = start + b;
            // Positions past len are the zero padding of the final byte.
            const unsigned int bit = (idx < len && HTstr[idx] == '1') ? 1u : 0u;
            value = (value << 1) | bit;
        }
        packed.push_back(static_cast<unsigned char>(value));
    }
    Output.write(reinterpret_cast<const char *>(&packed[0]),
                 static_cast<streamsize>(packed.size()));
}
void CountKindOfCharacter(){
    int kinds = 0;
    for (int i = 0; i < KIND_OF_CHARACTER; i++) {
        if (node[i].number != 0) {

            cout << node[i].c << " " << node[i].number<<endl;
            kinds++;
        }
    }
    cout<<"源文本文件中字符种类:"<<kinds << endl; 
}

程序运行结果:
这里写图片描述
对于graph.txt:
文件压缩前大小是2046KB,压缩之后大小为910KB;
压缩率=910/2046=44.5%
对于Aesop_Fables.txt:
文件压缩前大小是186KB,压缩之后大小为107KB
压缩率=107/186=57.5%

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值