# -*- coding: utf-8 -*-
# @Time : 2020/10/16 14:09 PM
# @Author : yangkaitong
# @FileName: preprocessing.py
######################################################################
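## On Windows, running the command below (e.g. from a .bat file) ##
## decompiles a .chm file to txt; some pages come out as html instead: ##
## hh -decompile D:\Desktop\ 123.chm ##
## Unfortunately some output is html, so this script converts html to excel ##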
######################################################################
import re
import os
import pandas as pd
def normalizing(text:str):
new_text = []
line = text.split("\n")
drop_line_flag = ["Top "," ","Previous ","body","Next"]
replace_line_flag = ["?"]
def drop_flag(cur_line):
for flag in drop_line_flag:
if flag in cur_line:
return True
return False
for cur_line in line:
cur_line = cur_line.strip()
if cur_line == "":
continue
elif drop_flag(cu