#-*-coding:utf-8-*-
import pandas as pd
# Keywords whose pairwise co-occurrence is counted.
KEYWORDS = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L']
# Input files are named "0_keywords.txt" ... "<INPUT_FILE_COUNT - 1>_keywords.txt".
INPUT_FILE_COUNT = 10
# Destination of the tab-separated co-occurrence matrix.
MATRIX_PATH = "词频共现矩阵(top10).txt"


def pair_key(first, second):
    """Return the dictionary key for the ordered keyword pair (first, second).

    The key has the form ``"first;second;"`` (note the trailing semicolon).
    """
    return str(first) + ';' + str(second) + ';'


def count_cooccurrences(lines, keywords, counts=None):
    """Count, for every ordered keyword pair, the lines containing both words.

    A keyword "occurs" in a line when it is a substring of that line. Pairs
    are ordered, so both ``"A;B;"`` and ``"B;A;"`` are counted, and the
    diagonal entry ``"A;A;"`` is the number of lines containing ``A``.

    Args:
        lines: Iterable of text lines to scan.
        keywords: Sequence of keyword strings.
        counts: Optional existing ``{pair_key: count}`` dict to accumulate
            into (it is updated in place); a new dict is created when omitted.

    Returns:
        The ``{pair_key: count}`` dict. Pairs that never co-occur are absent.
    """
    if counts is None:
        counts = {}
    for line in lines:
        # One substring test per keyword instead of two per pair; iterating
        # the survivors pairwise yields the pairs in keyword order.
        present = [word for word in keywords if word in line]
        for first in present:
            for second in present:
                key = pair_key(first, second)
                counts[key] = counts.get(key, 0) + 1
    return counts


def dedupe_pairs(counts):
    """Return the counts with mirrored and self pairs removed.

    Of the two orientations ``"A;B;"`` / ``"B;A;"`` only the one seen first
    in ``counts`` is kept; pairs of a word with itself are dropped. A message
    is printed for every entry that is skipped.

    Args:
        counts: ``{pair_key: count}`` dict as built by count_cooccurrences.

    Returns:
        A new dict holding one entry per unordered pair of distinct keywords.
    """
    unique = {}
    for key, count in counts.items():
        first, second = key.split(';')[:2]
        mirrored = pair_key(second, first)
        if mirrored in unique:
            print(mirrored + ' **已经在其中不在录入** ' + key)
        elif key == pair_key(second, second):
            print(key + ' **为叠词不计入统计')
        else:
            unique[key] = count
    return unique


def format_matrix(keywords, counts):
    """Render the co-occurrence matrix as tab-separated text.

    Each row is the row keyword followed by one count per column keyword;
    every field (including the last) is followed by a tab and every row ends
    with a newline. There is no header row.

    Args:
        keywords: Sequence of keywords, used for both rows and columns.
        counts: ``{pair_key: count}`` dict; missing pairs are written as 0.

    Returns:
        The whole matrix as a single string.
    """
    rows = []
    for row_word in keywords:
        fields = [row_word]
        # Look the pair up with the same "a;b;" key it was stored under;
        # pairs that never co-occurred are simply absent, hence the default.
        fields.extend(
            str(counts.get(pair_key(row_word, col_word), 0))
            for col_word in keywords
        )
        rows.append(''.join(field + '\t' for field in fields) + '\n')
    return ''.join(rows)


def main():
    """Read the keyword files, count co-occurrences and append the matrix."""
    counts = {}
    for k in range(INPUT_FILE_COUNT):
        with open(str(k) + "_keywords.txt", 'r', encoding='utf-8',
                  errors='ignore') as f:
            lines = f.read().strip().split("\n")
        count_cooccurrences(lines, KEYWORDS, counts)
        print(str(k) + ' 完成')

    # NOTE: the de-duplicated pair counts are only computed for their log
    # output; the export to "关键词共现次数(top10).txt" was already disabled
    # in the original script.
    dedupe_pairs(counts)

    # Append mode is kept from the original script, so re-running adds a
    # second copy of the matrix to an existing file.
    with open(MATRIX_PATH, "a", encoding='utf-8', errors='ignore') as f:
        f.write(format_matrix(KEYWORDS, counts))


if __name__ == "__main__":
    main()
作者:WangB
本文介绍了如何利用Python对文本数据进行共现统计分析,包括词频计算、共现矩阵构建以及相关性分析,帮助理解文本中词汇的关系。

552

被折叠的 条评论
为什么被折叠?



