2021-04-06 16:22:03 +08:00
|
|
|
|
###########################################################################
|
|
|
|
|
# Copyright 2019 (C) Hui Lan <hui.lan@cantab.net>
|
|
|
|
|
# Written permission must be obtained from the author for commercial uses.
|
|
|
|
|
###########################################################################
|
|
|
|
|
|
|
|
|
|
import collections
|
|
|
|
|
import string
|
|
|
|
|
import operator
|
|
|
|
|
import os, sys # 引入模块sys,因为我要用里面的sys.argv列表中的信息来读取命令行参数。
|
|
|
|
|
import pickle_idea
|
|
|
|
|
|
|
|
|
|
def freq(fruit):
    '''
    Count how often each whitespace-separated word occurs in a string.

    The text is lower-cased before counting, so spellings that differ only
    in case (e.g. 'Apple' and 'apple') are counted as the same word.

    Input: a string.
    Output: a list of (word, count) tuples, most frequent first,
            e.g. [('apple', 2), ('banana', 1)]. An empty string gives [].
    '''
    words = fruit.lower().split()  # lower-case, then split on whitespace
    return collections.Counter(words).most_common()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def youdao_link(s):  # youdao.com lookup link
    '''
    Build the youdao.com URL for the word s.

    Input: the word, as a string (inserted into the URL unchanged).
    Output: the URL string.
    '''
    prefix = 'http://youdao.com/w/eng/'
    suffix = '/#keyfrom=dict2.index'
    return ''.join((prefix, s, suffix))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def file2str(fname):  # read a whole file into one string
    '''
    Return the entire content of the text file fname as a string.

    The file is opened in text mode with the platform default encoding.
    The with-block guarantees the file is closed even if reading fails.
    Errors raised by open() or read() propagate to the caller.
    '''
    with open(fname) as f:
        return f.read()
|
|
|
|
|
|
2023-05-14 21:20:16 +08:00
|
|
|
|
|
|
|
|
|
def str2chararray(str):  # string -> list of characters
    '''
    Return a new list holding the characters of str, in order.

    Example: 'abc' -> ['a', 'b', 'c']; '' -> [].

    NOTE: the parameter name shadows the builtin str; it is kept unchanged
    so that existing keyword callers continue to work.
    '''
    return list(str)
|
|
|
|
|
|
|
|
|
|
|
2021-04-06 16:22:03 +08:00
|
|
|
|
def remove_punctuation(s):  # s is the formal parameter; it gets its value when the function is called
    '''
    Replace punctuation in s with spaces and drop stray apostrophes.

    Steps, in order:
      1. The HTML entity &#39; is decoded to an apostrophe.
      2. Every character listed in special_characters is replaced by one
         space. A space (not the empty string) is used so that
         "apple,apple" becomes "apple apple" rather than "appleapple".
      3. Leading and trailing whitespace is stripped.
      4. An apostrophe is kept only when the characters immediately before
         and after it are both ASCII letters (as in "don't"); every other
         apostrophe is removed.

    Input: a string.
    Output: the cleaned string. Runs of spaces inside the text are kept.
    '''
    # Characters that are turned into spaces. Includes backslash and underscore.
    special_characters = '\\_©~<=>+-/[]*&$%^@.,?!:;#()"“”—‘’{}|'

    # Decode the entity first: '&', '#' and ';' are all in special_characters,
    # so after the character pass the entity could no longer be recognised.
    s = s.replace('&#39;', '\'')

    # One pass over the string. '$' and '-' are in the set, so tokens such as
    # '$lt', '$gt' or '--' need no separate replacement step.
    s = s.translate(str.maketrans(dict.fromkeys(special_characters, ' ')))
    s = s.strip()  # remove leading/trailing whitespace

    if '\'' not in s:
        return s

    letters = string.ascii_letters
    last = len(s) - 1
    # Keep every non-apostrophe character; keep an apostrophe only if it has
    # an ASCII letter on both sides (which also rules out positions 0 and last).
    kept = [
        ch for i, ch in enumerate(s)
        if ch != '\'' or (0 < i < last and s[i - 1] in letters and s[i + 1] in letters)
    ]
    return ''.join(kept)
|
|
|
|
|
|
|
|
|
|
|
2023-04-27 16:00:46 +08:00
|
|
|
|
|
2021-04-06 16:22:03 +08:00
|
|
|
|
def sort_in_descending_order(lst):  # highest frequency first
    '''
    Return a new list with the items of lst sorted in descending order.

    Items are compared by (item[1], item[0]): for (word, frequency) pairs
    that means by frequency, with ties broken by the word itself. The
    input is not modified.
    '''
    ranked = list(lst)
    ranked.sort(key=operator.itemgetter(1, 0), reverse=True)
    return ranked
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def sort_in_ascending_order(lst):  # lowest frequency first
    '''
    Return a new list with the items of lst sorted in ascending order.

    Items are compared by (item[1], item[0]): for (word, frequency) pairs
    that means by frequency, with ties broken by the word itself. The
    input is not modified.
    '''
    by_freq_then_word = operator.itemgetter(1, 0)
    return sorted(lst, key=by_freq_then_word)
|
|
|
|
|
|
|
|
|
|
|
2023-03-21 18:57:00 +08:00
|
|
|
|
def make_html_page(lst, fname):
    '''
    Write the items of lst to the file fname as HTML paragraphs.

    Each item x produces one paragraph of the form
        <p>RANK <a href="LINK">x[0]</a> (x[1])</p>
    where RANK counts from 1 and LINK is youdao_link(x[0]). The paragraphs
    are written back to back with no separator; an empty lst gives an
    empty file. Any existing file at fname is overwritten.

    NOTE(review): x[0] is inserted into the markup without HTML escaping.
    '''
    paragraphs = [
        '<p>%d <a href="%s">%s</a> (%d)</p>' % (rank, youdao_link(x[0]), x[0], x[1])
        for rank, x in enumerate(lst, start=1)
    ]
    # The with-block closes the file even if the write fails.
    with open(fname, 'w') as f:
        f.write(''.join(paragraphs))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
## main(程序入口)
|
|
|
|
|
if __name__ == '__main__':
    argc = len(sys.argv)
    if argc not in (1, 2):
        print('I can accept at most 2 arguments.')
        sys.exit()  # stop here; nothing below is executed

    # No argument: read the text from the keyboard.
    # One argument: treat it as a file name and read that file.
    if argc == 1:
        text = input()
    else:
        text = file2str(sys.argv[1])

    text = remove_punctuation(text)  # text is the actual argument here and carries a value
    L = freq(text)

    # Print one tab-separated line per word: word, frequency, lookup link.
    ranked = sort_in_descending_order(L)
    for entry in ranked:
        print('%s\t%d\t%s' % (entry[0], entry[1], youdao_link(entry[0])))

    # Store the frequency result in result.html.
    make_html_page(ranked, 'result.html')

    print('\nHistory:\n')
    # Load the previously saved record if one exists, otherwise start empty.
    # NOTE(review): presumably load_record returns a dict — confirm in pickle_idea.
    history = pickle_idea.load_record('frequency.p') if os.path.exists('frequency.p') else {}
    print(sort_in_descending_order(pickle_idea.dict2lst(history)))

    # Merge this run's frequencies with the history and save the result.
    merged = pickle_idea.merge_frequency(L, pickle_idea.dict2lst(history))
    pickle_idea.save_frequency_to_pickle(merged, 'frequency.p')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|