EnglishPal/wordfreqCMD.py

202 lines
6.8 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

###########################################################################
# Copyright 2019 (C) Hui Lan <hui.lan@cantab.net>
# Written permission must be obtained from the author for commercial uses.
###########################################################################
import collections
import html
import string
import operator
import os, sys # 引入模块sys因为我要用里面的sys.argv列表中的信息来读取命令行参数。
import pickle_idea
import pickle
from datetime import datetime
from pickle_idea2 import load_record, save_frequency_to_pickle, lst2dict, dict2lst
def map_percentages_to_levels(percentages):
    """Convert a difficulty-percentage distribution into normalized weights.

    Each difficulty level k (typically 3..8) gets the raw weight
    ``(10 - k) * percentages[k]`` — easier levels weigh more — and the
    weights are then normalized so they sum to 1.

    Args:
        percentages: dict mapping difficulty level (int) to the proportion
            of words at that level.

    Returns:
        dict mapping each difficulty level to its normalized weight.

    Raises:
        ZeroDivisionError: if every raw weight is zero (e.g. empty input).
    """
    weights = {}
    total = 0  # running sum of raw weights; renamed from `sum`, which shadowed the builtin
    for level, proportion in percentages.items():
        raw = (10 - level) * proportion
        weights[level] = raw
        total += raw
    # Normalize so the weights sum to 1.
    for level in weights:
        weights[level] /= total
    return weights
def freq(fruit):
    """Count word frequencies in a string.

    The text is lower-cased first so that 'Apple' and 'apple' count as
    the same word; whitespace splitting defines word boundaries.

    Args:
        fruit: input string.

    Returns:
        List of (word, count) tuples, most frequent first, e.g.
        [('apple', 2), ('banana', 1)].
    """
    tokens = fruit.lower().split()
    return collections.Counter(tokens).most_common()
def youdao_link(s):
    """Return the Youdao online-dictionary URL for the word *s*."""
    return ''.join(['http://youdao.com/w/eng/', s, '/#keyfrom=dict2.index'])
def file2str(fname):
    """Read the whole file *fname* and return its contents as a string.

    Uses a context manager so the handle is closed even if read() raises,
    and opens as UTF-8 explicitly instead of relying on the locale default
    (the original `open`/`read`/`close` sequence leaked the handle on error).
    """
    with open(fname, encoding='utf-8') as f:
        return f.read()
def remove_punctuation(s):
    """Blank out punctuation in *s*, keeping only apostrophes that sit
    between two ASCII letters (e.g. the one in "don't").

    HTML entities are decoded first (e.g. '&lt;' becomes '<') so they get
    stripped along with literal punctuation. Each punctuation character is
    replaced by a space — not removed — so "apple,apple" stays two words.
    """
    # ASCII and full-width CJK punctuation to blank out.
    punct = '\_©~<=>+/[]*&$%^@.,?!:;#()"“”—‘’{}|,。?!¥……()、《》【】:;·'
    s = html.unescape(s)
    for ch in punct:
        s = s.replace(ch, ' ')
    s = s.replace('--', ' ')
    s = s.strip()
    if "'" not in s:
        return s
    # Keep an apostrophe only when both neighbours are ASCII letters.
    last = len(s) - 1
    kept = []
    for i, ch in enumerate(s):
        if ch != "'":
            kept.append(ch)
        elif 0 < i < last and s[i - 1] in string.ascii_letters and s[i + 1] in string.ascii_letters:
            kept.append(ch)
    return ''.join(kept)
def sort_in_descending_order(lst):
    """Sort (word, count) pairs by count, highest first; ties broken by word
    (reverse-alphabetical, since the whole key is reversed)."""
    return sorted(lst, key=lambda pair: (pair[1], pair[0]), reverse=True)
def sort_in_ascending_order(lst):
    """Sort (word, count) pairs by count, lowest first; ties broken
    alphabetically by word. (The original comment said "descending" —
    the sort here is ascending.)"""
    return sorted(lst, key=lambda pair: (pair[1], pair[0]))
def make_html_page(lst, fname):
    """Write the (word, count) pairs in *lst* to *fname* as an HTML page.

    Each entry becomes a numbered paragraph linking the word to its Youdao
    dictionary page. Only called from this module's main block.

    Args:
        lst: list of (word, count) tuples, already in display order.
        fname: output file path.
    """
    parts = []
    for rank, (word, count) in enumerate(lst, start=1):
        parts.append('<p>%d <a href="%s">%s</a> (%d)</p>'
                     % (rank, youdao_link(word), word, count))
    # join + single write avoids the quadratic `s += ...` build-up;
    # the context manager guarantees the file is closed, and the
    # encoding is pinned to UTF-8 instead of the locale default.
    with open(fname, 'w', encoding='utf-8') as f:
        f.write(''.join(parts))
class WordFreq:
    """Tracks cumulative word frequencies across runs via a pickle file.

    Words are stored as ``{word: [timestamp, ...]}`` where each timestamp
    ('%Y%m%d%H%M') marks one occurrence; the list length is the all-time
    count.

    NOTE(review): the method bodies below contain "... existing code ..."
    placeholders, and process_file() reads ``self.freq`` which is never
    assigned anywhere in the visible code — presumably the elided code
    builds it. Confirm before relying on this class.
    """
    def __init__(self):
        # Path of the pickle file holding the cumulative record.
        self.pickle_file = 'frequency.p' # Add this to store cumulative data
    def process_file(self, filename):
        """Fold the current run's word counts into the cumulative pickle.

        Args:
            filename: input file path (used by the elided processing code).
        """
        # ... existing word processing code ...
        # Convert current word frequencies to timestamp format
        current_words = {}
        timestamp = datetime.now().strftime('%Y%m%d%H%M')
        for word, freq in self.freq.items():
            current_words[word] = [timestamp] * freq # Create list of timestamps for each occurrence
        # Load existing cumulative data; start fresh if the pickle is
        # missing or truncated.
        try:
            cumulative_data = load_record(self.pickle_file)
        except (FileNotFoundError, EOFError):
            cumulative_data = {}
        # Merge current words with historical data
        for word, timestamps in current_words.items():
            if word in cumulative_data:
                cumulative_data[word].extend(timestamps)
            else:
                cumulative_data[word] = timestamps
        # Save updated data
        save_frequency_to_pickle(cumulative_data, self.pickle_file)
    def show_results(self):
        """Print the top-20 all-time word frequencies from the pickle."""
        # ... existing code ...
        # Add cumulative frequency display
        print("\nCumulative Frequencies (all-time):")
        try:
            cumulative_data = load_record(self.pickle_file)
            # Sort by cumulative frequency (length of timestamp list)
            sorted_words = sorted(cumulative_data.items(),
                                  key=lambda x: len(x[1]),
                                  reverse=True)
            for word, timestamps in sorted_words[:20]: # Show top 20
                print(f"{word}: {len(timestamps)} times")
        except (FileNotFoundError, EOFError):
            print("No cumulative data available yet")
## main entry point
if __name__ == '__main__':
    argc = len(sys.argv)
    if argc == 1:
        # No file given: read one line of text from stdin.
        s = input()
    elif argc == 2:
        # Read the text from the file named on the command line.
        s = file2str(sys.argv[1])
    else:
        print('I can accept at most 2 arguments.')
        sys.exit()  # stop here; nothing below runs
    s = remove_punctuation(s)  # s now holds the cleaned text
    L = freq(s)
    # Print word / frequency / dictionary link, most frequent first.
    for word, count in sort_in_descending_order(L):
        print('%s\t%d\t%s' % (word, count, youdao_link(word)))
    # Also save the ranked list as an HTML page.
    make_html_page(sort_in_descending_order(L), 'result.html')
    print('\nHistory:\n')
    if os.path.exists('frequency.p'):
        d = pickle_idea.load_record('frequency.p')
    else:
        d = {}
    print(sort_in_descending_order(pickle_idea.dict2lst(d)))
    # Merge this run's counts into the history and persist it.
    history_lst = pickle_idea.dict2lst(d)
    d = pickle_idea.merge_frequency(L, history_lst)
    pickle_idea.save_frequency_to_pickle(d, 'frequency.p')