summary refs log tree commit diff
path: root/app/wordfreqCMD.py
diff options
context:
space:
mode:
author Hui Lan <lanhui@zjnu.edu.cn> 2019-11-01 20:51:19 +0800
committer Hui Lan <lanhui@zjnu.edu.cn> 2019-11-01 20:51:19 +0800
commit a8f6a99bb3d3dba85705ed7df93145c28168d659 (patch)
tree b21a324eb857b0b236ea76f22692e18420258e45 /app/wordfreqCMD.py
englishpal: first commit
Diffstat (limited to 'app/wordfreqCMD.py')
-rw-r--r-- app/wordfreqCMD.py 118
1 files changed, 118 insertions, 0 deletions
diff --git a/app/wordfreqCMD.py b/app/wordfreqCMD.py
new file mode 100644
index 0000000..e45cf78
--- /dev/null
+++ b/app/wordfreqCMD.py
@@ -0,0 +1,118 @@
+import collections
+import string
+import operator
+import os, sys # 引入模块sys,因为我要用里面的sys.argv列表中的信息来读取命令行参数。
+import pickle_idea
+
def freq(fruit):
    """Count how often each whitespace-separated word occurs in a string.

    Input:  a string. It is lower-cased before counting, so 'Apple' and
            'apple' are tallied as the same word.
    Output: a list of (word, count) tuples in the order produced by
            collections.Counter.most_common(), e.g.
            [('apple', 2), ('banana', 1)].
    """
    words = fruit.lower().split()
    return collections.Counter(words).most_common()
+
+
def youdao_link(s):
    """Return the Youdao dictionary URL for the word *s*.

    The word is percent-encoded before being placed in the URL path, so a
    token containing '/', '<', '>' or non-ASCII letters cannot break the
    link. Letters, digits, '-', '_', '.', '~' and the apostrophe are left
    untouched, so ordinary English words and contractions produce exactly
    the same URL as before.
    """
    from urllib.parse import quote  # local import: keeps this block self-contained

    # safe="'" keeps contractions such as "don't" readable; '/' is NOT safe
    # here because the word is a single path segment.
    word = quote(s, safe="'")
    return 'http://youdao.com/w/eng/' + word + '/#keyfrom=dict2.index'
+
+
def file2str(fname):
    """Read the text file *fname* and return its whole content as one string.

    The file is opened in text mode with the platform's default encoding
    (unchanged from the previous behaviour). The ``with`` block guarantees
    the handle is closed even if reading raises.
    """
    with open(fname) as f:
        return f.read()
+
+
def remove_punctuation(s):
    """Return *s* with punctuation neutralised so it can be split into words.

    Steps, in order:
      1. Every character listed in ``special_characters`` becomes a space
         (a space rather than nothing, so 'apple,apple' turns into
         'apple apple' instead of 'appleapple').
      2. Each '--' becomes a single space.
      3. Leading and trailing whitespace is stripped.
      4. A straight apostrophe is kept only when the characters directly
         before and after it are both ASCII letters (as in "don't"); any
         other apostrophe is deleted without inserting a space.
    """
    special_characters = '_©~=+[]*&$%^@.,?!:;#()"“”—‘’'
    to_space = str.maketrans(dict.fromkeys(special_characters, ' '))
    cleaned = s.translate(to_space).replace('--', ' ').strip()

    if "'" not in cleaned:
        return cleaned

    last = len(cleaned) - 1
    kept = []  # characters that survive the apostrophe filter
    for pos, ch in enumerate(cleaned):
        if ch != "'":
            kept.append(ch)
            continue
        # An apostrophe at either end has no neighbour on one side, so it is dropped.
        has_both_neighbours = 0 < pos < last
        if (has_both_neighbours
                and cleaned[pos - 1] in string.ascii_letters
                and cleaned[pos + 1] in string.ascii_letters):
            kept.append(ch)
    return ''.join(kept)
+
+
def sort_in_descending_order(lst):
    """Return a new list of the items of *lst* sorted from high to low.

    Items are compared by (item[1], item[0]) -- frequency first, then the
    word itself -- and the whole ordering is reversed. *lst* is not modified.
    """
    by_count_then_word = operator.itemgetter(1, 0)
    return sorted(lst, key=by_count_then_word, reverse=True)
+
+
def sort_in_ascending_order(lst):
    """Return a new list of the items of *lst* sorted from low to high.

    Items are compared by (item[1], item[0]) -- frequency first, then the
    word itself -- in ascending order. *lst* is not modified.
    """
    by_count_then_word = operator.itemgetter(1, 0)
    return sorted(lst, key=by_count_then_word, reverse=False)
+
+
def make_html_page(lst, fname):
    """Write the entries of *lst* to the file *fname* as HTML paragraphs.

    Each entry is read as entry[0] (the word) and entry[1] (its count,
    formatted with %d). One ``<p>`` element is produced per entry, holding a
    1-based running number, the word linked to youdao_link(word), and the
    count in parentheses. The paragraphs are concatenated without separators.

    The word and the link are HTML-escaped so that a token containing '&',
    '<', '>' (or a '"' in the link) cannot break the generated markup.
    The file is opened in text mode with the platform's default encoding and
    is overwritten if it already exists.
    """
    import html  # local import: keeps this block self-contained

    paragraphs = []
    for rank, entry in enumerate(lst, start=1):
        word, count = entry[0], entry[1]
        # <a href="">word</a>
        href = html.escape(youdao_link(word), quote=True)  # attribute context
        label = html.escape(word, quote=False)             # element text context
        paragraphs.append('<p>%d <a href="%s">%s</a> (%d)</p>' % (rank, href, label, count))

    # 'with' closes the file even if the write fails.
    with open(fname, 'w') as f:
        f.write(''.join(paragraphs))
+
+
## main (program entry point)
if __name__ == '__main__':
    argc = len(sys.argv)

    # Guard clause: only "script" or "script <file>" is accepted.
    if argc not in (1, 2):
        print('I can accept at most 2 arguments.')
        sys.exit()  # stop here; nothing below is executed

    if argc == 1:
        text = input()  # no file given: read one line from the keyboard
    else:
        text = file2str(sys.argv[1])  # read the whole file named on the command line

    current = freq(remove_punctuation(text))

    # Sort once and reuse the result for both the console and the HTML report.
    ranked = sort_in_descending_order(current)
    for entry in ranked:
        print('%s\t%d\t%s' % (entry[0], entry[1], youdao_link(entry[0])))

    # Store the frequency table in result.html.
    make_html_page(ranked, 'result.html')

    print('\nHistory:\n')
    history = pickle_idea.load_record('frequency.p') if os.path.exists('frequency.p') else {}

    print(sort_in_descending_order(pickle_idea.dict2lst(history)))

    # Merge this run's frequencies into the stored history and save it back.
    history_lst = pickle_idea.dict2lst(history)
    merged = pickle_idea.merge_frequency(current, history_lst)
    pickle_idea.save_frequency_to_pickle(merged, 'frequency.p')
+
+
+