###########################################################################
# Copyright 2019 (C) Hui Lan
# Written permission must be obtained from the author for commercial uses.
###########################################################################
"""Count word frequencies in a piece of English text.

Run as a script, the text is read from standard input (no argument) or from
a file (one argument).  The per-word counts are printed, written to
``result.html`` and merged into the history kept in ``frequency.p``.
"""

import collections
import string
import operator
import os
import sys  # sys.argv supplies the command-line arguments.
import pickle_idea


# Every character listed here is replaced by a single space (not deleted), so
# that "apple,apple" becomes "apple apple" rather than "appleapple".
_SPECIAL_CHARACTERS = '\\_©~<=>+-/[]*&$%^@.,?!:;#()"“”—‘’{}|'
_PUNCTUATION_TO_SPACE = str.maketrans(
    _SPECIAL_CHARACTERS, ' ' * len(_SPECIAL_CHARACTERS))

# HTML numeric character reference for the apostrophe.
_HTML_APOSTROPHE = '&#39;'


def freq(fruit):
    '''
    Count how often each word occurs in a string.

    The string is lower-cased first, so words that differ only in case are
    counted together, and is then split on whitespace.

    Input:  a string.
    Output: a list of (word, count) tuples ordered from most to least
            common, e.g. [('apple', 2), ('banana', 1)].
    '''
    words = fruit.lower().split()
    return collections.Counter(words).most_common()


def youdao_link(s):
    '''Return the Youdao dictionary URL for the word s.'''
    return 'http://youdao.com/w/eng/' + s + '/#keyfrom=dict2.index'


def file2str(fname):
    '''Return the whole content of the text file fname as one string.'''
    # The with-block closes the file even if read() raises.
    with open(fname) as f:
        return f.read()


def str2chararray(str):
    '''Return the characters of the given string as a list.'''
    # The parameter keeps its original name (which shadows the builtin) so
    # that existing keyword-argument callers keep working.
    return list(str)


def remove_punctuation(s):
    '''
    Replace punctuation in s by spaces and drop stray apostrophes.

    Steps:
      1. The HTML entity "&#39;" is decoded to an apostrophe.  This has to
         happen before step 2, which would otherwise tear the entity apart.
      2. Every character in _SPECIAL_CHARACTERS is replaced by a space.
      3. Leading and trailing whitespace is stripped.
      4. An apostrophe is kept only when the characters immediately before
         and after it are both ASCII letters (as in "don't"); any other
         apostrophe is removed.

    Input:  a string.
    Output: the cleaned string.
    '''
    s = s.replace(_HTML_APOSTROPHE, '\'')
    s = s.translate(_PUNCTUATION_TO_SPACE)
    s = s.strip()

    if '\'' not in s:
        return s

    letters = string.ascii_letters
    last = len(s) - 1
    kept = []  # the characters we decide to keep
    for i, ch in enumerate(s):
        if ch == '\'':
            has_neighbours = 0 < i < last
            if not (has_neighbours
                    and s[i - 1] in letters
                    and s[i + 1] in letters):
                continue
        kept.append(ch)
    return ''.join(kept)


def sort_in_descending_order(lst):
    '''
    Return a new list of the (word, count) items of lst, highest count
    first; items with equal counts are ordered by word, descending.
    '''
    return sorted(lst, reverse=True, key=operator.itemgetter(1, 0))


def sort_in_ascending_order(lst):
    '''
    Return a new list of the (word, count) items of lst, lowest count
    first; items with equal counts are ordered by word, ascending.
    '''
    return sorted(lst, reverse=False, key=operator.itemgetter(1, 0))


def make_html_page(lst, fname):
    '''
    Write the (word, count) items of lst to the file fname as HTML.

    Each item becomes one paragraph holding its 1-based position, the word
    as a link to its Youdao dictionary page, and the count in parentheses.
    '''
    # One placeholder per argument: position, link target, link text, count.
    # NOTE(review): the markup was reconstructed from the four arguments; the
    # original tags were lost -- confirm against the intended page layout.
    row_template = '<p>%d <a href="%s">%s</a> (%d)</p>'
    rows = [
        row_template % (position, youdao_link(x[0]), x[0], x[1])
        for position, x in enumerate(lst, 1)
    ]
    with open(fname, 'w') as f:
        f.write(''.join(rows))


def main():
    '''
    Program entry point.

    With no command-line argument the text is read from the keyboard; with
    one argument it is read from the file of that name.
    '''
    num = len(sys.argv)
    if num == 1:  # read the text from the keyboard
        s = input()
    elif num == 2:  # read the text from a file
        s = file2str(sys.argv[1])
    else:
        print('I can accept at most 2 arguments.')
        sys.exit()  # stop here; nothing below is executed

    s = remove_punctuation(s)
    L = freq(s)
    ranked = sort_in_descending_order(L)
    for x in ranked:
        print('%s\t%d\t%s' % (x[0], x[1], youdao_link(x[0])))

    # Store the frequencies in result.html as well.
    make_html_page(ranked, 'result.html')

    print('\nHistory:\n')
    if os.path.exists('frequency.p'):
        # NOTE(review): presumably this unpickles frequency.p -- only load
        # files from a trusted source; confirm in pickle_idea.
        d = pickle_idea.load_record('frequency.p')
    else:
        d = {}

    print(sort_in_descending_order(pickle_idea.dict2lst(d)))

    # Merge the new frequencies with the stored history and save the result.
    lst_history = pickle_idea.dict2lst(d)
    d = pickle_idea.merge_frequency(L, lst_history)
    pickle_idea.save_frequency_to_pickle(d, 'frequency.p')


if __name__ == '__main__':
    main()