| 
									
										
										
										
											2021-04-06 16:22:03 +08:00
										 |  |  |  | ########################################################################### | 
					
						
							|  |  |  |  | # Copyright 2019 (C) Hui Lan <hui.lan@cantab.net> | 
					
						
							|  |  |  |  | # Written permission must be obtained from the author for commercial uses. | 
					
						
							|  |  |  |  | ########################################################################### | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | import collections | 
					
						
							|  |  |  |  | import string | 
					
						
							|  |  |  |  | import operator | 
					
						
							|  |  |  |  | import os, sys # 引入模块sys,因为我要用里面的sys.argv列表中的信息来读取命令行参数。 | 
					
						
							|  |  |  |  | import pickle_idea | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def freq(fruit): | 
					
						
							|  |  |  |  |     '''
 | 
					
						
							|  |  |  |  |     功能: 把字符串转成列表。 目的是得到每个单词的频率。 | 
					
						
							|  |  |  |  |     输入: 字符串 | 
					
						
							|  |  |  |  |     输出: 列表, 列表里包含一组元组,每个元组包含单词与单词的频率。 比如 [('apple', 2), ('banana', 1)] | 
					
						
							|  |  |  |  |     注意事项: 首先要把字符串转成小写。原因是。。。 | 
					
						
							|  |  |  |  |     '''
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     result = [] | 
					
						
							|  |  |  |  |     fruit = fruit.lower() # 字母转小写 | 
					
						
							|  |  |  |  |     flst = fruit.split()  # 字符串转成list | 
					
						
							|  |  |  |  |     c = collections.Counter(flst) | 
					
						
							|  |  |  |  |     result = c.most_common() | 
					
						
							|  |  |  |  |     return result | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def youdao_link(s): # 有道链接 | 
					
						
							| 
									
										
										
										
											2022-06-06 19:42:26 +08:00
										 |  |  |  |     link = 'http://youdao.com/w/eng/' + s + '/#keyfrom=dict2.index' # 网址 | 
					
						
							| 
									
										
										
										
											2021-04-06 16:22:03 +08:00
										 |  |  |  |     return link | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-06-06 19:42:26 +08:00
										 |  |  |  | def file2str(fname): # 文件转字符 | 
					
						
							|  |  |  |  |     f = open(fname)  # 打开 | 
					
						
							|  |  |  |  |     s = f.read()     # 读取 | 
					
						
							|  |  |  |  |     f.close()        # 关闭 | 
					
						
							| 
									
										
										
										
											2021-04-06 16:22:03 +08:00
										 |  |  |  |     return s | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def remove_punctuation(s): # 这里是s是形参 (parameter)。函数被调用时才给s赋值。 | 
					
						
							| 
									
										
										
										
											2022-06-06 19:42:26 +08:00
										 |  |  |  |     special_characters = '_©~=+[]*&$%^@.,?!:;#()"“”—' # 把里面的字符都去掉 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-04-06 16:22:03 +08:00
										 |  |  |  |     for c in special_characters: | 
					
						
							| 
									
										
										
										
											2022-06-06 19:42:26 +08:00
										 |  |  |  |         s = s.replace(c, ' ') # 把所有符号都替换成空格,防止出现把 apple,apple 移掉逗号后变成 appleapple 情况 | 
					
						
							| 
									
										
										
										
											2021-04-06 16:22:03 +08:00
										 |  |  |  |     s = s.replace('--', ' ') | 
					
						
							|  |  |  |  |     s = s.strip() # 去除前后的空格 | 
					
						
							| 
									
										
										
										
											2022-06-06 19:42:26 +08:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  |     single_quote = '‘’\'' # 各种单引号单独处理 | 
					
						
							|  |  |  |  |     n, i = len(s), 0 | 
					
						
							|  |  |  |  |     t = ''  # 用来收集我需要保留的字符 | 
					
						
							|  |  |  |  |     while i < n:  # 只有单引号前后都有英文字符,才保留 | 
					
						
							|  |  |  |  |         if s[i] in single_quote: | 
					
						
							|  |  |  |  |             if i == 0 or i == n - 1 or s[i - 1] == ' ' or s[i + 1] == ' ': | 
					
						
							|  |  |  |  |                 i = i + 1 | 
					
						
							|  |  |  |  |                 continue # condition 1+2 | 
					
						
							|  |  |  |  |             if s[i + 1] == 's' and (i + 2 == n or s[i + 2] == ' '): | 
					
						
							|  |  |  |  |                 i = i + 2 | 
					
						
							|  |  |  |  |                 continue # condition 2 | 
					
						
							|  |  |  |  |             t += '\'' # condition 3, standardize quote | 
					
						
							|  |  |  |  |         else: | 
					
						
							|  |  |  |  |             t += s[i] | 
					
						
							|  |  |  |  |         i = i + 1 | 
					
						
							|  |  |  |  |     return t | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     '''
 | 
					
						
							|  |  |  |  |     单引号出现在文章中的情况: | 
					
						
							|  |  |  |  |     1、某些情况下作为双引号使用,引用段落 | 
					
						
							|  |  |  |  |         这种情况一般出现在词首或词尾 | 
					
						
							|  |  |  |  |         处理方式:直接去除 | 
					
						
							|  |  |  |  |     2、表示名词所有格 | 
					
						
							|  |  |  |  |         对于单数名词以's为后缀,对于复数名词以s'或'为后缀 | 
					
						
							|  |  |  |  |         处理方式:将'其后的部分去除 | 
					
						
							|  |  |  |  |     3、单词元音位置的缩写 | 
					
						
							|  |  |  |  |         最常见的有not->n't/is->'s/have->'ve这类 | 
					
						
							|  |  |  |  |         处理方式:保留 | 
					
						
							| 
									
										
										
										
											2021-04-06 16:22:03 +08:00
										 |  |  |  |      | 
					
						
							| 
									
										
										
										
											2022-06-06 19:42:26 +08:00
										 |  |  |  |     上述处理方式2/3两点可能产生一种冲突: | 
					
						
							|  |  |  |  |         某些单词元音缩写后恰好以's结尾 | 
					
						
							|  |  |  |  |         但考虑到用于学习英语的文章一般不会出现过于口语化的缩写单词 | 
					
						
							|  |  |  |  |         因此要么还是表所有格,要么就是is的缩写 | 
					
						
							|  |  |  |  |         故不考虑这种冲突情况 | 
					
						
							|  |  |  |  |     '''
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     '''
 | 
					
						
							|  |  |  |  |     以下是原本的代码 | 
					
						
							| 
									
										
										
										
											2021-04-06 16:22:03 +08:00
										 |  |  |  |     if '\'' in s: | 
					
						
							|  |  |  |  |         n = len(s) | 
					
						
							|  |  |  |  |         t = '' # 用来收集我需要保留的字符 | 
					
						
							|  |  |  |  |         for i in range(n): # 只有单引号前后都有英文字符,才保留 | 
					
						
							|  |  |  |  |             if s[i] == '\'': | 
					
						
							|  |  |  |  |                 i_is_ok = i - 1 >= 0 and i + 1 < n | 
					
						
							|  |  |  |  |                 if i_is_ok and s[i-1] in string.ascii_letters and s[i+1] in string.ascii_letters: | 
					
						
							|  |  |  |  |                     t += s[i] | 
					
						
							|  |  |  |  |             else: | 
					
						
							|  |  |  |  |                 t += s[i] | 
					
						
							|  |  |  |  |         return t | 
					
						
							|  |  |  |  |     else: | 
					
						
							|  |  |  |  |         return s | 
					
						
							| 
									
										
										
										
											2022-06-06 19:42:26 +08:00
										 |  |  |  |     '''
 | 
					
						
							| 
									
										
										
										
											2021-04-06 16:22:03 +08:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def sort_in_descending_order(lst):# 单词按频率降序排列 | 
					
						
							|  |  |  |  |     lst2 = sorted(lst, reverse=True, key=lambda x: (x[1], x[0])) | 
					
						
							|  |  |  |  |     return lst2 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def sort_in_ascending_order(lst):# 单词按频率降序排列 | 
					
						
							|  |  |  |  |     lst2 = sorted(lst, reverse=False, key=lambda x: (x[1], x[0])) | 
					
						
							|  |  |  |  |     return lst2 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def make_html_page(lst, fname): | 
					
						
							|  |  |  |  |     '''
 | 
					
						
							|  |  |  |  |     功能:把lst的信息存到fname中,以html格式。 | 
					
						
							|  |  |  |  |     '''
 | 
					
						
							| 
									
										
										
										
											2022-06-06 19:42:26 +08:00
										 |  |  |  |     result = '' | 
					
						
							|  |  |  |  |     id = 1 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     for word in lst: | 
					
						
							|  |  |  |  |         result += '<p>' | 
					
						
							|  |  |  |  |         result += '%d ' % id | 
					
						
							| 
									
										
										
										
											2022-06-13 19:40:18 +08:00
										 |  |  |  |         result += get_html_hyperlink(word[0]) | 
					
						
							| 
									
										
										
										
											2022-06-06 19:42:26 +08:00
										 |  |  |  |         result += ' (%d)' % word[1] | 
					
						
							|  |  |  |  |         result += '</p>' | 
					
						
							|  |  |  |  |         # result += '<p>%d <a href="%s">%s</a> (%d)</p>' % (count, youdao_link(x[0]), x[0], x[1]) | 
					
						
							|  |  |  |  |         id += 1 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-04-06 16:22:03 +08:00
										 |  |  |  |     f = open(fname, 'w') | 
					
						
							| 
									
										
										
										
											2022-06-06 19:42:26 +08:00
										 |  |  |  |     f.write(result) | 
					
						
							| 
									
										
										
										
											2021-04-06 16:22:03 +08:00
										 |  |  |  |     f.close() | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-06-11 09:56:54 +08:00
										 |  |  |  | def get_html_hyperlink(word): | 
					
						
							| 
									
										
										
										
											2022-06-06 19:42:26 +08:00
										 |  |  |  |     s = '<a href="' + youdao_link(word) + '">' + word + '</a>' | 
					
						
							|  |  |  |  |     return s | 
					
						
							| 
									
										
										
										
											2021-04-06 16:22:03 +08:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | ## main(程序入口) | 
					
						
							|  |  |  |  | if __name__ == '__main__': | 
					
						
							|  |  |  |  |     num = len(sys.argv) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     if num == 1: # 从键盘读入字符串 | 
					
						
							|  |  |  |  |         s = input() | 
					
						
							|  |  |  |  |     elif num == 2: # 从文件读入字符串 | 
					
						
							|  |  |  |  |         fname = sys.argv[1] | 
					
						
							|  |  |  |  |         s = file2str(fname) | 
					
						
							|  |  |  |  |     else: | 
					
						
							|  |  |  |  |         print('I can accept at most 2 arguments.') | 
					
						
							| 
									
										
										
										
											2022-06-06 19:42:26 +08:00
										 |  |  |  |         sys.exit() # 结束程序运行, 下面的代码不会被执行了。 | 
					
						
							| 
									
										
										
										
											2021-04-06 16:22:03 +08:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  |     s = remove_punctuation(s) # 这里是s是实参(argument),里面有值 | 
					
						
							|  |  |  |  |     L = freq(s) | 
					
						
							|  |  |  |  |     for x in sort_in_descending_order(L): | 
					
						
							| 
									
										
										
										
											2022-06-06 19:42:26 +08:00
										 |  |  |  |         print('%s\t%d\t%s' % (x[0], x[1], youdao_link(x[0]))) # 函数导出 | 
					
						
							| 
									
										
										
										
											2021-04-06 16:22:03 +08:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  |     # 把频率的结果放result.html中 | 
					
						
							|  |  |  |  |     make_html_page(sort_in_descending_order(L), 'result.html')  | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     print('\nHistory:\n') | 
					
						
							|  |  |  |  |     if os.path.exists('frequency.p'): | 
					
						
							|  |  |  |  |         d = pickle_idea.load_record('frequency.p') | 
					
						
							|  |  |  |  |     else: | 
					
						
							|  |  |  |  |         d = {} | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     print(sort_in_descending_order(pickle_idea.dict2lst(d))) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     # 合并频率 | 
					
						
							|  |  |  |  |     lst_history = pickle_idea.dict2lst(d) | 
					
						
							|  |  |  |  |     d = pickle_idea.merge_frequency(L, lst_history) | 
					
						
							|  |  |  |  |     pickle_idea.save_frequency_to_pickle(d, 'frequency.p') | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 |