forked from mrlan/EnglishPal
				
			新建了 get_difficulty_level_for_words_and_tests 函数,专门用于给词库单词评级,返回一个字典 d2(例如 {'apple': 4, 'banana': 4, ...});修改了 get_difficulty_level 函数,用于给用户的单词评级,分三种情况:1. 以原型出现在词库中,则无需操作,因为该词已经在 d2 中;2. 词根与词库中某单词的词根相同,视为与该单词难度相同;3. 以上两种情况之外的词视为不在词库中,按照 difficulty_level_from_frequency 函数的方法评定难度。
目前还未进行测试。SPM2023-PR44-YuGaoxiang
							parent
							
								
									43c719b6b2
								
							
						
					
					
						commit
						ce9e18e3fe
					
				|  | @ -8,6 +8,7 @@ | ||||||
| import pickle | import pickle | ||||||
| import math | import math | ||||||
| from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order | from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order | ||||||
|  | import snowballstemmer | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def load_record(pickle_fname): | def load_record(pickle_fname): | ||||||
|  | @ -18,6 +19,12 @@ def load_record(pickle_fname): | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def difficulty_level_from_frequency(word, d): | def difficulty_level_from_frequency(word, d): | ||||||
|  |     """ | ||||||
|  |     根据单词的频率进行难度的评级 | ||||||
|  |     :param word: | ||||||
|  |     :param d: | ||||||
|  |     :return: | ||||||
|  |     """ | ||||||
|     level = 1 |     level = 1 | ||||||
|     if not word in d: |     if not word in d: | ||||||
|         return level |         return level | ||||||
|  | @ -30,26 +37,44 @@ def difficulty_level_from_frequency(word, d): | ||||||
|     return level |     return level | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
def get_difficulty_level_for_words_and_tests(dic):
    """
    Rate the words of the word bank by the tests they are tagged with.

    :param dic: dict holding the words of the word-bank pickle file; each
        word maps to its test tags (e.g. CET4, CET6, BBC)
    :return: dict mapping word -> difficulty level, e.g. {'apple': 4, ...}.
        Words carrying none of the known tags are left out.
    """
    d = {}
    for word, tests in dic.items():
        # The tags are checked from the lowest level to the highest, so a
        # word carrying several tags gets the lowest applicable level.
        if 'CET4' in tests:
            d[word] = 4  # CET4 word has level 4
        elif 'CET6' in tests:
            d[word] = 6
        elif 'BBC' in tests:
            d[word] = 8
        # No known tag: the word stays unrated. (A debugging print of d[word]
        # used to sit here and raised KeyError for exactly these words.)

    return d  # {'apple': 4, ...}
|  | 
 | ||||||
def get_difficulty_level(d1, d2):
    """
    Rate the word-bank words and the user's words.

    A user word is rated in one of three ways:
    1. it appears verbatim in the rated word bank: the word-bank level is kept;
    2. its stem equals the stem of a rated word-bank word: it gets that
       word's level;
    3. otherwise it is treated as being outside the word bank and rated by
       difficulty_level_from_frequency.

    :param d1: the words the user does not know; also handed to
        difficulty_level_from_frequency as its lookup table (presumably
        word -> frequency -- confirm against the caller)
    :param d2: the word bank of about 27000 tagged words (word -> test tags)
    :return: dict mapping word -> difficulty level, covering the rated
        word-bank words plus every word of d1
    """
    # Tag-based rating; only applies to words in the word bank.
    levels = get_difficulty_level_for_words_and_tests(d2)
    stemmer = snowballstemmer.stemmer('english')

    # Index the word bank by stem once, before any user word is added, so
    # each user word needs a single lookup instead of a scan over the whole
    # word bank. When several word-bank words share a stem, the first one
    # encountered decides the level.
    level_by_stem = {}
    for bank_word, level in levels.items():
        level_by_stem.setdefault(stemmer.stemWord(bank_word), level)

    for word in d1:  # word is one the user does not know
        if word in levels:
            # Appears verbatim in the word bank, which is already rated.
            continue
        stem_level = level_by_stem.get(stemmer.stemWord(word))
        if stem_level is not None:
            # Same stem as a word-bank word: treated as equally difficult.
            levels[word] = stem_level
        else:
            # Not in the word bank: rate by frequency.
            levels[word] = difficulty_level_from_frequency(word, d1)

    return levels
| 
 | 
 | ||||||
|          |          | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue