Compare commits

..

2 Commits

Author SHA1 Message Date
包月琳 f40a968a17 删除 'Bug476-ZhangWeiHao-BaoYuelin' 2023-05-10 18:59:30 +08:00
包月琳 59a1fe607a 添加 'Bug476-ZhangWeiHao-BaoYuelin' 2023-05-10 18:55:48 +08:00
1 changed files with 18 additions and 43 deletions

View File

@@ -8,7 +8,6 @@
import pickle import pickle
import math import math
from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order
import snowballstemmer
def load_record(pickle_fname): def load_record(pickle_fname):
@@ -19,12 +18,6 @@ def load_record(pickle_fname):
def difficulty_level_from_frequency(word, d): def difficulty_level_from_frequency(word, d):
"""
根据单词的频率进行难度的评级
:param word:
:param d:
:return:
"""
level = 1 level = 1
if not word in d: if not word in d:
return level return level
@@ -37,44 +30,26 @@ def difficulty_level_from_frequency(word, d):
return level return level
def get_difficulty_level_for_words_and_tests(dic):
    """Rate the difficulty of the words stored in the word bank.

    Each word is rated from the test types attached to it; the first
    matching tag wins, checked in the order CET4, CET6, BBC.

    :param dic: dictionary loaded from the word-bank pickle file, mapping
        each word to its test types (e.g. CET4, CET6, BBC).
    :return: dictionary mapping word -> level, e.g. {'apple': 4, ...}.
        Words carrying none of the three tags are left out.
    """
    d = {}
    for k in dic:
        tags = dic[k]
        if 'CET4' in tags:
            d[k] = 4  # CET4 word has level 4
        elif 'CET6' in tags:
            d[k] = 6
        elif 'BBC' in tags:
            d[k] = 8
        else:
            # No recognised tag: nothing to rate. Skipping here also keeps
            # the print below from raising KeyError on d[k].
            continue
        print(k, d[k])
    return d  # {'apple': 4, ...}
def get_difficulty_level(d1, d2):
    """Assign a difficulty level to every word that appears in d1 or d2.

    :param d1: dictionary holding the frequency of each word; passed to
        difficulty_level_from_frequency for the frequency-based rating.
    :param d2: dictionary holding the test types (e.g. CET4, CET6, BBC)
        of each word.
    :return: dictionary mapping word -> difficulty level. A word found in
        d2 with none of the three tags receives no entry.
    """
    d = {}
    # The union of both key sets covers all words exactly once.
    for k in set(d1) | set(d2):
        if k in d2:
            tags = d2[k]
            if 'CET4' in tags:
                d[k] = 4  # CET4 word has level 4
            elif 'CET6' in tags:
                d[k] = 6
            elif 'BBC' in tags:
                d[k] = 8
                # BBC could contain easy words that are not in CET4 or
                # CET6, so a fixed level is not reasonable: recompute from
                # frequency and keep the lower value.
                # NOTE(review): indentation was lost in the source; this
                # check is placed under the BBC branch because its comment
                # talks about BBC words only - confirm.
                if k in d1:
                    d[k] = min(difficulty_level_from_frequency(k, d1), d[k])
        elif k in d1:
            # Word is not in the tagged word bank: rate it by frequency.
            d[k] = difficulty_level_from_frequency(k, d1)
    return d