1
0
Fork 0

Compare commits

...

1 Commits

1 changed files with 43 additions and 18 deletions

View File

@ -8,6 +8,7 @@
import pickle import pickle
import math import math
from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order
import snowballstemmer
def load_record(pickle_fname): def load_record(pickle_fname):
@ -18,6 +19,12 @@ def load_record(pickle_fname):
def difficulty_level_from_frequency(word, d): def difficulty_level_from_frequency(word, d):
"""
根据单词的频率进行难度的评级
:param word:
:param d:
:return:
"""
level = 1 level = 1
if not word in d: if not word in d:
return level return level
@ -30,26 +37,44 @@ def difficulty_level_from_frequency(word, d):
return level return level
def get_difficulty_level_for_words_and_tests(dic):
    """
    Rate the difficulty of the words in the word bank from their test tags.

    :param dic: dict holding the words loaded from the word-bank pickle file;
        each word maps to the test types it is tagged with (e.g., CET4, CET6,
        BBC). Presumably a list or string of tags -- only ``in`` is used on it.
    :return: dict mapping word -> difficulty level, e.g. {'apple': 4, ...}.
        A word that carries none of the recognised tags is left out.
    """
    # Checked in this order, so a word tagged with several tests receives the
    # lowest applicable level (CET4 before CET6 before BBC).
    tag_levels = (('CET4', 4), ('CET6', 6), ('BBC', 8))

    d = {}
    for word, tags in dic.items():
        for tag, level in tag_levels:
            if tag in tags:
                d[word] = level
                break
        # No debugging print here: printing d[word] raised KeyError for words
        # that matched none of the tags above.
    return d
def get_difficulty_level(d1, d2):
    """
    Rate the word-bank words and the words the user does not know.

    :param d1: the words the user does not know, keyed by word. It is passed
        unchanged to difficulty_level_from_frequency as the lookup table
        (presumably word -> frequency; confirm against the caller).
    :param d2: the word bank (about 27,000 tagged words), keyed by word, with
        the test tags of each word as value.
    :return: dict mapping word -> difficulty level. It contains every word of
        the bank that could be rated from its tags, plus every word of d1.
    """
    # Rate by test tag first; this only works for words present in the bank.
    levels = get_difficulty_level_for_words_and_tests(d2)
    stemmer = snowballstemmer.stemmer('english')

    for word in d1:
        if word in levels:
            # The unknown word appears in the bank in exactly this form and
            # has already been rated, so there is nothing to do.
            continue
        stem = stemmer.stemWord(word)
        if stem in levels:
            # The stem of the word is itself a rated bank word: treat both as
            # having the same difficulty.
            levels[word] = levels[stem]
        else:
            # Not covered by the bank at all: rate it by frequency instead.
            levels[word] = difficulty_level_from_frequency(word, d1)
    # Lookups replace the former scan over the whole bank, which inserted into
    # the dict it was iterating over and stored stem matches in d1 instead of
    # in the returned mapping.
    return levels