# Test tags in priority order: when a word carries several tags, the first
# match wins, so a word tagged both CET4 and BBC is rated as CET4.
_TEST_TAG_LEVELS = (('CET4', 4), ('CET6', 6), ('BBC', 8))


def get_difficulty_level_for_words_and_tests(dic):
    """Rate the words of the tagged lexicon by the tests they appear in.

    :param dic: dict loaded from the lexicon pickle; maps each word to a
        container of test tags (e.g. 'CET4', 'CET6', 'BBC').
    :return: dict mapping word -> difficulty level, e.g. {'apple': 4, ...}.
        Words that carry none of the known tags are left out.
    """
    levels = {}
    for word, tags in dic.items():
        for tag, level in _TEST_TAG_LEVELS:
            if tag in tags:
                levels[word] = level
                break
    return levels


def get_difficulty_level(d1, d2):
    """Compute a difficulty level for lexicon words and the user's words.

    Rating rules, applied in this order to every word in ``d1``:

    1. The word itself is rated in the lexicon: keep the lexicon level.
    2. The word's English Snowball stem is rated in the lexicon: reuse the
       level of that lexicon word.
    3. Otherwise: fall back to ``difficulty_level_from_frequency``.

    Neither ``d1`` nor ``d2`` is modified.

    :param d1: dict keyed by the words the user does not know; it is passed
        unchanged to ``difficulty_level_from_frequency`` as its frequency
        table.
    :param d2: tagged lexicon, word -> container of test tags.
    :return: dict mapping word -> difficulty level, covering every rated
        lexicon word plus every word in ``d1``.
    """
    # Function-scope import keeps this block self-contained; it is a cheap
    # cache lookup when the module has already been imported.
    import snowballstemmer

    # Levels derived from test tags; only lexicon words appear here.
    levels = get_difficulty_level_for_words_and_tests(d2)
    stemmer = snowballstemmer.stemmer('english')

    for word in d1:
        if word in levels:
            # Already rated through its test tags; nothing to do.
            continue
        stem = stemmer.stemWord(word)
        if stem in levels:
            # Same stem as a rated lexicon word: treat as equally difficult.
            levels[word] = levels[stem]
        else:
            # Unknown to the lexicon: rate by frequency instead.
            levels[word] = difficulty_level_from_frequency(word, d1)

    return levels