###########################################################################
# Copyright 2019 (C) Hui Lan
# Written permission must be obtained from the author for commercial uses.
###########################################################################

# Purpose: compute difficulty level of an English text

import math
import pickle

import snowballstemmer

from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order, map_percentages_to_levels


def load_record(pickle_fname):
    """
    Load and return the object stored in a pickle file.

    :param pickle_fname: path of the pickle file to read
    :return: the unpickled object
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(pickle_fname, 'rb') as f:
        return pickle.load(f)


# Module-level cache of word -> difficulty level.  It is (re)assigned by
# convert_test_type_to_difficulty_level() and extended in place by
# get_difficulty_level_for_user().
ENGLISH_WORD_DIFFICULTY_DICT = {}


def convert_test_type_to_difficulty_level(d):
    """
    Assign a difficulty level to the words of the word bank.

    The first matching rule wins: CET4 -> 4, OXFORD3000 -> 5,
    CET6/GRADUATE -> 6, OXFORD5000/IELTS -> 7, BBC -> 8.  A word that matches
    none of these test types is left out of the result.

    As a side effect, the module-level ENGLISH_WORD_DIFFICULTY_DICT is rebound
    to the returned dictionary (the very same object).

    :param d: dictionary holding the words of the word-bank pickle file; each
              value holds the test types (e.g., CET4, CET6, BBC) of that word
    :return: dictionary mapping word to level, e.g. {'apple': 4, ...}
    """
    global ENGLISH_WORD_DIFFICULTY_DICT

    result = {}
    # in d, we have test types (e.g., CET4,CET6,BBC) for each word
    for word, test_types in d.items():
        if 'CET4' in test_types:
            result[word] = 4  # CET4 word has level 4
        elif 'OXFORD3000' in test_types:
            result[word] = 5
        elif 'CET6' in test_types or 'GRADUATE' in test_types:
            result[word] = 6
        elif 'OXFORD5000' in test_types or 'IELTS' in test_types:
            result[word] = 7
        elif 'BBC' in test_types:
            result[word] = 8

    ENGLISH_WORD_DIFFICULTY_DICT = result
    return result  # {'apple': 4, ...}


def get_difficulty_level_for_user(d1, d2):
    """
    Return a word -> level dictionary that also covers the user's words.

    Words are added to the existing level dictionary; no new dictionary is
    created.  NOTE: the returned object is the module-level cache
    ENGLISH_WORD_DIFFICULTY_DICT itself, so the entries added here persist
    across calls.

    :param d1: the words the user does not know (iterated for its keys)
    :param d2: the labelled words of the word bank (word -> test types); only
               used when the cache has not been filled yet
    :return: dictionary mapping word to difficulty level
    """
    # TODO: convert_test_type_to_difficulty_level() should not be called every time.  Each word's difficulty level should be pre-computed.
    if not ENGLISH_WORD_DIFFICULTY_DICT:
        # rate the words according to their labels: {'apple': 4, 'abandon': 4, ...}
        d2 = convert_test_type_to_difficulty_level(d2)
    else:
        d2 = ENGLISH_WORD_DIFFICULTY_DICT

    stemmer = snowballstemmer.stemmer('english')

    for k in d1:  # the user's words
        if k in d2:  # the word itself is already rated: nothing to do
            continue
        # Rate the word like its stem if the stem is rated; if not even the
        # stem is known, treat the word as level 3.
        stem = stemmer.stemWord(k)
        d2[k] = d2.get(stem, 3)

    return d2


def revert_dict(d):
    """
    In d, word is the key, and value is a list of dates.
    In d2 (the returned value of this function), time is the key, and the
    value is a list of words picked at that time.

    For backward compatibility a value may also be an int n (d was sth like
    {'word': 1}); the word is then recorded n times under a fixed placeholder
    date.  Entries whose value is neither a list nor an int are skipped, so
    that a word is never filed under the dates of the previous word.

    :param d: dictionary mapping word to a list of date strings (or an int)
    :return: dictionary mapping date (first 10 characters, i.e. until hour)
             to the list of words picked at that time
    """
    d2 = {}
    for word, value in d.items():
        if isinstance(value, list):  # value is a list of dates.
            dates = value
        elif isinstance(value, int):
            # For backward compatibility: value is not a list of dates, but a
            # number representing how frequently this word had been added to
            # the new word book.
            # why choose this date? No particular reasons. I fix the bug in this date.
            dates = value * ['2021082019']
        else:
            continue  # unknown value type: there are no dates to record

        for time_info in dates:
            date = time_info[:10]  # until hour
            d2.setdefault(date, []).append(word)
    return d2


class VocabularyLevelEstimator:
    """
    Base class for vocabulary level estimators.

    Subclasses must set self.word_lst (the words to look at) in __init__.
    """

    # Loaded once, when the class body is executed.
    _test = load_record('words_and_tests.p')  # map a word to the sources where it appears

    @property
    def level(self):
        """
        Print every word of self.word_lst (with its sources when the word is
        known) and return the estimated level.

        The level itself is not computed yet: total is never updated, so the
        returned value is currently always 0.0.
        """
        total = 0.0  # TODO: need to compute this number
        num = 1  # starts at 1, so the division below cannot fail on an empty word list
        for word in self.word_lst:
            num += 1
            if word in self._test:
                print(f'{word} : {self._test[word]}')
            else:
                print(f'{word}')
        return total/num


class UserVocabularyLevel(VocabularyLevelEstimator):
    """Vocabulary level of a user, based on the user's word dictionary."""

    def __init__(self, d):
        """
        :param d: dictionary whose keys are the user's words
        """
        self.d = d
        # TODO(review): the intent is to just look at the most recently-added
        # words; for now every key is used.
        self.word_lst = list(d.keys())


class ArticleVocabularyLevel(VocabularyLevelEstimator):
    """Vocabulary level of an article, based on its text."""

    def __init__(self, content):
        """
        :param content: text of the article
        """
        self.content = content
        # TODO(review): the intent is to select the 10 most difficult words;
        # for now every lower-cased, whitespace-separated token is used.
        self.word_lst = content.lower().split()


if __name__ == '__main__':
    d = load_record('frequency_mrlan85.pickle')
    print(d)
    user = UserVocabularyLevel(d)
    print(user.level)  # level is a property
    article = ArticleVocabularyLevel('This is an interesting article')
    print(article.level)