###########################################################################
# Copyright 2019 (C) Hui Lan
# Written permission must be obtained from the author for commercial uses.
###########################################################################

# Purpose: compute difficulty level of an English text
#
# Provenance: introduced as app/vocabulary.py by commit 72fa5127
# ("Added a vocabulary.py,fixed bug585...likely,at least it works").

import math
import pickle

import snowballstemmer

from wordfreqCMD import (
    remove_punctuation,
    freq,
    sort_in_descending_order,
    sort_in_ascending_order,
    map_percentages_to_levels,
)


# Ordered (level, tags) rules: the first rule with a matching tag wins, so a
# word tagged both CET4 and BBC is rated 4, not 8.
_LEVEL_RULES = (
    (4, ('CET4',)),
    (5, ('OXFORD3000',)),
    (6, ('CET6', 'GRADUATE')),
    (7, ('OXFORD5000', 'IELTS')),
    (8, ('BBC',)),
)

# Level given to a user word when neither the word nor its stem is rated.
_DEFAULT_LEVEL = 3

# Placeholder timestamp (YYYYMMDDHH) used for legacy integer-count records.
# The date itself has no particular meaning.
_LEGACY_TIMESTAMP = '2021082019'


def load_record(pickle_fname):
    """Return the object stored in the pickle file *pickle_fname*.

    NOTE(security): ``pickle.load`` can execute arbitrary code while
    deserialising; only use this on files the application wrote itself.
    """
    with open(pickle_fname, 'rb') as f:
        return pickle.load(f)


# Cache of {word: level}, filled by convert_test_type_to_difficulty_level().
ENGLISH_WORD_DIFFICULTY_DICT = {}


def convert_test_type_to_difficulty_level(d):
    """Rate the difficulty of every word in the word bank.

    :param d: mapping loaded from the word-bank pickle; each word maps to the
        test types it appears in (e.g. CET4, CET6, BBC).
    :return: ``{word: level}``, e.g. ``{'apple': 4, ...}``.  Words that carry
        none of the recognised test types are left out.

    Side effect: the result is also stored in the module-level
    ``ENGLISH_WORD_DIFFICULTY_DICT`` so that later calls to
    ``get_difficulty_level_for_user`` can reuse it.
    """
    global ENGLISH_WORD_DIFFICULTY_DICT

    result = {}
    for word, sources in d.items():
        for level, tags in _LEVEL_RULES:
            if any(tag in sources for tag in tags):
                result[word] = level
                break

    ENGLISH_WORD_DIFFICULTY_DICT = result
    return result


def get_difficulty_level_for_user(d1, d2):
    """Return word-bank difficulty levels extended with the user's words.

    :param d1: the user's unknown words (iterated for its keys).
    :param d2: the tagged word bank (word -> test types).  It is only read
        when the difficulty cache is still empty; afterwards the cached
        ratings are used and this argument is ignored.
    :return: a new ``{word: level}`` dict containing every rated word-bank
        entry plus every word of *d1*.  A user word missing from the word
        bank takes the level of its stem, or level 3 when the stem is
        unknown as well.
    """
    # Converting the whole word bank is expensive, so do it only once.
    if not ENGLISH_WORD_DIFFICULTY_DICT:
        convert_test_type_to_difficulty_level(d2)

    # Work on a copy: writing user-specific words into the shared cache would
    # leak one caller's vocabulary into every later call and let the cache
    # grow without bound.
    levels = dict(ENGLISH_WORD_DIFFICULTY_DICT)

    stemmer = snowballstemmer.stemmer('english')
    for word in d1:
        if word in levels:
            continue  # already rated in its original form
        levels[word] = levels.get(stemmer.stemWord(word), _DEFAULT_LEVEL)
    return levels


def revert_dict(d):
    """Invert a word -> timestamps mapping into timestamp -> words.

    In *d*, a word is the key and the value is a list of timestamp strings.
    In the returned dict, the key is the first 10 characters of a timestamp
    (date down to the hour) and the value is the list of words picked at
    that time.

    For backward compatibility a value may also be an int (how many times the
    word was added, as in ``{'word': 1}``); it is expanded to that many
    copies of a fixed placeholder timestamp.  Values of any other type are
    skipped instead of silently reusing the previous word's timestamps.
    """
    d2 = {}
    for word, value in d.items():
        if isinstance(value, list):
            timestamps = value
        elif isinstance(value, int) and not isinstance(value, bool):
            timestamps = value * [_LEGACY_TIMESTAMP]
        else:
            continue  # unsupported record shape

        for time_info in timestamps:
            date = time_info[:10]  # until hour
            d2.setdefault(date, []).append(word)
    return d2


class VocabularyLevelEstimator:
    """Base class for vocabulary-level estimators.

    Subclasses must set ``self.word_lst`` to the words to be rated.
    """

    # Loaded once, when the class is defined, and shared by all instances.
    _test = load_record('words_and_tests.p')  # map a word to the sources where it appears

    @property
    def level(self):
        """Return the mean difficulty of ``self.word_lst``.

        The per-word difficulty is not accumulated yet (see TODO), so the
        result is currently always 0.0.  Each word is printed together with
        its sources when it is found in the word bank.
        """
        total = 0.0  # TODO: need to compute this number
        num = 0
        for word in self.word_lst:
            num += 1
            if word in self._test:
                print(f'{word} : {self._test[word]}')
            else:
                print(f'{word}')
        # Divide by the real word count; an empty list has level 0.0.
        return total / num if num else 0.0


class UserVocabularyLevel(VocabularyLevelEstimator):
    """Vocabulary level of a user, from the user's word record *d*."""

    def __init__(self, d):
        self.d = d
        self.word_lst = list(d.keys())
        # TODO: just look at the most recently-added words


class ArticleVocabularyLevel(VocabularyLevelEstimator):
    """Vocabulary level of an article given as the string *content*."""

    def __init__(self, content):
        self.content = content
        self.word_lst = content.lower().split()
        # TODO: select the 10 most difficult words


if __name__ == '__main__':
    d = load_record('frequency_mrlan85.pickle')
    print(d)
    user = UserVocabularyLevel(d)
    print(user.level)  # level is a property
    article = ArticleVocabularyLevel('This is an interesting article')
    print(article.level)