# Recovered from git patch 375b46cfb5225d27ecf6e1136618379be535325c
# (dated 2025-05-29), which creates app/vocabulary.py.
'''
   Estimate a user's vocabulary level given his vocabulary data
   Estimate an English article's difficulty level given its content
   Preliminary design

   Hui, 2024-09-23
   Last updated: 2024-09-25, 2024-09-30
'''

import pickle

import nltk

# Difficulty score assigned to each word source; a higher score means harder.
DIFFICULTY_MAPPING = {
    'BBC': 2,  # basic vocabulary
    'CET4': 3,  # CET-4 (college English)
    'CET6': 4,  # CET-6
    'GRADUATE': 5,  # postgraduate entrance exam vocabulary
    'IELTS': 6,  # IELTS
    'OXFORD3000': 4,  # Oxford 3000 core words
    'OXFORD5000': 7  # Oxford 5000 words
}


def load_record(pickle_fname):
    """Load and return the object stored in the pickle file *pickle_fname*.

    Raises whatever ``open`` or ``pickle.load`` raise (e.g.
    ``FileNotFoundError`` when the file does not exist).
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(pickle_fname, 'rb') as f:
        return pickle.load(f)


class VocabularyLevelEstimator:
    """Base class that turns a list of words into a difficulty level.

    Subclasses are expected to set ``self.word_lst`` (the words to score);
    this class never assigns it itself.
    """

    # Map a word to the sources where it appears.  Loaded once, when the
    # class body is executed, from a path relative to the working directory.
    _test = load_record('words_and_tests.p')

    @property
    def level(self):
        """Return the mean difficulty of the words in ``self.word_lst``.

        Only words present in ``_test`` are counted.  A word's difficulty is
        the highest ``DIFFICULTY_MAPPING`` score among its sources; sources
        missing from the mapping score 0.  Returns 0.0 when ``word_lst`` is
        empty or none of its words are known.
        """
        if not self.word_lst:
            return 0.0
        scores = [
            # default=0 keeps a word with an empty source list from raising
            # ValueError; it is scored like a word with unknown sources.
            max(
                (DIFFICULTY_MAPPING.get(src, 0) for src in self._test[word]),
                default=0,
            )
            for word in self.word_lst
            if word in self._test
        ]
        return sum(scores) / len(scores) if scores else 0.0


class UserVocabularyLevel(VocabularyLevelEstimator):
    """Estimate a user's level from the words they looked up most recently."""

    def __init__(self, d, recent_n=3):
        """Pick the ``recent_n`` most recent words from *d*.

        d: mapping from word to a sequence whose last element is used as the
           sort key (presumably the latest lookup timestamp -- confirm
           against the code that writes the frequency pickle).
        recent_n: number of most recent words to keep in ``word_lst``.
        """
        self.d = d
        self.recent_n = recent_n
        # A word with an empty history has no "latest" entry to sort by;
        # skip it instead of failing with IndexError on d[word][-1].
        dated_words = [word for word in d if len(d[word]) > 0]
        # Newest first, then keep the first recent_n words.
        dated_words.sort(key=lambda word: d[word][-1], reverse=True)
        self.word_lst = dated_words[:recent_n]
class ArticleVocabularyLevel(VocabularyLevelEstimator):
    """Estimate an article's difficulty from its hardest words."""

    def __init__(self, content, top_n=10):
        """Tokenize *content* and keep its ``top_n`` most difficult words.

        content: article text (a string).
        top_n: how many of the hardest words to keep in ``word_lst``;
               defaults to 10.

        Duplicated words are kept, so a hard word that occurs several times
        can occupy several of the ``top_n`` slots.
        """
        self.content = content
        self.top_n = top_n
        import re
        from nltk.corpus import stopwords

        # Fetch the stop-word corpus only when it is missing, instead of
        # calling nltk.download() on every instantiation.
        try:
            stop_words = set(stopwords.words('english'))
        except LookupError:
            nltk.download('stopwords')
            stop_words = set(stopwords.words('english'))

        # Preprocess: lowercase, split into word tokens (which drops
        # punctuation), then remove stop words.
        tokens = re.findall(r'\b\w+\b', content.lower())
        candidates = [tok for tok in tokens if tok not in stop_words]

        # Keep the top_n words with the highest difficulty score.
        candidates.sort(key=self._get_difficulty, reverse=True)
        self.word_lst = candidates[:top_n]

    def _get_difficulty(self, word):
        """Return the highest difficulty score among *word*'s sources.

        Returns 0 when the word is not in ``_test``, when it has no sources,
        or when none of its sources appear in ``DIFFICULTY_MAPPING``.
        """
        if word not in self._test:
            return 0
        # default=0 avoids ValueError when the source list is empty.
        return max(
            (DIFFICULTY_MAPPING.get(src, 0) for src in self._test[word]),
            default=0,
        )


if __name__ == '__main__':
    # Demo: print a user's level and an article's level.
    d = load_record('frequency_mrlan85.pickle')
    print(d)
    print("======================================================")
    user = UserVocabularyLevel(d)
    print(user.level)  # level is a property
    print("======================================================")
    article = ArticleVocabularyLevel('This is an interesting article')
    print(article.level)