'''
Estimate a user's vocabulary level given his vocabulary data
Estimate an English article's difficulty level given its content
Preliminary design

Hui, 2024-09-23
Last updated: 2024-09-25, 2024-09-30
'''

import functools
import pickle
import re

import nltk
from nltk.corpus import stopwords


# Difficulty score assigned to each word source; a higher score means harder.
DIFFICULTY_MAPPING = {
    'BBC': 2,         # basic vocabulary
    'CET4': 3,        # CET-4 (College English Test, band 4)
    'CET6': 4,        # CET-6
    'GRADUATE': 5,    # graduate entrance exam vocabulary
    'IELTS': 6,       # IELTS
    'OXFORD3000': 4,  # Oxford 3000 core words
    'OXFORD5000': 7   # Oxford 5000 words
}

# Compiled once at import time; used to split article text into word tokens.
_WORD_RE = re.compile(r'\b\w+\b')


def load_record(pickle_fname):
    """Return the object stored in the pickle file *pickle_fname*.

    SECURITY NOTE: unpickling can execute arbitrary code, so this must only
    be used on files from a trusted source.
    """
    with open(pickle_fname, 'rb') as f:
        d = pickle.load(f)
    return d


@functools.lru_cache(maxsize=None)
def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loaded only once.

    The corpus is downloaded only when it is not already installed, instead
    of contacting the NLTK server every time an article is analysed.
    """
    try:
        return frozenset(stopwords.words('english'))
    except LookupError:
        # Corpus not installed locally yet: fetch it, then retry.
        nltk.download('stopwords')
        return frozenset(stopwords.words('english'))


def _recency_key(history):
    """Return a sort key built from the last item of *history*.

    An empty history yields a key that compares lower than any non-empty
    one, so such words end up last in a descending sort instead of raising
    IndexError.
    """
    if history:
        return (True, history[-1])
    return (False, None)


class VocabularyLevelEstimator:
    """Base class: averages the difficulty of the words in ``self.word_lst``.

    Subclasses are responsible for setting ``self.word_lst``.
    """

    # map a word to the sources where it appears
    _test = load_record('words_and_tests.p')

    def _get_difficulty(self, word):
        """Return the highest difficulty among the sources listing *word*.

        Returns 0 when the word is unknown, when it has no sources, or when
        none of its sources appears in DIFFICULTY_MAPPING.
        """
        if word not in self._test:
            return 0
        sources = self._test[word]
        # default=0 keeps an empty source collection from raising ValueError.
        return max((DIFFICULTY_MAPPING.get(src, 0) for src in sources), default=0)

    @property
    def level(self):
        """Average difficulty of the known words in ``self.word_lst``.

        Words absent from ``_test`` are left out of the average. Returns 0.0
        when the word list is empty or contains no known word.
        """
        if not self.word_lst:
            return 0.0
        scores = [
            self._get_difficulty(word)
            for word in self.word_lst
            if word in self._test
        ]
        if not scores:
            return 0.0
        return sum(scores) / len(scores)


class UserVocabularyLevel(VocabularyLevelEstimator):
    """Vocabulary level computed from a user's most recent words."""

    def __init__(self, d, recent_n=3):
        """Select the *recent_n* most recent words of *d*.

        d: mapping from a word to a sequence; the last item of each sequence
           is used as the sort key (presumably a timestamp - confirm against
           the code that writes the pickle).
        recent_n: number of most recent words to keep.
        """
        self.d = d
        self.recent_n = recent_n
        # Sort by the last recorded item, newest first, and keep recent_n words.
        sorted_words = sorted(
            d.keys(),
            key=lambda word: _recency_key(d[word]),
            reverse=True
        )
        self.word_lst = sorted_words[:recent_n]


class ArticleVocabularyLevel(VocabularyLevelEstimator):
    """Difficulty level computed from the hardest words of an article."""

    def __init__(self, content, top_n=10):
        """Tokenise *content* and keep its *top_n* most difficult words.

        content: the article text.
        top_n: how many of the most difficult words to keep (default 10).
        """
        self.content = content
        self.top_n = top_n
        # Preprocess: lowercase, tokenise (which drops punctuation), and
        # remove stop words.
        stop_words = _english_stop_words()
        words = _WORD_RE.findall(content.lower())
        candidates = [word for word in words if word not in stop_words]
        # Rank by difficulty, hardest first, and keep the top_n words.
        self.word_lst = sorted(
            candidates,
            key=self._get_difficulty,
            reverse=True
        )[:top_n]


if __name__ == '__main__':
    d = load_record('frequency_mrlan85.pickle')
    print(d)
    print("======================================================")
    user = UserVocabularyLevel(d)
    print(user.level)  # level is a property
    print("======================================================")
    article = ArticleVocabularyLevel('This is an interesting article')
    print(article.level)