"""
Estimate a user's vocabulary level given his vocabulary data
Estimate an English article's difficulty level given its content
Preliminary design

Hui, 2024-09-23
Last updated: 2024-09-25, 2024-09-30
"""

import pickle
import re
from collections import defaultdict


def load_record(pickle_fname):
    """Return the object stored in the pickle file *pickle_fname*.

    Args:
        pickle_fname: Path of the pickle file to read.

    Returns:
        Whatever object was pickled into the file.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the content is not a valid pickle.
    """
    # NOTE(security): unpickling can execute arbitrary code, so this must
    # only ever be pointed at trusted, locally produced files.
    with open(pickle_fname, 'rb') as f:
        return pickle.load(f)


class VocabularyLevelEstimator:
    """Base class that scores a single word by the sources that list it."""

    # Word -> sources lookup, loaded once when the class body is executed
    # (i.e. at import time).  Presumably each value is a collection of
    # source names such as 'CET4' -- TODO confirm against the data file.
    _test = load_record('words_and_tests.p')

    # Source name -> difficulty score.
    _source_levels = {
        'BBC': 1,
        'CET4': 2,
        'CET6': 3,
        'GRADUATE': 4,
        'OXFORD3000': 1,
        'TOEFL': 5,
        'IELTS': 5,
        'GRE': 7,
    }

    def get_word_level(self, word):
        """Return the difficulty score of *word*.

        The score is the highest score among the word's sources that appear
        in ``_source_levels``.  A word that is not in ``_test``, or whose
        sources are all unrecognised, scores 0.
        """
        if word not in self._test:
            return 0  # unknown word
        return max(
            (
                self._source_levels[src]
                for src in self._test[word]
                if src in self._source_levels
            ),
            default=0,
        )


class UserVocabularyLevel(VocabularyLevelEstimator):
    """Estimate a user's level from the words they recorded most recently."""

    def __init__(self, d, recent_count=3):
        """Select the most recently recorded words from *d*.

        Args:
            d: Mapping of word -> collection of timestamps.  Timestamps only
                need to be mutually comparable; presumably they are date
                strings -- TODO confirm against the caller.
            recent_count: How many of the most recent words to keep.
        """
        self.d = d
        # A word with no timestamps cannot be ranked by recency, and calling
        # max() on its empty collection would raise ValueError, so skip it.
        dated = [(word, stamps) for word, stamps in d.items() if stamps]
        # Newest first, judged by each word's latest timestamp.
        dated.sort(key=lambda item: max(item[1]), reverse=True)
        self.word_lst = [word for word, _ in dated[:recent_count]]

    @property
    def level(self):
        """Highest word score among the selected words (0.0 if there are none)."""
        if not self.word_lst:
            return 0.0
        return max(self.get_word_level(word) for word in self.word_lst)


class ArticleVocabularyLevel(VocabularyLevelEstimator):
    """Estimate an article's difficulty from its highest-weighted words."""

    def __init__(self, content):
        """Tokenise *content* and collect the scores of its top-weighted words.

        Args:
            content: The article text.
        """
        self.content = content
        self.difficult_words = []

        # The pattern also matches tokens containing hyphens, digits or
        # underscores, but the isalpha() filter below discards them, so only
        # purely alphabetic (lower-cased) tokens are counted.
        tokens = re.findall(r'\b[\w-]+\b', content.lower())
        word_freq = defaultdict(int)
        for token in tokens:
            if token.isalpha():
                word_freq[token] += 1

        # (frequency * score, score, word) for every word with a known score.
        weighted_scores = []
        for word, freq in word_freq.items():
            score = self.get_word_level(word)
            if score > 0:
                weighted_scores.append((score * freq, score, word))

        # No scored word at all: difficult_words stays empty.
        if not weighted_scores:
            return

        # Highest weighted score first; ties fall back to score, then word.
        weighted_scores.sort(reverse=True)

        # Keep the top 20% of the scored words, but at least 5 and at most 15
        # (fewer only when fewer scored words exist).
        num_top_words = max(5, min(15, len(weighted_scores) // 5))
        self.difficult_words = [
            score for _, score, _ in weighted_scores[:num_top_words]
        ]

    @property
    def level(self):
        """Highest score among the retained words (0.0 if there are none)."""
        if not self.difficult_words:
            return 0.0
        return max(self.difficult_words)


if __name__ == '__main__':
    d = load_record('frequency_mrlan85.pickle')
    print(d)
    user = UserVocabularyLevel(d)
    print(user.level)  # level is a property
    article = ArticleVocabularyLevel('This is an interesting article')
    print(article.level)