"""
Vocabulary difficulty estimation.

Rates a word by the exams / word lists it appears in, and derives a
vocabulary level for a user (from recently looked-up words) or for an
article (from its hardest words).
"""

import re
import pickle
from typing import Dict, List, Tuple, Union

# Compiled once at import time: runs of word characters and hyphens.
WORD_PATTERN = re.compile(r'\b[\w-]+\b')


def load_record(pickle_fname: str) -> Dict[str, List[str]]:
    """
    Load the pickled word -> test-type-list mapping.

    Args:
        pickle_fname: Path of the pickle file to read.

    Returns:
        Whatever object the pickle contains; callers expect a dict mapping
        each word to the list of test types it appears in.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the file cannot be unpickled (corrupt, truncated,
            empty, or referring to classes that cannot be resolved).
    """
    # SECURITY: pickle.load executes arbitrary code embedded in the file.
    # Only ever point this at trusted, locally produced data.
    try:
        with open(pickle_fname, 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError as err:
        raise FileNotFoundError(f"Pickle文件 {pickle_fname} 未找到") from err
    except (pickle.PickleError, EOFError, AttributeError,
            ImportError, IndexError) as err:
        # Unpickling damaged data raises more than PickleError: an empty or
        # truncated file raises EOFError, for example.  Normalise them all
        # to the documented ValueError and keep the cause for debugging.
        raise ValueError(f"Pickle文件 {pickle_fname} 损坏或格式不正确") from err


def convert_test_type_to_difficulty_level(d: Dict[str, List[str]]) -> Dict[str, int]:
    """
    Map every word to a numeric difficulty level based on its test types.

    Difficulty levels:
        0: unknown / unclassified
        4: CET4
        5: OXFORD3000
        6: CET6 or GRADUATE
        7: IELTS or OXFORD5000
        8: BBC

    Args:
        d: Mapping from word to the list of test types it appears in.

    Returns:
        Mapping from word to difficulty level.
    """
    # NOTE(review): only the first branch ('CET4') and the final fallback
    # (0) of this chain were visible when this module was reconstructed;
    # the branches in between follow the level table in the docstring.
    # Confirm the branch order (first match wins) against the project
    # history.
    result = {}
    for word, test_types in d.items():
        if 'CET4' in test_types:
            result[word] = 4
        elif 'OXFORD3000' in test_types:
            result[word] = 5
        elif 'CET6' in test_types or 'GRADUATE' in test_types:
            result[word] = 6
        elif 'IELTS' in test_types or 'OXFORD5000' in test_types:
            result[word] = 7
        elif 'BBC' in test_types:
            result[word] = 8
        else:
            result[word] = 0
    return result


class VocabularyLevelEstimator:
    """
    Base class that looks up word difficulty levels.

    The word data is loaded lazily on the first lookup and cached on this
    base class, so all subclasses share a single copy.
    """

    # Pickle file read on first use (resolved against the working directory).
    _data_file = 'words_and_tests.p'
    # Raw word -> test types mapping; None until loaded.
    _test_raw = None
    # Word -> difficulty level mapping; None until loaded.
    _difficulty_dict = None

    @classmethod
    def _load_data(cls):
        """Load and convert the word data unless it is already available."""
        if cls._difficulty_dict is None:
            raw = load_record(cls._data_file)
            # Store on the base class explicitly: assigning through ``cls``
            # would bind the cache on whichever subclass asked first and
            # make every other subclass reload the file.
            VocabularyLevelEstimator._test_raw = raw
            VocabularyLevelEstimator._difficulty_dict = (
                convert_test_type_to_difficulty_level(raw)
            )

    @classmethod
    def get_word_level(cls, word: str) -> int:
        """
        Return the difficulty level of a word.

        Args:
            word: The word to look up (matched exactly as given).

        Returns:
            The word's difficulty level, or 0 if the word is not known.
        """
        cls._load_data()
        return cls._difficulty_dict.get(word, 0)


class UserVocabularyLevel(VocabularyLevelEstimator):
    """
    Estimate a user's vocabulary level from the words they looked up
    most recently.
    """

    # How many of the most recently looked-up words are considered.
    RECENT_WORD_COUNT = 3

    def __init__(self, d: Dict[str, List[int]]):
        """
        Args:
            d: Mapping from word to the list of its lookup timestamps.
               Words with an empty timestamp list are ignored.
        """
        self.d = d
        # Latest lookup time per word, newest first.
        word_time = [(word, max(times)) for word, times in d.items() if times]
        sorted_words = sorted(word_time, key=lambda x: x[1], reverse=True)
        self.recent_words = [
            word for word, _ in sorted_words[:self.RECENT_WORD_COUNT]
        ]

    @property
    def level(self) -> float:
        """
        Average difficulty of the recent words that have a known level.

        Returns:
            The mean level of the recent words whose level is above 0,
            or 0 if there is no such word.
        """
        levels = [self.get_word_level(word) for word in self.recent_words]
        valid_levels = [lvl for lvl in levels if lvl > 0]
        return sum(valid_levels) / len(valid_levels) if valid_levels else 0


class ArticleVocabularyLevel(VocabularyLevelEstimator):
    """
    Estimate an article's vocabulary level from its most difficult words.
    """

    # How many of the hardest word occurrences are averaged.
    TOP_WORD_COUNT = 5

    def __init__(self, content: str):
        """
        Args:
            content: The article text.

        Raises:
            ValueError: If content is empty or not a string.
        """
        if not isinstance(content, str) or not content:
            raise ValueError("文章内容必须是非空字符串")

        self.content = content
        # Lower-case before tokenising; every occurrence of a word counts.
        words = WORD_PATTERN.findall(content.lower())
        word_levels = [self.get_word_level(word) for word in words]
        # Keep only known words, hardest first; slicing an empty list
        # already yields an empty list.
        valid_levels = sorted((lvl for lvl in word_levels if lvl > 0),
                              reverse=True)
        self.top_levels = valid_levels[:self.TOP_WORD_COUNT]

    @property
    def level(self) -> float:
        """
        Average difficulty of the hardest words in the article.

        Returns:
            The mean of the highest levels found (at most TOP_WORD_COUNT
            of them), or 0 if the article contains no known word.
        """
        if not self.top_levels:
            return 0
        return sum(self.top_levels) / len(self.top_levels)