From 4a5fc9a7ce6c6d5c6fde6ffea813be9db70e96b5 Mon Sep 17 00:00:00 2001 From: AsWhitale <1838528479@qq.com> Date: Fri, 6 Jun 2025 21:00:29 +0800 Subject: [PATCH] =?UTF-8?q?code=20review=201.=E8=B0=83=E6=95=B4=E4=BA=86?= =?UTF-8?q?=E4=B8=80=E4=BA=9B=E6=96=B9=E6=B3=95=E5=92=8C=E7=B1=BB=E7=9A=84?= =?UTF-8?q?=E5=85=B3=E7=B3=BB=202.=E4=BC=98=E5=8C=96=E4=BA=86=E6=95=B0?= =?UTF-8?q?=E6=8D=AE=E5=8A=A0=E8=BD=BD=203.=E4=BF=AE=E6=AD=A3=E4=BA=86?= =?UTF-8?q?=E9=94=99=E8=AF=AF=E7=9A=84=E6=96=87=E4=BB=B6=E8=B7=AF=E5=BE=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/vocabulary.py | 195 +++++++++++++++++++++++++--------------------- 1 file changed, 106 insertions(+), 89 deletions(-) diff --git a/app/vocabulary.py b/app/vocabulary.py index c8f3cf2..029d905 100644 --- a/app/vocabulary.py +++ b/app/vocabulary.py @@ -1,76 +1,13 @@ -""" -词汇难度评估系统 -功能:根据单词在不同考试中的出现情况评估其难度级别,并计算用户或文章的词汇水平 -""" - import re import pickle +import os from typing import Dict, List, Tuple, Union +from collections import Counter # 预编译正则表达式提高性能 WORD_PATTERN = re.compile(r'\b[\w-]+\b') -def load_record(pickle_fname: str) -> Dict[str, List[str]]: - """ - 加载pickle格式的单词-考试类型数据 - - 参数: - pickle_fname: pickle文件名 - - 返回: - 字典格式的单词到考试类型列表的映射 - - 异常: - FileNotFoundError: 当文件不存在时抛出 - ValueError: 当pickle文件损坏时抛出 - """ - try: - with open(pickle_fname, 'rb') as f: - return pickle.load(f) - except FileNotFoundError: - raise FileNotFoundError(f"Pickle文件 {pickle_fname} 未找到") - except pickle.PickleError: - raise ValueError(f"Pickle文件 {pickle_fname} 损坏或格式不正确") - - -def convert_test_type_to_difficulty_level(d: Dict[str, List[str]]) -> Dict[str, int]: - """ - 将考试类型映射为难度级别 - - 难度级别定义: - 0: 未知/未分类 - 4: CET4 - 5: OXFORD3000 - 6: CET6或GRADUATE - 7: IELTS或OXFORD5000 - 8: BBC - - 参数: - d: 单词到考试类型列表的映射 - - 返回: - 单词到难度级别的映射 - """ - result = {} - for word, test_types in d.items(): - if 'CET4' in test_types: - result[word] = 4 - elif 'OXFORD3000' in test_types: - result[word] = 5 - elif 'CET6' in test_types or 'GRADUATE' in test_types: - result[word] = 6 - elif 'IELTS' in test_types: - result[word] = 7 - elif 'OXFORD5000' in test_types: - result[word] = 7 - elif 'BBC' in test_types: - result[word] = 8 - else: - result[word] = 0 - return result - - class VocabularyLevelEstimator: """ 词汇难度评估基类 @@ -82,13 +19,77 @@ class VocabularyLevelEstimator: """ _test_raw = None _difficulty_dict = None + PICKLE_PATH = 'static/words_and_tests.p' # 默认数据文件路径 @classmethod def _load_data(cls): """延迟加载数据,避免不必要的文件操作""" if cls._test_raw is None: - cls._test_raw = load_record('words_and_tests.p') - cls._difficulty_dict = convert_test_type_to_difficulty_level(cls._test_raw) + cls._test_raw = cls.load_record(cls.PICKLE_PATH) + cls._difficulty_dict = cls.convert_test_type_to_difficulty_level(cls._test_raw) + + @staticmethod + def load_record(pickle_fname: str) -> Dict[str, List[str]]: + """ + 加载pickle格式的单词-考试类型数据 + + 参数: + pickle_fname: pickle文件名 + + 返回: + 字典格式的单词到考试类型列表的映射 + + 异常: + FileNotFoundError: 当文件不存在时抛出 + ValueError: 当pickle文件损坏时抛出 + """ + try: + # 文件校验 + if not os.path.exists(pickle_fname): + raise FileNotFoundError(f"词汇数据文件 {pickle_fname} 未找到") + if not pickle_fname.endswith('.p'): + raise ValueError("仅支持.pickle格式文件") + + with open(pickle_fname, 'rb') as f: + return pickle.load(f) + except pickle.PickleError as e: + raise ValueError(f"Pickle文件 {pickle_fname} 损坏: {str(e)}") + + @staticmethod + def convert_test_type_to_difficulty_level(d: Dict[str, List[str]]) -> Dict[str, int]: + """ + 将考试类型映射为难度级别 + + 难度级别定义: + 0: 未知/未分类 + 4: CET4 + 5: OXFORD3000 + 6: CET6或GRADUATE + 7: IELTS或OXFORD5000 + 8: BBC + + 参数: + d: 单词到考试类型列表的映射 + + 返回: + 单词到难度级别的映射 + """ + result = {} + for word, test_types in d.items(): + word_lower = word.lower() # 统一小写处理 + if 'CET4' in test_types: + result[word_lower] = 4 + elif 'OXFORD3000' in test_types: + result[word_lower] = 5 + elif 'CET6' in test_types or 'GRADUATE' in test_types: + result[word_lower] = 6 + elif 'IELTS' in test_types or 'OXFORD5000' in test_types: + result[word_lower] = 7 + elif 'BBC' in test_types: + result[word_lower] = 8 + else: + result[word_lower] = 0 + return result @classmethod def get_word_level(cls, word: str) -> int: @@ -102,27 +103,32 @@ class VocabularyLevelEstimator: 单词的难度级别(0-8) """ cls._load_data() - return cls._difficulty_dict.get(word, 0) + return cls._difficulty_dict.get(word.lower(), 0) + + @classmethod + def reload_data(cls, new_path=None): + """强制重新加载词汇数据""" + if new_path: + cls.PICKLE_PATH = new_path + cls._test_raw = None + cls._difficulty_dict = None + cls._load_data() class UserVocabularyLevel(VocabularyLevelEstimator): - """ - 用户词汇水平评估 - 根据用户最近查询的单词评估其词汇水平 - """ + """用户词汇水平评估""" - def __init__(self, d: Dict[str, List[int]]): + def __init__(self, user_data: Dict[str, List[int]]): """ 初始化用户词汇数据 参数: - d: 单词到时间戳列表的映射 + user_data: 单词到时间戳列表的映射 """ - self.d = d # 获取每个单词的最新查询时间并排序 - word_time = [(word, max(times)) for word, times in d.items() if times] + word_time = [(word, max(times)) for word, times in user_data.items() if times] sorted_words = sorted(word_time, key=lambda x: x[1], reverse=True) - self.recent_words = [word for word, _ in sorted_words[:3]] + self.recent_words = [word for word, _ in sorted_words[:3]] # 取最近3个单词 @property def level(self) -> float: @@ -139,10 +145,7 @@ class UserVocabularyLevel(VocabularyLevelEstimator): class ArticleVocabularyLevel(VocabularyLevelEstimator): - """ - 文章词汇水平评估 - 根据文章中出现的最高难度单词评估文章词汇水平 - """ + """文章词汇水平评估""" def __init__(self, content: str): """ @@ -150,19 +153,19 @@ class ArticleVocabularyLevel(VocabularyLevelEstimator): 参数: content: 文章内容字符串 - - 异常: - ValueError: 当内容为空或不是字符串时抛出 """ if not content or not isinstance(content, str): - raise ValueError("文章内容必须是非空字符串") + self.top_levels = [] + return - self.content = content - # 提取所有单词并计算难度 + # 文本预处理:转换为小写并提取单词 words = WORD_PATTERN.findall(content.lower()) + + # 计算单词难度并筛选有效值 word_levels = [self.get_word_level(word) for word in words] - # 筛选有效难度并排序 valid_levels = sorted([lvl for lvl in word_levels if lvl > 0], reverse=True) + + # 取难度最高的5个单词 self.top_levels = valid_levels[:5] if valid_levels else [] @property @@ -176,4 +179,18 @@ class ArticleVocabularyLevel(VocabularyLevelEstimator): """ if not self.top_levels: return 0 - return sum(self.top_levels) / len(self.top_levels) \ No newline at end of file + return sum(self.top_levels) / len(self.top_levels) + + def word_frequency(self, top_n=10) -> Dict[str, int]: + """ + 获取文章词频统计 + + 参数: + top_n: 返回的最高频单词数量 + + 返回: + 词频最高的top_n个单词及其频率 + """ + words = WORD_PATTERN.findall(self.content.lower()) + word_freq = Counter(words) + return dict(word_freq.most_common(top_n))