From 4a5fc9a7ce6c6d5c6fde6ffea813be9db70e96b5 Mon Sep 17 00:00:00 2001
From: AsWhitale <1838528479@qq.com>
Date: Fri, 6 Jun 2025 21:00:29 +0800
Subject: [PATCH] =?UTF-8?q?code=20review=201.=E8=B0=83=E6=95=B4=E4=BA=86?=
 =?UTF-8?q?=E4=B8=80=E4=BA=9B=E6=96=B9=E6=B3=95=E5=92=8C=E7=B1=BB=E7=9A=84?=
 =?UTF-8?q?=E5=85=B3=E7=B3=BB=202.=E4=BC=98=E5=8C=96=E4=BA=86=E6=95=B0?=
 =?UTF-8?q?=E6=8D=AE=E5=8A=A0=E8=BD=BD=203.=E4=BF=AE=E6=AD=A3=E4=BA=86?=
 =?UTF-8?q?=E9=94=99=E8=AF=AF=E7=9A=84=E6=96=87=E4=BB=B6=E8=B7=AF=E5=BE=84?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 app/vocabulary.py | 195 +++++++++++++++++++++++++---------------------
 1 file changed, 106 insertions(+), 89 deletions(-)

diff --git a/app/vocabulary.py b/app/vocabulary.py
index c8f3cf2..029d905 100644
--- a/app/vocabulary.py
+++ b/app/vocabulary.py
@@ -1,76 +1,13 @@
-"""
-词汇难度评估系统
-功能:根据单词在不同考试中的出现情况评估其难度级别,并计算用户或文章的词汇水平
-"""
-
 import re
 import pickle
+import os
 from typing import Dict, List, Tuple, Union
+from collections import Counter
 
 # 预编译正则表达式提高性能
 WORD_PATTERN = re.compile(r'\b[\w-]+\b')
 
 
-def load_record(pickle_fname: str) -> Dict[str, List[str]]:
-    """
-    加载pickle格式的单词-考试类型数据
-
-    参数:
-    pickle_fname: pickle文件名
-
-    返回:
-    字典格式的单词到考试类型列表的映射
-
-    异常:
-    FileNotFoundError: 当文件不存在时抛出
-    ValueError: 当pickle文件损坏时抛出
-    """
-    try:
-        with open(pickle_fname, 'rb') as f:
-            return pickle.load(f)
-    except FileNotFoundError:
-        raise FileNotFoundError(f"Pickle文件 {pickle_fname} 未找到")
-    except pickle.PickleError:
-        raise ValueError(f"Pickle文件 {pickle_fname} 损坏或格式不正确")
-
-
-def convert_test_type_to_difficulty_level(d: Dict[str, List[str]]) -> Dict[str, int]:
-    """
-    将考试类型映射为难度级别
-
-    难度级别定义:
-    0: 未知/未分类
-    4: CET4
-    5: OXFORD3000
-    6: CET6或GRADUATE
-    7: IELTS或OXFORD5000
-    8: BBC
-
-    参数:
-    d: 单词到考试类型列表的映射
-
-    返回:
-    单词到难度级别的映射
-    """
-    result = {}
-    for word, test_types in d.items():
-        if 'CET4' in test_types:
-            result[word] = 4
-        elif 'OXFORD3000' in test_types:
-            result[word] = 5
-        elif 'CET6' in test_types or 'GRADUATE' in test_types:
-            result[word] = 6
-        elif 'IELTS' in test_types:
-            result[word] = 7
-        elif 'OXFORD5000' in test_types:
-            result[word] = 7
-        elif 'BBC' in test_types:
-            result[word] = 8
-        else:
-            result[word] = 0
-    return result
-
-
 class VocabularyLevelEstimator:
     """
     词汇难度评估基类
@@ -82,13 +19,77 @@ class VocabularyLevelEstimator:
     """
     _test_raw = None
     _difficulty_dict = None
+    PICKLE_PATH = 'static/words_and_tests.p'  # 默认数据文件路径
 
     @classmethod
     def _load_data(cls):
         """延迟加载数据,避免不必要的文件操作"""
         if cls._test_raw is None:
-            cls._test_raw = load_record('words_and_tests.p')
-            cls._difficulty_dict = convert_test_type_to_difficulty_level(cls._test_raw)
+            cls._test_raw = cls.load_record(cls.PICKLE_PATH)
+            cls._difficulty_dict = cls.convert_test_type_to_difficulty_level(cls._test_raw)
+
+    @staticmethod
+    def load_record(pickle_fname: str) -> Dict[str, List[str]]:
+        """
+        Load the pickled word -> test-type mapping from ``pickle_fname``.
+
+        Args:
+        pickle_fname: path of the pickle file; must end with ``.p``.
+
+        Returns:
+        Whatever object the pickle holds (expected: word -> list of test names).
+
+        Raises:
+        FileNotFoundError: the file does not exist.
+        ValueError: wrong extension, or the pickle is corrupt/truncated.
+        """
+        # Validate the path up front; these raises need no try block.
+        if not os.path.exists(pickle_fname):
+            raise FileNotFoundError(f"词汇数据文件 {pickle_fname} 未找到")
+        if not pickle_fname.endswith('.p'):
+            raise ValueError("仅支持.p格式文件")
+        try:
+            with open(pickle_fname, 'rb') as f:
+                return pickle.load(f)  # NOTE: only load trusted files
+        except (pickle.PickleError, EOFError) as e:
+            # EOFError: truncated pickle; it is not a PickleError subclass.
+            raise ValueError(f"Pickle文件 {pickle_fname} 损坏: {e}") from e
+
+    @staticmethod
+    def convert_test_type_to_difficulty_level(d: Dict[str, List[str]]) -> Dict[str, int]:
+        """
+        Map every word to a difficulty level derived from its test types.
+
+        Levels, checked in this priority order (first match wins):
+        4: CET4
+        5: OXFORD3000
+        6: CET6 or GRADUATE
+        7: IELTS or OXFORD5000
+        8: BBC
+        0: none of the above (unknown / unclassified)
+
+        Args:
+        d: mapping of word -> list of test-type names.
+
+        Returns:
+        Mapping of lower-cased word -> difficulty level.
+        """
+        rules = (  # priority-ordered (level, test names); earlier rows win
+            (4, ('CET4',)),
+            (5, ('OXFORD3000',)),
+            (6, ('CET6', 'GRADUATE')),
+            (7, ('IELTS', 'OXFORD5000')),
+            (8, ('BBC',)),
+        )
+        levels = {}
+        for word, test_types in d.items():
+            key = word.lower()  # keys are stored lower-cased
+            levels[key] = next(
+                (lvl for lvl, names in rules
+                 if any(name in test_types for name in names)),
+                0,  # no rule matched
+            )
+        return levels
 
     @classmethod
     def get_word_level(cls, word: str) -> int:
@@ -102,27 +103,32 @@ class VocabularyLevelEstimator:
         单词的难度级别(0-8)
         """
         cls._load_data()
-        return cls._difficulty_dict.get(word, 0)
+        return cls._difficulty_dict.get(word.lower(), 0)
+
+    @classmethod
+    def reload_data(cls, new_path=None):
+        """Force-reload vocabulary data; state is kept if loading fails."""
+        path = new_path or cls.PICKLE_PATH
+        raw = cls.load_record(path)  # may raise; nothing is modified yet
+        levels = cls.convert_test_type_to_difficulty_level(raw)
+        # Commit only after both steps succeeded.
+        cls.PICKLE_PATH, cls._test_raw, cls._difficulty_dict = path, raw, levels
 
 
 class UserVocabularyLevel(VocabularyLevelEstimator):
-    """
-    用户词汇水平评估
-    根据用户最近查询的单词评估其词汇水平
-    """
+    """用户词汇水平评估"""
 
-    def __init__(self, d: Dict[str, List[int]]):
+    def __init__(self, user_data: Dict[str, List[int]]):
         """
         初始化用户词汇数据
 
         参数:
-        d: 单词到时间戳列表的映射
+        user_data: 单词到时间戳列表的映射
         """
-        self.d = d
         # 获取每个单词的最新查询时间并排序
-        word_time = [(word, max(times)) for word, times in d.items() if times]
+        word_time = [(word, max(times)) for word, times in user_data.items() if times]
         sorted_words = sorted(word_time, key=lambda x: x[1], reverse=True)
-        self.recent_words = [word for word, _ in sorted_words[:3]]
+        self.recent_words = [word for word, _ in sorted_words[:3]]  # 取最近3个单词
 
     @property
     def level(self) -> float:
@@ -139,10 +145,7 @@ class UserVocabularyLevel(VocabularyLevelEstimator):
 
 
 class ArticleVocabularyLevel(VocabularyLevelEstimator):
-    """
-    文章词汇水平评估
-    根据文章中出现的最高难度单词评估文章词汇水平
-    """
+    """文章词汇水平评估"""
 
     def __init__(self, content: str):
         """
@@ -150,19 +153,19 @@ class ArticleVocabularyLevel(VocabularyLevelEstimator):
 
         参数:
         content: 文章内容字符串
-
-        异常:
-        ValueError: 当内容为空或不是字符串时抛出
         """
         if not content or not isinstance(content, str):
-            raise ValueError("文章内容必须是非空字符串")
+            self.content, self.top_levels = '', []
+            return
 
-        self.content = content
-        # 提取所有单词并计算难度
+        self.content = content  # word_frequency() reads this attribute
         words = WORD_PATTERN.findall(content.lower())
+
+        # 计算单词难度并筛选有效值
         word_levels = [self.get_word_level(word) for word in words]
-        # 筛选有效难度并排序
         valid_levels = sorted([lvl for lvl in word_levels if lvl > 0], reverse=True)
+
+        # 取难度最高的5个单词
         self.top_levels = valid_levels[:5] if valid_levels else []
 
     @property
@@ -176,4 +179,18 @@ class ArticleVocabularyLevel(VocabularyLevelEstimator):
         """
         if not self.top_levels:
             return 0
-        return sum(self.top_levels) / len(self.top_levels)
\ No newline at end of file
+        return sum(self.top_levels) / len(self.top_levels)
+
+    def word_frequency(self, top_n=10) -> Dict[str, int]:
+        """
+        Count how often each word occurs in the article text.
+
+        Args:
+        top_n: number of most frequent words to return.
+
+        Returns:
+        The top_n most frequent lower-cased words mapped to their counts.
+        """
+        # NOTE(review): relies on __init__ having stored self.content — verify.
+        ranked = Counter(WORD_PATTERN.findall(self.content.lower()))
+        return {token: hits for token, hits in ranked.most_common(top_n)}