code review

1.调整了一些方法和类的关系
2.优化了数据加载
3.修正了错误的文件路径
pull/197/head
王希涛 2025-06-06 21:00:29 +08:00
parent 52a9ca9677
commit 4a5fc9a7ce
1 changed files with 106 additions and 89 deletions

View File

@ -1,16 +1,34 @@
"""
词汇难度评估系统
功能:根据单词在不同考试中的出现情况评估其难度级别,并计算用户或文章的词汇水平。
"""
import re import re
import pickle import pickle
import os
from typing import Dict, List, Tuple, Union from typing import Dict, List, Tuple, Union
from collections import Counter
# 预编译正则表达式提高性能 # 预编译正则表达式提高性能
WORD_PATTERN = re.compile(r'\b[\w-]+\b') WORD_PATTERN = re.compile(r'\b[\w-]+\b')
class VocabularyLevelEstimator:
"""
词汇难度评估基类
使用预定义的单词-考试类型数据评估单词难度级别
类属性:
_test_raw: 原始单词-考试类型数据
_difficulty_dict: 转换后的单词-难度级别映射
"""
_test_raw = None
_difficulty_dict = None
PICKLE_PATH = 'static/words_and_tests.p' # 默认数据文件路径
@classmethod
def _load_data(cls):
    """Lazily load the vocabulary data and build the difficulty map.

    Returns immediately once ``_difficulty_dict`` has been populated.
    Otherwise the raw word -> exam-type data is taken from ``_test_raw`` if
    it is already set, or read from ``cls.PICKLE_PATH`` via ``load_record``,
    and then converted with ``convert_test_type_to_difficulty_level``.

    Raises:
        Whatever ``load_record`` or the conversion raises. In that case
        neither cache is modified, so a later call retries the load instead
        of finding a half-initialised class.
    """
    if cls._difficulty_dict is not None:
        return
    raw = cls._test_raw
    if raw is None:
        raw = cls.load_record(cls.PICKLE_PATH)
    levels = cls.convert_test_type_to_difficulty_level(raw)
    # Publish both caches only after every step succeeded; assigning
    # _test_raw first would make a failed conversion look like "loaded".
    cls._test_raw = raw
    cls._difficulty_dict = levels
@staticmethod
def load_record(pickle_fname: str) -> Dict[str, List[str]]:
    """Load the pickled word -> exam-type data.

    Args:
        pickle_fname: Path of the pickle file; it must end with ``.p``.

    Returns:
        The unpickled object, annotated as a mapping from each word to the
        list of exam names it appears in.

    Raises:
        FileNotFoundError: If ``pickle_fname`` does not exist.
        ValueError: If the file name does not end with ``.p`` or the file
            cannot be unpickled (corrupt, truncated or empty).
    """
    if not os.path.exists(pickle_fname):
        raise FileNotFoundError(f"词汇数据文件 {pickle_fname} 未找到")
    if not pickle_fname.endswith('.p'):
        # The message names the extension that is actually checked.
        raise ValueError("仅支持.p格式的pickle文件")
    # NOTE(security): pickle.load can execute arbitrary code; only point
    # this at trusted data files.
    try:
        with open(pickle_fname, 'rb') as f:
            return pickle.load(f)
    except (pickle.PickleError, EOFError, AttributeError,
            ImportError, IndexError) as e:
        # Unpickling damaged data raises more than PickleError (an empty
        # file gives EOFError), so all of them map to the documented error.
        raise ValueError(f"Pickle文件 {pickle_fname} 损坏: {str(e)}") from e
@staticmethod
def convert_test_type_to_difficulty_level(d: Dict[str, List[str]]) -> Dict[str, int]:
    """Map each word to a difficulty level derived from its exam types.

    Args:
        d: Mapping from a word to the exam names it appears in.

    Returns:
        Mapping from the lower-cased word to its level: 4 for CET4, 5 for
        OXFORD3000, 6 for CET6 or GRADUATE, 7 for IELTS or OXFORD5000,
        8 for BBC, and 0 when none of these exams is listed. When a word is
        listed under several exams the lowest level wins.

    Words that differ only by case share one entry. An unrated spelling
    never overwrites a rated one, and between two rated spellings the lower
    level is kept, so the result does not depend on the order of ``d``.
    """
    # Ordered from easiest to hardest; the first exam found decides.
    ranking = (
        ('CET4', 4),
        ('OXFORD3000', 5),
        ('CET6', 6),
        ('GRADUATE', 6),
        ('IELTS', 7),
        ('OXFORD5000', 7),
        ('BBC', 8),
    )
    result: Dict[str, int] = {}
    for word, test_types in d.items():
        level = next((lvl for exam, lvl in ranking if exam in test_types), 0)
        key = word.lower()
        known = result.get(key, 0)
        if known == 0 or 0 < level < known:
            result[key] = level
    return result
class VocabularyLevelEstimator:
"""
词汇难度评估基类
使用预定义的单词-考试类型数据评估单词难度级别
类属性:
_test_raw: 原始单词-考试类型数据
_difficulty_dict: 转换后的单词-难度级别映射
"""
_test_raw = None
_difficulty_dict = None
@classmethod
def _load_data(cls):
"""延迟加载数据,避免不必要的文件操作"""
if cls._test_raw is None:
cls._test_raw = load_record('words_and_tests.p')
cls._difficulty_dict = convert_test_type_to_difficulty_level(cls._test_raw)
@classmethod @classmethod
def get_word_level(cls, word: str) -> int: def get_word_level(cls, word: str) -> int:
""" """
@ -102,27 +103,32 @@ class VocabularyLevelEstimator:
单词的难度级别(0-8) 单词的难度级别(0-8)
""" """
cls._load_data() cls._load_data()
return cls._difficulty_dict.get(word, 0) return cls._difficulty_dict.get(word.lower(), 0)
@classmethod
def reload_data(cls, new_path=None):
    """Discard the cached vocabulary data and load it again.

    Args:
        new_path: Optional path of a different data file. When given (and
            truthy) it replaces ``cls.PICKLE_PATH`` before loading.

    Raises:
        Whatever ``_load_data`` raises. On failure the previous path and
        caches are restored, so a bad ``new_path`` does not wipe data that
        was already loaded.
    """
    previous = (cls.PICKLE_PATH, cls._test_raw, cls._difficulty_dict)
    if new_path:
        cls.PICKLE_PATH = new_path
    cls._test_raw = None
    cls._difficulty_dict = None
    try:
        cls._load_data()
    except Exception:
        # Roll back so the class keeps serving the last good data set.
        cls.PICKLE_PATH, cls._test_raw, cls._difficulty_dict = previous
        raise
class UserVocabularyLevel(VocabularyLevelEstimator): class UserVocabularyLevel(VocabularyLevelEstimator):
""" """用户词汇水平评估"""
用户词汇水平评估
根据用户最近查询的单词评估其词汇水平
"""
def __init__(self, d: Dict[str, List[int]]): def __init__(self, user_data: Dict[str, List[int]]):
""" """
初始化用户词汇数据 初始化用户词汇数据
参数: 参数:
d: 单词到时间戳列表的映射 user_data: 单词到时间戳列表的映射
""" """
self.d = d
# 获取每个单词的最新查询时间并排序 # 获取每个单词的最新查询时间并排序
word_time = [(word, max(times)) for word, times in d.items() if times] word_time = [(word, max(times)) for word, times in user_data.items() if times]
sorted_words = sorted(word_time, key=lambda x: x[1], reverse=True) sorted_words = sorted(word_time, key=lambda x: x[1], reverse=True)
self.recent_words = [word for word, _ in sorted_words[:3]] self.recent_words = [word for word, _ in sorted_words[:3]] # 取最近3个单词
@property @property
def level(self) -> float: def level(self) -> float:
@ -139,10 +145,7 @@ class UserVocabularyLevel(VocabularyLevelEstimator):
class ArticleVocabularyLevel(VocabularyLevelEstimator): class ArticleVocabularyLevel(VocabularyLevelEstimator):
""" """文章词汇水平评估"""
文章词汇水平评估
根据文章中出现的最高难度单词评估文章词汇水平
"""
def __init__(self, content: str):
    """Collect the levels of the hardest rated words in an article.

    Args:
        content: Article text. Empty or non-string input is accepted and
            produces an article with no rated words.

    Sets:
        content: The article text (``""`` for empty or non-string input),
            kept because ``word_frequency`` reads it.
        top_levels: Up to five highest non-zero word levels, in descending
            order.
    """
    if not content or not isinstance(content, str):
        self.content = ""
        self.top_levels = []
        return

    self.content = content
    # Lower-case first so tokens match the lower-cased lookup in
    # get_word_level.
    words = WORD_PATTERN.findall(content.lower())
    rated = [lvl for lvl in map(self.get_word_level, words) if lvl > 0]
    rated.sort(reverse=True)
    self.top_levels = rated[:5]
@property @property
@ -177,3 +180,17 @@ class ArticleVocabularyLevel(VocabularyLevelEstimator):
if not self.top_levels: if not self.top_levels:
return 0 return 0
return sum(self.top_levels) / len(self.top_levels) return sum(self.top_levels) / len(self.top_levels)
def word_frequency(self, top_n=10) -> Dict[str, int]:
    """Count how often each word occurs in the article.

    Args:
        top_n: Number of most frequent words to return.

    Returns:
        Mapping from lower-cased word to its count for the ``top_n`` most
        frequent words. Empty when the instance has no article text, which
        includes instances whose constructor did not store ``content``.
    """
    # Read defensively: content may be absent or empty on this instance.
    content = getattr(self, "content", None)
    if not content or not isinstance(content, str):
        return {}
    counts = Counter(WORD_PATTERN.findall(content.lower()))
    return dict(counts.most_common(top_n))