code review

1.调整了一些方法和类的关系
2.优化了数据加载
3.修正了错误的文件路径
pull/197/head
王希涛 2025-06-06 21:00:29 +08:00
parent 52a9ca9677
commit 4a5fc9a7ce
1 changed files with 106 additions and 89 deletions

View File

@ -1,76 +1,13 @@
"""
词汇难度评估系统
功能根据单词在不同考试中的出现情况评估其难度级别并计算用户或文章的词汇水平
"""
import re import re
import pickle import pickle
import os
from typing import Dict, List, Tuple, Union from typing import Dict, List, Tuple, Union
from collections import Counter
# 预编译正则表达式提高性能 # 预编译正则表达式提高性能
WORD_PATTERN = re.compile(r'\b[\w-]+\b') WORD_PATTERN = re.compile(r'\b[\w-]+\b')
def load_record(pickle_fname: str) -> Dict[str, List[str]]:
    """
    Load the word -> exam-type mapping stored in a pickle file.

    Args:
        pickle_fname: Path of the pickle file to read.

    Returns:
        The unpickled object. Callers treat it as a dict mapping each word
        to the list of exam names it appears in; the shape is not validated
        here.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the file cannot be unpickled (corrupt, truncated
            or empty).
    """
    # NOTE(security): unpickling can execute arbitrary code. Only point this
    # at data files that ship with the application, never at user uploads.
    try:
        with open(pickle_fname, 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError as e:
        raise FileNotFoundError(f"Pickle文件 {pickle_fname} 未找到") from e
    except (pickle.PickleError, EOFError) as e:
        # An empty or truncated file raises EOFError, which is not a
        # PickleError subclass; translate it too so the documented
        # ValueError contract holds.
        raise ValueError(f"Pickle文件 {pickle_fname} 损坏或格式不正确") from e
def convert_test_type_to_difficulty_level(d: Dict[str, List[str]]) -> Dict[str, int]:
    """
    Translate each word's exam memberships into a single difficulty level.

    Levels:
        0: unknown / unclassified
        4: CET4
        5: OXFORD3000
        6: CET6 or GRADUATE
        7: IELTS or OXFORD5000
        8: BBC

    When a word belongs to several exams, the first matching row of the
    table below wins, i.e. the lowest level.

    Args:
        d: Mapping from word to the list of exam names it appears in.

    Returns:
        Mapping from the same words (keys unchanged) to their level.
    """
    # Rows are ordered by priority; the first row with a matching exam wins.
    priority_table = (
        (('CET4',), 4),
        (('OXFORD3000',), 5),
        (('CET6', 'GRADUATE'), 6),
        (('IELTS', 'OXFORD5000'), 7),
        (('BBC',), 8),
    )

    levels = {}
    for word, exams in d.items():
        levels[word] = next(
            (
                level
                for names, level in priority_table
                if any(name in exams for name in names)
            ),
            0,
        )
    return levels
class VocabularyLevelEstimator: class VocabularyLevelEstimator:
""" """
词汇难度评估基类 词汇难度评估基类
@ -82,13 +19,77 @@ class VocabularyLevelEstimator:
""" """
_test_raw = None _test_raw = None
_difficulty_dict = None _difficulty_dict = None
PICKLE_PATH = 'static/words_and_tests.p' # 默认数据文件路径
@classmethod
def _load_data(cls):
    """Load the vocabulary data on first use and cache it on the class.

    Reads the pickle at ``cls.PICKLE_PATH`` through ``cls.load_record`` and
    derives the difficulty table through
    ``cls.convert_test_type_to_difficulty_level``. Does nothing when both
    caches are already populated.

    Raises:
        Whatever ``load_record`` or the conversion raises; in that case
        neither cache is modified, so the next call retries the load.
    """
    if cls._test_raw is None or cls._difficulty_dict is None:
        raw = cls.load_record(cls.PICKLE_PATH)
        levels = cls.convert_test_type_to_difficulty_level(raw)
        # Publish both caches only once both steps succeeded. Assigning
        # _test_raw first would leave _difficulty_dict as None after a failed
        # conversion, and every later lookup would crash on None.get().
        cls._test_raw = raw
        cls._difficulty_dict = levels
@staticmethod
def load_record(pickle_fname: str) -> Dict[str, List[str]]:
    """
    Load the word -> exam-type mapping stored in a pickle file.

    Args:
        pickle_fname: Path of the pickle file (``str`` or path-like). The
            name must end in ``.p``, ``.pkl`` or ``.pickle``.

    Returns:
        The unpickled object. Callers treat it as a dict mapping each word
        to the list of exam names it appears in; the shape is not validated
        here.

    Raises:
        FileNotFoundError: If the path does not exist.
        ValueError: If the extension is not a supported pickle extension, or
            the file cannot be unpickled (corrupt, truncated or empty).
    """
    # Accept pathlib.Path and friends; str.endswith below needs a str.
    path = os.fspath(pickle_fname)

    # Validation sits outside the try block: these raises are deliberate and
    # must not pass through the unpickling error handler.
    if not os.path.exists(path):
        raise FileNotFoundError(f"词汇数据文件 {pickle_fname} 未找到")
    # The error message promises ".pickle" support, so accept it (and the
    # common ".pkl") alongside the ".p" extension used by the bundled data.
    if not path.endswith(('.p', '.pkl', '.pickle')):
        raise ValueError("仅支持.pickle格式文件")

    # NOTE(security): unpickling can execute arbitrary code. Only point this
    # at data files that ship with the application, never at user uploads.
    try:
        with open(path, 'rb') as f:
            return pickle.load(f)
    except (pickle.PickleError, EOFError) as e:
        # An empty or truncated file raises EOFError, which is not a
        # PickleError subclass.
        raise ValueError(f"Pickle文件 {pickle_fname} 损坏: {str(e)}") from e
@staticmethod
def convert_test_type_to_difficulty_level(d: Dict[str, List[str]]) -> Dict[str, int]:
    """
    Translate each word's exam memberships into a single difficulty level.

    Levels:
        0: unknown / unclassified
        4: CET4
        5: OXFORD3000
        6: CET6 or GRADUATE
        7: IELTS or OXFORD5000
        8: BBC

    Keys of the result are lower-cased. When a word belongs to several
    exams the lowest level wins. The same rule is applied when several
    spellings of a word differ only in case (e.g. "May" / "may"): the
    merged entry gets the lowest classified level among them, and 0 only if
    none of the spellings is classified.

    Args:
        d: Mapping from word to the list of exam names it appears in.

    Returns:
        Mapping from lower-cased word to its level.
    """
    # Rows are ordered by priority; the first row with a matching exam wins.
    priority_table = (
        (('CET4',), 4),
        (('OXFORD3000',), 5),
        (('CET6', 'GRADUATE'), 6),
        (('IELTS', 'OXFORD5000'), 7),
        (('BBC',), 8),
    )

    result = {}
    for word, test_types in d.items():
        level = next(
            (
                lvl
                for names, lvl in priority_table
                if any(name in test_types for name in names)
            ),
            0,
        )
        key = word.lower()
        # Blindly overwriting would make the outcome depend on dict order
        # and let an unclassified variant erase a real level. Merge instead,
        # ignoring the 0 ("unknown") sentinel.
        classified = [lvl for lvl in (result.get(key, 0), level) if lvl]
        result[key] = min(classified, default=0)
    return result
@classmethod @classmethod
def get_word_level(cls, word: str) -> int: def get_word_level(cls, word: str) -> int:
@ -102,27 +103,32 @@ class VocabularyLevelEstimator:
单词的难度级别(0-8) 单词的难度级别(0-8)
""" """
cls._load_data() cls._load_data()
return cls._difficulty_dict.get(word, 0) return cls._difficulty_dict.get(word.lower(), 0)
@classmethod
def reload_data(cls, new_path=None):
    """Force the vocabulary data to be loaded again.

    Args:
        new_path: Optional path of a different pickle file. When given (and
            truthy) it replaces ``cls.PICKLE_PATH`` before reloading.

    Raises:
        Whatever ``cls._load_data`` raises. On failure the previous path and
        cached data are restored, so the class stays usable with the old
        data instead of being left with empty caches.
    """
    previous = (cls.PICKLE_PATH, cls._test_raw, cls._difficulty_dict)

    if new_path:
        cls.PICKLE_PATH = new_path
    # Clearing the caches is what makes _load_data actually hit the file.
    cls._test_raw = None
    cls._difficulty_dict = None

    try:
        cls._load_data()
    except Exception:
        # Roll back: a bad path must not wipe data that was working.
        cls.PICKLE_PATH, cls._test_raw, cls._difficulty_dict = previous
        raise
class UserVocabularyLevel(VocabularyLevelEstimator): class UserVocabularyLevel(VocabularyLevelEstimator):
""" """用户词汇水平评估"""
用户词汇水平评估
根据用户最近查询的单词评估其词汇水平
"""
def __init__(self, user_data: Dict[str, List[int]]):
    """
    Record the words the user looked up most recently.

    Args:
        user_data: Mapping from word to the list of timestamps at which the
            user looked it up. Words with an empty list are ignored.
    """
    # NOTE(review): the pre-refactor constructor exposed the raw mapping as
    # `self.d`. It is kept as an alias so existing readers of that attribute
    # keep working; drop it once no caller depends on it.
    self.d = user_data

    # Latest lookup time per word, newest first. sorted() is stable, so words
    # with equal timestamps keep their order from the mapping.
    latest_lookup = [
        (word, max(times)) for word, times in user_data.items() if times
    ]
    latest_lookup.sort(key=lambda pair: pair[1], reverse=True)

    # Only the three most recently looked-up words are kept.
    self.recent_words = [word for word, _ in latest_lookup[:3]]
@property @property
def level(self) -> float: def level(self) -> float:
@ -139,10 +145,7 @@ class UserVocabularyLevel(VocabularyLevelEstimator):
class ArticleVocabularyLevel(VocabularyLevelEstimator): class ArticleVocabularyLevel(VocabularyLevelEstimator):
""" """文章词汇水平评估"""
文章词汇水平评估
根据文章中出现的最高难度单词评估文章词汇水平
"""
def __init__(self, content: str):
    """
    Analyse an article and keep the levels of its hardest words.

    Args:
        content: Article text. An empty or non-string value is accepted and
            yields an article with no rated words.

    Attributes set:
        content: The article text ('' when the input was empty or not a
            string). ``word_frequency`` reads this attribute.
        top_levels: Up to five difficulty levels, highest first, of the
            words whose level is greater than 0.
    """
    # `content` must exist on every path: word_frequency() reads it, and
    # leaving it unset makes that method raise AttributeError.
    self.content = content if isinstance(content, str) else ''

    if not self.content:
        self.top_levels = []
        return

    # Lower-case the text, then pull out the word tokens.
    words = WORD_PATTERN.findall(self.content.lower())

    # Rate every token; level 0 means "unknown" and is not counted.
    rated = sorted(
        (lvl for lvl in map(self.get_word_level, words) if lvl > 0),
        reverse=True,
    )

    # Keep the five hardest words (slicing an empty list gives []).
    self.top_levels = rated[:5]
@property @property
@ -176,4 +179,18 @@ class ArticleVocabularyLevel(VocabularyLevelEstimator):
""" """
if not self.top_levels: if not self.top_levels:
return 0 return 0
return sum(self.top_levels) / len(self.top_levels) return sum(self.top_levels) / len(self.top_levels)
def word_frequency(self, top_n=10) -> Dict[str, int]:
    """
    Count how often each word occurs in the article.

    Args:
        top_n: How many of the most frequent words to return.

    Returns:
        Dict of the ``top_n`` most frequent lower-cased words and their
        counts, most frequent first.
    """
    lowered = self.content.lower()
    tally = Counter()
    tally.update(match.group() for match in WORD_PATTERN.finditer(lowered))
    return dict(tally.most_common(top_n))