Bug585-Wangxitao #197

Open
wangxitao wants to merge 4 commits from wangxitao/EnglishPal:Bug585-Wangxitao into Alpha-snapshot20240618
2 changed files with 196 additions and 0 deletions

BIN
app/static/wordfreqapp.db Normal file

Binary file not shown.

196
app/vocabulary.py Normal file
View File

@ -0,0 +1,196 @@
import re
import pickle
import os
from typing import Dict, List, Tuple, Union
from collections import Counter
# Precompiled regular expression, built once at import time so the tokenising
# calls below do not have to recompile it. It matches runs of word characters
# and hyphens delimited by word boundaries.
WORD_PATTERN = re.compile(r'\b[\w-]+\b')
class VocabularyLevelEstimator:
    """Base class that rates word difficulty from a word -> exam-type table.

    The table is read lazily from a pickle file the first time a level is
    requested and is then cached in class attributes.

    Class attributes:
        _test_raw: Raw word -> exam-type-list mapping as loaded from disk,
            or None while nothing has been loaded.
        _difficulty_dict: Lower-cased word -> difficulty level mapping derived
            from ``_test_raw``, or None while nothing has been loaded.
        PICKLE_PATH: Default location of the pickle data file.
    """

    _test_raw = None
    _difficulty_dict = None
    PICKLE_PATH = 'static/words_and_tests.p'  # default data file path

    @classmethod
    def _load_data(cls):
        """Load and convert the word table on first use; later calls are no-ops."""
        if cls._test_raw is None:
            # NOTE(review): these assignments bind the cache on ``cls``, i.e.
            # on whichever class the call came through, so a subclass that
            # triggers loading keeps its own copy -- confirm this is intended.
            cls._test_raw = cls.load_record(cls.PICKLE_PATH)
            cls._difficulty_dict = cls.convert_test_type_to_difficulty_level(cls._test_raw)

    @staticmethod
    def load_record(pickle_fname: str) -> Dict[str, List[str]]:
        """Load the word -> exam-type mapping stored in a pickle file.

        Args:
            pickle_fname: Path of the pickle file; it must end with ``.p``.

        Returns:
            The unpickled object, expected to map each word to a list of
            exam-type names.

        Raises:
            FileNotFoundError: If ``pickle_fname`` does not exist.
            ValueError: If the file name does not end with ``.p``, or if the
                file cannot be unpickled (corrupted, truncated or empty).
        """
        # Validate the path before touching the file.
        if not os.path.exists(pickle_fname):
            raise FileNotFoundError(f"词汇数据文件 {pickle_fname} 未找到")
        if not pickle_fname.endswith('.p'):
            raise ValueError("仅支持.pickle格式文件")

        try:
            with open(pickle_fname, 'rb') as f:
                # NOTE(security): unpickling can execute arbitrary code, so
                # this must only ever be pointed at trusted data files.
                return pickle.load(f)
        except (pickle.PickleError, EOFError, AttributeError,
                ImportError, IndexError) as e:
            # A damaged file does not always raise PickleError: an empty or
            # truncated file raises EOFError, and the other types are listed
            # in the pickle documentation as possible unpickling failures.
            # Report them all as the documented ValueError, keeping the cause.
            raise ValueError(f"Pickle文件 {pickle_fname} 损坏: {str(e)}") from e

    @staticmethod
    def convert_test_type_to_difficulty_level(d: Dict[str, List[str]]) -> Dict[str, int]:
        """Map every word to a difficulty level based on its exam types.

        Levels, checked in this order (the first match wins):
            4: CET4
            5: OXFORD3000
            6: CET6 or GRADUATE
            7: IELTS or OXFORD5000
            8: BBC
            0: none of the above (unknown / unclassified)

        Args:
            d: Mapping of word to the exam types it appears in.

        Returns:
            Mapping of lower-cased word to difficulty level. Words that differ
            only by case share one key, so the one iterated last wins.
        """
        result = {}
        for word, test_types in d.items():
            word_lower = word.lower()  # normalise keys to lower case
            if 'CET4' in test_types:
                result[word_lower] = 4
            elif 'OXFORD3000' in test_types:
                result[word_lower] = 5
            elif 'CET6' in test_types or 'GRADUATE' in test_types:
                result[word_lower] = 6
            elif 'IELTS' in test_types or 'OXFORD5000' in test_types:
                result[word_lower] = 7
            elif 'BBC' in test_types:
                result[word_lower] = 8
            else:
                result[word_lower] = 0
        return result

    @classmethod
    def get_word_level(cls, word: str) -> int:
        """Return the difficulty level of ``word``.

        Args:
            word: Word to look up; matching is case-insensitive.

        Returns:
            The word's difficulty level, or 0 when the word is not in the table.
        """
        cls._load_data()
        return cls._difficulty_dict.get(word.lower(), 0)

    @classmethod
    def reload_data(cls, new_path=None):
        """Discard the cached table and load it again.

        Args:
            new_path: Optional path that replaces ``PICKLE_PATH`` before
                reloading; when falsy the current path is reused.
        """
        if new_path:
            cls.PICKLE_PATH = new_path
        cls._test_raw = None
        cls._difficulty_dict = None
        cls._load_data()
class UserVocabularyLevel(VocabularyLevelEstimator):
    """Estimate a user's vocabulary level from the words they looked up last."""

    def __init__(self, user_data: Dict[str, List[int]]):
        """Pick the most recently looked-up words from the user's history.

        Args:
            user_data: Mapping of word to the list of timestamps at which it
                was looked up. Words with an empty list are ignored.
        """
        # Pair each word with its latest lookup time.
        latest_lookups = []
        for word, times in user_data.items():
            if times:
                latest_lookups.append((word, max(times)))

        # Newest first; only the three most recent words are kept.
        latest_lookups.sort(key=lambda pair: pair[1], reverse=True)
        self.recent_words = [pair[0] for pair in latest_lookups[:3]]

    @property
    def level(self) -> float:
        """Average difficulty of the recent words that have a known level.

        Returns:
            The mean level of the recent words whose level is above 0, or 0
            when none of them has a known level.
        """
        known_levels = []
        for word in self.recent_words:
            word_level = self.get_word_level(word)
            if word_level > 0:
                known_levels.append(word_level)

        if not known_levels:
            return 0
        return sum(known_levels) / len(known_levels)
class ArticleVocabularyLevel(VocabularyLevelEstimator):
    """Estimate the vocabulary level of an article."""

    def __init__(self, content: str):
        """Tokenise the article and record the levels of its hardest words.

        Args:
            content: Article text. Anything that is empty or not a string is
                treated as an article with no words.
        """
        if not content or not isinstance(content, str):
            # Still define ``content`` so word_frequency() works on this
            # instance instead of failing on a missing attribute.
            self.content = ''
            self.top_levels = []
            return

        # Keep the text: word_frequency() tokenises it again on demand.
        self.content = content

        # Preprocess: lower-case the text and extract the words.
        words = WORD_PATTERN.findall(content.lower())

        # Look up each token and keep only known levels, hardest first. Every
        # occurrence counts, so a repeated word can take several slots.
        word_levels = [self.get_word_level(word) for word in words]
        valid_levels = sorted([lvl for lvl in word_levels if lvl > 0], reverse=True)

        # Keep the five highest levels (fewer if the article has fewer).
        self.top_levels = valid_levels[:5]

    @property
    def level(self) -> float:
        """Average difficulty of the article's hardest words.

        Returns:
            The mean of the (up to five) highest word levels found in the
            article, or 0 when no word has a known level.
        """
        if not self.top_levels:
            return 0
        return sum(self.top_levels) / len(self.top_levels)

    def word_frequency(self, top_n=10) -> Dict[str, int]:
        """Count how often each word occurs in the article.

        Args:
            top_n: Number of most frequent words to return.

        Returns:
            Mapping of the ``top_n`` most frequent lower-cased words to their
            occurrence counts; empty when the article has no words.
        """
        words = WORD_PATTERN.findall(self.content.lower())
        word_freq = Counter(words)
        return dict(word_freq.most_common(top_n))