parent
52a9ca9677
commit
4a5fc9a7ce
|
@ -1,76 +1,13 @@
|
||||||
"""
|
|
||||||
词汇难度评估系统
|
|
||||||
功能:根据单词在不同考试中的出现情况评估其难度级别,并计算用户或文章的词汇水平
|
|
||||||
"""
|
|
||||||
|
|
||||||
import re
|
import re
|
||||||
import pickle
|
import pickle
|
||||||
|
import os
|
||||||
from typing import Dict, List, Tuple, Union
|
from typing import Dict, List, Tuple, Union
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
# 预编译正则表达式提高性能
|
# 预编译正则表达式提高性能
|
||||||
WORD_PATTERN = re.compile(r'\b[\w-]+\b')
|
WORD_PATTERN = re.compile(r'\b[\w-]+\b')
|
||||||
|
|
||||||
|
|
||||||
def load_record(pickle_fname: str) -> Dict[str, List[str]]:
    """Load the pickled word -> test-type mapping from disk.

    Args:
        pickle_fname: Path of the pickle file to read.

    Returns:
        The unpickled object. Callers treat it as a dict that maps each
        word to the list of test names the word appears in; the content
        itself is not validated here.

    Raises:
        FileNotFoundError: If ``pickle_fname`` does not exist.
        ValueError: If the file cannot be unpickled (corrupt, truncated or
            not a pickle at all).
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load data files that ship with the application or come from a
    # trusted source.
    try:
        with open(pickle_fname, 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError as exc:
        raise FileNotFoundError(f"Pickle文件 {pickle_fname} 未找到") from exc
    except (pickle.PickleError, EOFError, AttributeError,
            ImportError, IndexError) as exc:
        # A truncated/empty file raises EOFError rather than a PickleError,
        # and the pickle docs list the other types as possible unpickling
        # failures; all of them mean "this file is not usable".
        raise ValueError(f"Pickle文件 {pickle_fname} 损坏或格式不正确") from exc
|
|
||||||
|
|
||||||
|
|
||||||
def convert_test_type_to_difficulty_level(d: Dict[str, List[str]]) -> Dict[str, int]:
    """Translate each word's test memberships into a difficulty level.

    Levels (rules are tried in this order and the first match wins, so the
    easiest list a word belongs to decides its level):
        4: CET4
        5: OXFORD3000
        6: CET6 or GRADUATE
        7: IELTS or OXFORD5000
        8: BBC
        0: none of the above (unknown / unclassified)

    Args:
        d: Mapping from word to the collection of test names it appears in.

    Returns:
        Mapping with the same keys as ``d`` and the integer level as value.
    """
    # Ordered rule table: (level, test names that grant that level).
    rules = (
        (4, ('CET4',)),
        (5, ('OXFORD3000',)),
        (6, ('CET6', 'GRADUATE')),
        (7, ('IELTS', 'OXFORD5000')),
        (8, ('BBC',)),
    )
    result = {}
    for word, test_types in d.items():
        result[word] = next(
            (level for level, names in rules
             if any(name in test_types for name in names)),
            0,  # no known test lists this word
        )
    return result
|
|
||||||
|
|
||||||
|
|
||||||
class VocabularyLevelEstimator:
|
class VocabularyLevelEstimator:
|
||||||
"""
|
"""
|
||||||
词汇难度评估基类
|
词汇难度评估基类
|
||||||
|
@ -82,13 +19,77 @@ class VocabularyLevelEstimator:
|
||||||
"""
|
"""
|
||||||
_test_raw = None
|
_test_raw = None
|
||||||
_difficulty_dict = None
|
_difficulty_dict = None
|
||||||
|
PICKLE_PATH = 'static/words_and_tests.p' # 默认数据文件路径
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _load_data(cls):
|
def _load_data(cls):
|
||||||
"""延迟加载数据,避免不必要的文件操作"""
|
"""延迟加载数据,避免不必要的文件操作"""
|
||||||
if cls._test_raw is None:
|
if cls._test_raw is None:
|
||||||
cls._test_raw = load_record('words_and_tests.p')
|
cls._test_raw = cls.load_record(cls.PICKLE_PATH)
|
||||||
cls._difficulty_dict = convert_test_type_to_difficulty_level(cls._test_raw)
|
cls._difficulty_dict = cls.convert_test_type_to_difficulty_level(cls._test_raw)
|
||||||
|
|
||||||
|
@staticmethod
def load_record(pickle_fname: str) -> Dict[str, List[str]]:
    """Load the pickled word -> test-type mapping from disk.

    Args:
        pickle_fname: Path of the pickle file. It must exist and end in
            ``.p``, ``.pkl`` or ``.pickle``.

    Returns:
        The unpickled object. Callers treat it as a dict that maps each
        word to the list of test names the word appears in; the content
        itself is not validated here.

    Raises:
        FileNotFoundError: If ``pickle_fname`` does not exist.
        ValueError: If the file name has an unsupported suffix, or the file
            cannot be unpickled (corrupt, truncated or not a pickle).
    """
    # Validation sits outside the try block: these errors are raised on
    # purpose and must never be mistaken for unpickling failures.
    if not os.path.exists(pickle_fname):
        raise FileNotFoundError(f"词汇数据文件 {pickle_fname} 未找到")
    # The original check accepted only '.p' although the message talks about
    # '.pickle'; accept the common pickle suffixes so the two agree.
    if not pickle_fname.endswith(('.p', '.pkl', '.pickle')):
        raise ValueError("仅支持.pickle格式文件")

    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load data files that ship with the application or come from a
    # trusted source.
    try:
        with open(pickle_fname, 'rb') as f:
            return pickle.load(f)
    except (pickle.PickleError, EOFError, AttributeError,
            ImportError, IndexError) as e:
        # A truncated/empty file raises EOFError rather than a PickleError,
        # and the pickle docs list the other types as possible unpickling
        # failures; report all of them as the documented ValueError.
        raise ValueError(f"Pickle文件 {pickle_fname} 损坏: {str(e)}") from e
|
||||||
|
|
||||||
|
@staticmethod
def convert_test_type_to_difficulty_level(d: Dict[str, List[str]]) -> Dict[str, int]:
    """Translate each word's test memberships into a difficulty level.

    Levels (rules are tried in this order and the first match wins, so the
    easiest list a word belongs to decides its level):
        4: CET4
        5: OXFORD3000
        6: CET6 or GRADUATE
        7: IELTS or OXFORD5000
        8: BBC
        0: none of the above (unknown / unclassified)

    Keys are lower-cased so lookups can be case-insensitive. When several
    keys differ only by case they are merged: the easiest known level is
    kept, and 0 is used only if no variant is classified. This makes the
    result independent of the iteration order of ``d``.

    Args:
        d: Mapping from word to the collection of test names it appears in.

    Returns:
        Mapping from lower-cased word to its integer level.
    """
    # Ordered rule table: (level, test names that grant that level).
    rules = (
        (4, ('CET4',)),
        (5, ('OXFORD3000',)),
        (6, ('CET6', 'GRADUATE')),
        (7, ('IELTS', 'OXFORD5000')),
        (8, ('BBC',)),
    )
    result: Dict[str, int] = {}
    for word, test_types in d.items():
        level = next(
            (lvl for lvl, names in rules
             if any(name in test_types for name in names)),
            0,  # no known test lists this word
        )
        key = word.lower()
        # Merge with a case variant seen earlier: ignore unclassified (0)
        # entries when a real level is available, otherwise keep the lowest.
        known = [lvl for lvl in (result.get(key, 0), level) if lvl]
        result[key] = min(known, default=0)
    return result
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_word_level(cls, word: str) -> int:
|
def get_word_level(cls, word: str) -> int:
|
||||||
|
@ -102,27 +103,32 @@ class VocabularyLevelEstimator:
|
||||||
单词的难度级别(0-8)
|
单词的难度级别(0-8)
|
||||||
"""
|
"""
|
||||||
cls._load_data()
|
cls._load_data()
|
||||||
return cls._difficulty_dict.get(word, 0)
|
return cls._difficulty_dict.get(word.lower(), 0)
|
||||||
|
|
||||||
|
@classmethod
def reload_data(cls, new_path=None):
    """Discard the cached vocabulary data and load it again.

    Args:
        new_path: Optional path of a different data file. When truthy it
            replaces ``cls.PICKLE_PATH`` before loading; otherwise the
            current path is reused.

    Raises:
        Whatever ``cls._load_data()`` raises. In that case the previous
        path and cached data are restored, so a failed reload does not
        leave the estimator empty or pointing at an unusable file.
    """
    previous = (cls.PICKLE_PATH, cls._test_raw, cls._difficulty_dict)

    if new_path:
        cls.PICKLE_PATH = new_path
    # Clearing the caches is what makes _load_data() read the file again.
    cls._test_raw = None
    cls._difficulty_dict = None
    try:
        cls._load_data()
    except Exception:
        # Roll back to the last known-good state, then let the caller see
        # the original error.
        cls.PICKLE_PATH, cls._test_raw, cls._difficulty_dict = previous
        raise
|
||||||
|
|
||||||
|
|
||||||
class UserVocabularyLevel(VocabularyLevelEstimator):
|
class UserVocabularyLevel(VocabularyLevelEstimator):
|
||||||
"""
|
"""用户词汇水平评估"""
|
||||||
用户词汇水平评估
|
|
||||||
根据用户最近查询的单词评估其词汇水平
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, d: Dict[str, List[int]]):
|
def __init__(self, user_data: Dict[str, List[int]]):
|
||||||
"""
|
"""
|
||||||
初始化用户词汇数据
|
初始化用户词汇数据
|
||||||
|
|
||||||
参数:
|
参数:
|
||||||
d: 单词到时间戳列表的映射
|
user_data: 单词到时间戳列表的映射
|
||||||
"""
|
"""
|
||||||
self.d = d
|
|
||||||
# 获取每个单词的最新查询时间并排序
|
# 获取每个单词的最新查询时间并排序
|
||||||
word_time = [(word, max(times)) for word, times in d.items() if times]
|
word_time = [(word, max(times)) for word, times in user_data.items() if times]
|
||||||
sorted_words = sorted(word_time, key=lambda x: x[1], reverse=True)
|
sorted_words = sorted(word_time, key=lambda x: x[1], reverse=True)
|
||||||
self.recent_words = [word for word, _ in sorted_words[:3]]
|
self.recent_words = [word for word, _ in sorted_words[:3]] # 取最近3个单词
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def level(self) -> float:
|
def level(self) -> float:
|
||||||
|
@ -139,10 +145,7 @@ class UserVocabularyLevel(VocabularyLevelEstimator):
|
||||||
|
|
||||||
|
|
||||||
class ArticleVocabularyLevel(VocabularyLevelEstimator):
|
class ArticleVocabularyLevel(VocabularyLevelEstimator):
|
||||||
"""
|
"""文章词汇水平评估"""
|
||||||
文章词汇水平评估
|
|
||||||
根据文章中出现的最高难度单词评估文章词汇水平
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, content: str):
|
def __init__(self, content: str):
|
||||||
"""
|
"""
|
||||||
|
@ -150,19 +153,19 @@ class ArticleVocabularyLevel(VocabularyLevelEstimator):
|
||||||
|
|
||||||
参数:
|
参数:
|
||||||
content: 文章内容字符串
|
content: 文章内容字符串
|
||||||
|
|
||||||
异常:
|
|
||||||
ValueError: 当内容为空或不是字符串时抛出
|
|
||||||
"""
|
"""
|
||||||
if not content or not isinstance(content, str):
|
if not content or not isinstance(content, str):
|
||||||
raise ValueError("文章内容必须是非空字符串")
|
self.top_levels = []
|
||||||
|
return
|
||||||
|
|
||||||
self.content = content
|
# 文本预处理:转换为小写并提取单词
|
||||||
# 提取所有单词并计算难度
|
|
||||||
words = WORD_PATTERN.findall(content.lower())
|
words = WORD_PATTERN.findall(content.lower())
|
||||||
|
|
||||||
|
# 计算单词难度并筛选有效值
|
||||||
word_levels = [self.get_word_level(word) for word in words]
|
word_levels = [self.get_word_level(word) for word in words]
|
||||||
# 筛选有效难度并排序
|
|
||||||
valid_levels = sorted([lvl for lvl in word_levels if lvl > 0], reverse=True)
|
valid_levels = sorted([lvl for lvl in word_levels if lvl > 0], reverse=True)
|
||||||
|
|
||||||
|
# 取难度最高的5个单词
|
||||||
self.top_levels = valid_levels[:5] if valid_levels else []
|
self.top_levels = valid_levels[:5] if valid_levels else []
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -176,4 +179,18 @@ class ArticleVocabularyLevel(VocabularyLevelEstimator):
|
||||||
"""
|
"""
|
||||||
if not self.top_levels:
|
if not self.top_levels:
|
||||||
return 0
|
return 0
|
||||||
return sum(self.top_levels) / len(self.top_levels)
|
return sum(self.top_levels) / len(self.top_levels)
|
||||||
|
|
||||||
|
def word_frequency(self, top_n=10) -> Dict[str, int]:
    """Count how often each word occurs in the article.

    Args:
        top_n: Number of most frequent words to return.

    Returns:
        Dict mapping the ``top_n`` most frequent lower-cased words to their
        occurrence counts, most frequent first. Empty when the instance
        holds no article text.
    """
    # The constructor's early return for empty or non-string content never
    # assigns self.content, so a plain attribute access would raise
    # AttributeError here. Treat missing or empty text as "no words".
    # NOTE(review): this relies on the constructor storing non-empty content
    # in self.content; confirm that it does.
    text = getattr(self, 'content', None)
    if not isinstance(text, str) or not text:
        return {}

    words = WORD_PATTERN.findall(text.lower())
    return dict(Counter(words).most_common(top_n))
|
||||||
|
|
Loading…
Reference in New Issue