parent
							
								
									52a9ca9677
								
							
						
					
					
						commit
						4a5fc9a7ce
					
				|  | @ -1,76 +1,13 @@ | ||||||
| """ |  | ||||||
| 词汇难度评估系统 |  | ||||||
| 功能:根据单词在不同考试中的出现情况评估其难度级别,并计算用户或文章的词汇水平 |  | ||||||
| """ |  | ||||||
| 
 |  | ||||||
| import re | import re | ||||||
| import pickle | import pickle | ||||||
|  | import os | ||||||
| from typing import Dict, List, Tuple, Union | from typing import Dict, List, Tuple, Union | ||||||
|  | from collections import Counter | ||||||
| 
 | 
 | ||||||
| # 预编译正则表达式提高性能 | # 预编译正则表达式提高性能 | ||||||
| WORD_PATTERN = re.compile(r'\b[\w-]+\b') | WORD_PATTERN = re.compile(r'\b[\w-]+\b') | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def load_record(pickle_fname: str) -> Dict[str, List[str]]: |  | ||||||
|     """ |  | ||||||
|     加载pickle格式的单词-考试类型数据 |  | ||||||
| 
 |  | ||||||
|     参数: |  | ||||||
|     pickle_fname: pickle文件名 |  | ||||||
| 
 |  | ||||||
|     返回: |  | ||||||
|     字典格式的单词到考试类型列表的映射 |  | ||||||
| 
 |  | ||||||
|     异常: |  | ||||||
|     FileNotFoundError: 当文件不存在时抛出 |  | ||||||
|     ValueError: 当pickle文件损坏时抛出 |  | ||||||
|     """ |  | ||||||
|     try: |  | ||||||
|         with open(pickle_fname, 'rb') as f: |  | ||||||
|             return pickle.load(f) |  | ||||||
|     except FileNotFoundError: |  | ||||||
|         raise FileNotFoundError(f"Pickle文件 {pickle_fname} 未找到") |  | ||||||
|     except pickle.PickleError: |  | ||||||
|         raise ValueError(f"Pickle文件 {pickle_fname} 损坏或格式不正确") |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def convert_test_type_to_difficulty_level(d: Dict[str, List[str]]) -> Dict[str, int]: |  | ||||||
|     """ |  | ||||||
|     将考试类型映射为难度级别 |  | ||||||
| 
 |  | ||||||
|     难度级别定义: |  | ||||||
|     0: 未知/未分类 |  | ||||||
|     4: CET4 |  | ||||||
|     5: OXFORD3000 |  | ||||||
|     6: CET6或GRADUATE |  | ||||||
|     7: IELTS或OXFORD5000 |  | ||||||
|     8: BBC |  | ||||||
| 
 |  | ||||||
|     参数: |  | ||||||
|     d: 单词到考试类型列表的映射 |  | ||||||
| 
 |  | ||||||
|     返回: |  | ||||||
|     单词到难度级别的映射 |  | ||||||
|     """ |  | ||||||
|     result = {} |  | ||||||
|     for word, test_types in d.items(): |  | ||||||
|         if 'CET4' in test_types: |  | ||||||
|             result[word] = 4 |  | ||||||
|         elif 'OXFORD3000' in test_types: |  | ||||||
|             result[word] = 5 |  | ||||||
|         elif 'CET6' in test_types or 'GRADUATE' in test_types: |  | ||||||
|             result[word] = 6 |  | ||||||
|         elif 'IELTS' in test_types: |  | ||||||
|             result[word] = 7 |  | ||||||
|         elif 'OXFORD5000' in test_types: |  | ||||||
|             result[word] = 7 |  | ||||||
|         elif 'BBC' in test_types: |  | ||||||
|             result[word] = 8 |  | ||||||
|         else: |  | ||||||
|             result[word] = 0 |  | ||||||
|     return result |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| class VocabularyLevelEstimator: | class VocabularyLevelEstimator: | ||||||
|     """ |     """ | ||||||
|     词汇难度评估基类 |     词汇难度评估基类 | ||||||
|  | @ -82,13 +19,77 @@ class VocabularyLevelEstimator: | ||||||
|     """ |     """ | ||||||
|     _test_raw = None |     _test_raw = None | ||||||
|     _difficulty_dict = None |     _difficulty_dict = None | ||||||
|  |     PICKLE_PATH = 'static/words_and_tests.p'  # 默认数据文件路径 | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def _load_data(cls): |     def _load_data(cls): | ||||||
|         """延迟加载数据,避免不必要的文件操作""" |         """延迟加载数据,避免不必要的文件操作""" | ||||||
|         if cls._test_raw is None: |         if cls._test_raw is None: | ||||||
|             cls._test_raw = load_record('words_and_tests.p') |             cls._test_raw = cls.load_record(cls.PICKLE_PATH) | ||||||
|             cls._difficulty_dict = convert_test_type_to_difficulty_level(cls._test_raw) |             cls._difficulty_dict = cls.convert_test_type_to_difficulty_level(cls._test_raw) | ||||||
|  | 
 | ||||||
|  |     @staticmethod | ||||||
|  |     def load_record(pickle_fname: str) -> Dict[str, List[str]]: | ||||||
|  |         """ | ||||||
|  |         加载pickle格式的单词-考试类型数据 | ||||||
|  | 
 | ||||||
|  |         参数: | ||||||
|  |         pickle_fname: pickle文件名 | ||||||
|  | 
 | ||||||
|  |         返回: | ||||||
|  |         字典格式的单词到考试类型列表的映射 | ||||||
|  | 
 | ||||||
|  |         异常: | ||||||
|  |         FileNotFoundError: 当文件不存在时抛出 | ||||||
|  |         ValueError: 当pickle文件损坏时抛出 | ||||||
|  |         """ | ||||||
|  |         try: | ||||||
|  |             # 文件校验 | ||||||
|  |             if not os.path.exists(pickle_fname): | ||||||
|  |                 raise FileNotFoundError(f"词汇数据文件 {pickle_fname} 未找到") | ||||||
|  |             if not pickle_fname.endswith('.p'): | ||||||
|  |                 raise ValueError("仅支持.pickle格式文件") | ||||||
|  | 
 | ||||||
|  |             with open(pickle_fname, 'rb') as f: | ||||||
|  |                 return pickle.load(f) | ||||||
|  |         except pickle.PickleError as e: | ||||||
|  |             raise ValueError(f"Pickle文件 {pickle_fname} 损坏: {str(e)}") | ||||||
|  | 
 | ||||||
|  |     @staticmethod | ||||||
|  |     def convert_test_type_to_difficulty_level(d: Dict[str, List[str]]) -> Dict[str, int]: | ||||||
|  |         """ | ||||||
|  |         将考试类型映射为难度级别 | ||||||
|  | 
 | ||||||
|  |         难度级别定义: | ||||||
|  |         0: 未知/未分类 | ||||||
|  |         4: CET4 | ||||||
|  |         5: OXFORD3000 | ||||||
|  |         6: CET6或GRADUATE | ||||||
|  |         7: IELTS或OXFORD5000 | ||||||
|  |         8: BBC | ||||||
|  | 
 | ||||||
|  |         参数: | ||||||
|  |         d: 单词到考试类型列表的映射 | ||||||
|  | 
 | ||||||
|  |         返回: | ||||||
|  |         单词到难度级别的映射 | ||||||
|  |         """ | ||||||
|  |         result = {} | ||||||
|  |         for word, test_types in d.items(): | ||||||
|  |             word_lower = word.lower()  # 统一小写处理 | ||||||
|  |             if 'CET4' in test_types: | ||||||
|  |                 result[word_lower] = 4 | ||||||
|  |             elif 'OXFORD3000' in test_types: | ||||||
|  |                 result[word_lower] = 5 | ||||||
|  |             elif 'CET6' in test_types or 'GRADUATE' in test_types: | ||||||
|  |                 result[word_lower] = 6 | ||||||
|  |             elif 'IELTS' in test_types or 'OXFORD5000' in test_types: | ||||||
|  |                 result[word_lower] = 7 | ||||||
|  |             elif 'BBC' in test_types: | ||||||
|  |                 result[word_lower] = 8 | ||||||
|  |             else: | ||||||
|  |                 result[word_lower] = 0 | ||||||
|  |         return result | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def get_word_level(cls, word: str) -> int: |     def get_word_level(cls, word: str) -> int: | ||||||
|  | @ -102,27 +103,32 @@ class VocabularyLevelEstimator: | ||||||
|         单词的难度级别(0-8) |         单词的难度级别(0-8) | ||||||
|         """ |         """ | ||||||
|         cls._load_data() |         cls._load_data() | ||||||
|         return cls._difficulty_dict.get(word, 0) |         return cls._difficulty_dict.get(word.lower(), 0) | ||||||
|  | 
 | ||||||
|  |     @classmethod | ||||||
|  |     def reload_data(cls, new_path=None): | ||||||
|  |         """强制重新加载词汇数据""" | ||||||
|  |         if new_path: | ||||||
|  |             cls.PICKLE_PATH = new_path | ||||||
|  |         cls._test_raw = None | ||||||
|  |         cls._difficulty_dict = None | ||||||
|  |         cls._load_data() | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class UserVocabularyLevel(VocabularyLevelEstimator): | class UserVocabularyLevel(VocabularyLevelEstimator): | ||||||
|     """ |     """用户词汇水平评估""" | ||||||
|     用户词汇水平评估 |  | ||||||
|     根据用户最近查询的单词评估其词汇水平 |  | ||||||
|     """ |  | ||||||
| 
 | 
 | ||||||
|     def __init__(self, d: Dict[str, List[int]]): |     def __init__(self, user_data: Dict[str, List[int]]): | ||||||
|         """ |         """ | ||||||
|         初始化用户词汇数据 |         初始化用户词汇数据 | ||||||
| 
 | 
 | ||||||
|         参数: |         参数: | ||||||
|         d: 单词到时间戳列表的映射 |         user_data: 单词到时间戳列表的映射 | ||||||
|         """ |         """ | ||||||
|         self.d = d |  | ||||||
|         # 获取每个单词的最新查询时间并排序 |         # 获取每个单词的最新查询时间并排序 | ||||||
|         word_time = [(word, max(times)) for word, times in d.items() if times] |         word_time = [(word, max(times)) for word, times in user_data.items() if times] | ||||||
|         sorted_words = sorted(word_time, key=lambda x: x[1], reverse=True) |         sorted_words = sorted(word_time, key=lambda x: x[1], reverse=True) | ||||||
|         self.recent_words = [word for word, _ in sorted_words[:3]] |         self.recent_words = [word for word, _ in sorted_words[:3]]  # 取最近3个单词 | ||||||
| 
 | 
 | ||||||
|     @property |     @property | ||||||
|     def level(self) -> float: |     def level(self) -> float: | ||||||
|  | @ -139,10 +145,7 @@ class UserVocabularyLevel(VocabularyLevelEstimator): | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class ArticleVocabularyLevel(VocabularyLevelEstimator): | class ArticleVocabularyLevel(VocabularyLevelEstimator): | ||||||
|     """ |     """文章词汇水平评估""" | ||||||
|     文章词汇水平评估 |  | ||||||
|     根据文章中出现的最高难度单词评估文章词汇水平 |  | ||||||
|     """ |  | ||||||
| 
 | 
 | ||||||
|     def __init__(self, content: str): |     def __init__(self, content: str): | ||||||
|         """ |         """ | ||||||
|  | @ -150,19 +153,19 @@ class ArticleVocabularyLevel(VocabularyLevelEstimator): | ||||||
| 
 | 
 | ||||||
|         参数: |         参数: | ||||||
|         content: 文章内容字符串 |         content: 文章内容字符串 | ||||||
| 
 |  | ||||||
|         异常: |  | ||||||
|         ValueError: 当内容为空或不是字符串时抛出 |  | ||||||
|         """ |         """ | ||||||
|         if not content or not isinstance(content, str): |         if not content or not isinstance(content, str): | ||||||
|             raise ValueError("文章内容必须是非空字符串") |             self.top_levels = [] | ||||||
|  |             return | ||||||
| 
 | 
 | ||||||
|         self.content = content |         # 文本预处理:转换为小写并提取单词 | ||||||
|         # 提取所有单词并计算难度 |  | ||||||
|         words = WORD_PATTERN.findall(content.lower()) |         words = WORD_PATTERN.findall(content.lower()) | ||||||
|  | 
 | ||||||
|  |         # 计算单词难度并筛选有效值 | ||||||
|         word_levels = [self.get_word_level(word) for word in words] |         word_levels = [self.get_word_level(word) for word in words] | ||||||
|         # 筛选有效难度并排序 |  | ||||||
|         valid_levels = sorted([lvl for lvl in word_levels if lvl > 0], reverse=True) |         valid_levels = sorted([lvl for lvl in word_levels if lvl > 0], reverse=True) | ||||||
|  | 
 | ||||||
|  |         # 取难度最高的5个单词 | ||||||
|         self.top_levels = valid_levels[:5] if valid_levels else [] |         self.top_levels = valid_levels[:5] if valid_levels else [] | ||||||
| 
 | 
 | ||||||
|     @property |     @property | ||||||
|  | @ -176,4 +179,18 @@ class ArticleVocabularyLevel(VocabularyLevelEstimator): | ||||||
|         """ |         """ | ||||||
|         if not self.top_levels: |         if not self.top_levels: | ||||||
|             return 0 |             return 0 | ||||||
|         return sum(self.top_levels) / len(self.top_levels) |         return sum(self.top_levels) / len(self.top_levels) | ||||||
|  | 
 | ||||||
|  |     def word_frequency(self, top_n=10) -> Dict[str, int]: | ||||||
|  |         """ | ||||||
|  |         获取文章词频统计 | ||||||
|  | 
 | ||||||
|  |         参数: | ||||||
|  |         top_n: 返回的最高频单词数量 | ||||||
|  | 
 | ||||||
|  |         返回: | ||||||
|  |         词频最高的top_n个单词及其频率 | ||||||
|  |         """ | ||||||
|  |         words = WORD_PATTERN.findall(self.content.lower()) | ||||||
|  |         word_freq = Counter(words) | ||||||
|  |         return dict(word_freq.most_common(top_n)) | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue