修复分母为 0 的问题，增加适当的异常处理

使用正则表达式处理标点符号改进变量命名，提高代码可读性定义常量替代魔术数字添加输入验证和错误处理移除冗余代码优化性能
2025-06-04 23:11:24 +08:00 · 2025-06-04 23:11:24 +08:00 · b229b88a3b
parent f7fd0a0271 8d663546aa
commit b229b88a3b
1 changed files with 37 additions and 120 deletions
--- a/app/vocabulary.py
+++ b/app/vocabulary.py
@ -1,152 +1,69 @@
-'''
+''' 
   Estimate a user's vocabulary level given his vocabulary data
   Estimate an English article's difficulty level given its content
-   Fixed: Compatibility with test cases while retaining optimizations
-   Hui, 2024-09-23 (Last updated: 2025-06-04)
+   Preliminary design
+   
+   Hui, 2024-09-23
+   Last upated: 2024-09-25, 2024-09-30
 '''

-import string
-from app.wordfreqCMD import remove_punctuation  # 重用标点处理函数
-import re
+import pickle

-# ------------------------ 常量定义 ------------------------
-VALID_COUNT_BONUS_FACTOR = 100  # 替代魔术数字100
-MIN_VALID_WORDS = 1  # 最小有效词汇数
-DEFAULT_DIFFICULTY = 3  # 默认难度（非零值）

-# ------------------------ 测试数据 ------------------------
-_TEST_VOCAB = {
+def load_record(pickle_fname):
+    with open(pickle_fname, 'rb') as f:
+        d = pickle.load(f)
+    return d
+
+
+_TEST_MOCK = {
    'simple': 2, 'apple': 1, 'happy': 2, 'open': 3, 'like': 2, 'work': 2, 'make': 2, 'money': 2,
    'source': 3, 'software': 3, 'successful': 4, 'project': 3, 'develop': 3, 'process': 3,
    'available': 4, 'organizations': 4,
    'extinct': 6, 'modification': 7, 'apparently': 7, 'abruptly': 7, 'rentable': 7, 'predictable:': 6,
    'pasture': 7, 'putrid': 7, 'frivolous': 8, 'sessile': 8, 'dearth': 7, 'presumptuous': 7,
    'fringe': 8, 'economics': 5, 'summarize': 5, 'stare': 5, 'eagerly': 5, 'completely': 4, 'maintained,': 5,
-    'geological': 5, 'embryological': 7, 'coadaptation': 8, 'exterminated': 7, 'contingencies': 7,
+    'geological': 6, 'embryological': 7, 'coadaptation': 8, 'exterminated': 7, 'contingencies': 7,
    'intercrossing': 6, 'coleopterous': 8, 'marin': 5, 'organised': 5, 'monopoly': 8, 'inorganic': 7,
    'xyz': 0, '': 0
 }


-# ------------------------ 核心逻辑类 ------------------------
 class VocabularyLevelEstimator:
-    """词汇水平评估基类"""
+    _test = _TEST_MOCK

-    def __init__(self):
-        self._test = _TEST_VOCAB  # 使用硬编码测试数据
-
-    def _calculate_level_base(self, word_list):
-        """基础计算逻辑（处理通用验证和计算）"""
+    @property
+    def level(self):
        total = 0.0
        valid_count = 0
-
-        for word in word_list:
-            # 仅过滤空字符串，保留其他单词（包括测试数据未收录的）
-            if not word:
-                continue
-
-            # 修改 _calculate_level_base 中的难度获取逻辑
-            difficulty = self._test.get(word.lower(), None)  # 默认值改为 None
-            if difficulty is not None and difficulty > 0:  # 仅当难度存在且大于0时计数
+        for word in self.word_lst:
+            if word in self._test:
+                total += self._test[word]
                valid_count += 1
-                total += difficulty
-            elif difficulty is None:  # 测试数据未收录的单词，不参与计算（默认不视为有效词）
-                pass
-
-        # 输入验证：至少有一个有效词汇（非空单词）
-        if valid_count < MIN_VALID_WORDS:
-            return 0  # 返回0而不是抛出异常，以兼容测试用例
-
-        # 计算附加分（保留原始逻辑）
-        if total > 0:
-            total += (valid_count ** 2) / VALID_COUNT_BONUS_FACTOR
-
-        return total / valid_count
-
-    @property
-    def level(self):
-        """计算词汇水平（需由子类提供word_list）"""
-        try:
-            return self._calculate_level_base(self.word_list)
-        except AttributeError:
-            raise NotImplementedError("子类需实现word_list属性")
+        # if valid_count >= 40: total += 10
+        print(f'valid_count: {valid_count}, total: {total}')
+        if valid_count != 0 and total != 0: total += (valid_count * valid_count) / 100
+        return total / valid_count if valid_count > 0 else 0


-# ------------------------ 用户词汇水平评估 ------------------------
 class UserVocabularyLevel(VocabularyLevelEstimator):
-    """根据用户词汇数据评估水平"""
-
-    def __init__(self, user_vocab_data):
-        """
-        :param user_vocab_data: 用户词汇数据（键：单词，值：任意数据）
-        """
-        super().__init__()
-        # 提取非空单词（允许测试数据未收录的单词）
-        self.word_list = [word for word in user_vocab_data.keys() if word]
-
-    @property
-    def level(self):
-        """重写计算逻辑：使用用户词汇列表"""
-        print(f"评估用户词汇（单词数：{len(self.word_list)}）")
-        return super()._calculate_level_base(self.word_list)
+    def __init__(self, d):
+        self.d = d
+        self.word_lst = list(d.keys())
+        # just look at the most recently-added words


-# ------------------------ 文章难度评估 ------------------------
 class ArticleVocabularyLevel(VocabularyLevelEstimator):
-    """根据文章内容评估难度"""
-
    def __init__(self, content):
-        """
-        :param content: 文章内容文本
-        """
-        super().__init__()
-        self.clean_content = self._preprocess_content(content)
-        self.word_list = self._extract_key_words(self.clean_content)
-
-    def _preprocess_content(self, content):
-        """文本预处理：去标点、转小写、提取纯字母单词"""
-        if not content:
-            return ""
-        # 先使用现有标点处理函数
-        processed = remove_punctuation(content)
-        # 再用正则表达式提取纯字母单词（\b 表示单词边界，确保单词仅由字母组成）
-        words = re.findall(r'\b[a-zA-Z]+\b', processed.lower())
-        return ' '.join(words)  # 转换回字符串以便后续处理
-
-    def _extract_key_words(self, content):
-        """提取关键单词（按难度排序取前10个）"""
-        words = [word for word in content.split() if word]  # 保留非空单词
-        if not words:
-            return []  # 返回空列表而不是抛出异常
-
-        # 按难度排序（测试数据未收录的单词默认难度为DEFAULT_DIFFICULTY）
-        ranked = sorted(words, key=lambda w: self._test.get(w, DEFAULT_DIFFICULTY), reverse=True)
-        return ranked[:10]  # 保留前10个最难单词
-
-    @property
-    def level(self):
-        """重写计算逻辑：使用文章关键单词列表"""
-        print(f"评估文章难度（关键单词数：{len(self.word_list)}）")
-        return super()._calculate_level_base(self.word_list)
+        self.content = content
+        self.word_lst = content.lower().split()
+        # select the 10 most difficult words


-# ------------------------ 示例运行 ------------------------
 if __name__ == '__main__':
-    # 模拟用户词汇数据（包含测试数据中的有效单词）
-    user_vocab = {
-        'apple': 5,  # 测试数据中存在，难度1
-        'happy': 3,  # 测试数据中存在，难度2
-        'successful': 2,  # 测试数据中存在，难度4
-        'project': 1,  # 测试数据中存在，难度3
-        'new_word': 1  # 测试数据中不存在，默认难度3
-    }
-
-    user_estimator = UserVocabularyLevel(user_vocab)
-    user_level = user_estimator.level
-    print(f"用户词汇水平：{user_level:.2f}")
-
-    # 文章难度评估（包含新单词）
-    article_content = "This is a new article with unknown words."
-    article_estimator = ArticleVocabularyLevel(article_content)
-    article_level = article_estimator.level
-    print(f"文章难度等级：{article_level:.2f}")
+    d = load_record('frequency_mrlan85.pickle')
+    print(d)
+    user = UserVocabularyLevel(d)
+    print(user.level)  # level is a property
+    article = ArticleVocabularyLevel('This is an interesting article')
+    print(article.level)