'''
   Estimate a user's vocabulary level given his vocabulary data
   Estimate an English article's difficulty level given its content

   Preliminary design

   Hui, 2024-09-23
   Last updated: 2024-09-25, 2024-09-30
'''

import logging
import pickle

logger = logging.getLogger(__name__)

# Divisor of the quadratic bonus added to the difficulty total in
# VocabularyLevelEstimator.level (bonus = valid_count ** 2 / _BONUS_DIVISOR).
_BONUS_DIVISOR = 100


def load_record(pickle_fname):
    """Load and return the object stored in the pickle file *pickle_fname*.

    SECURITY NOTE: ``pickle.load`` can execute arbitrary code while
    unpickling.  Only call this on files produced by a trusted source.

    Raises whatever ``open`` / ``pickle.load`` raise (e.g. ``OSError`` for a
    missing file, ``pickle.UnpicklingError`` for corrupt data).
    """
    with open(pickle_fname, 'rb') as f:
        return pickle.load(f)


# Hard-coded word -> difficulty table used as a stand-in for a real
# vocabulary-difficulty source.  Keys are matched exactly as written
# (including the ones that carry trailing punctuation).
_TEST_MOCK = {
    'simple': 2, 'apple': 1, 'happy': 2, 'open': 3, 'like': 2, 'work': 2, 'make': 2, 'money': 2,
    'source': 3, 'software': 3, 'successful': 4, 'project': 3, 'develop': 3, 'process': 3,
    'available': 4, 'organizations': 4,
    'extinct': 6, 'modification': 7, 'apparently': 7, 'abruptly': 7, 'rentable': 7, 'predictable:': 6,
    'pasture': 7, 'putrid': 7, 'frivolous': 8, 'sessile': 8, 'dearth': 7, 'presumptuous': 7,
    'fringe': 8, 'economics': 5, 'summarize': 5, 'stare': 5, 'eagerly': 5, 'completely': 4, 'maintained,': 5,
    'geological': 6, 'embryological': 7, 'coadaptation': 8, 'exterminated': 7, 'contingencies': 7,
    'intercrossing': 6, 'coleopterous': 8, 'marin': 5, 'organised': 5, 'monopoly': 8, 'inorganic': 7,
    'xyz': 0, '': 0
}


class VocabularyLevelEstimator:
    """Shared level computation for the user and article estimators.

    Subclasses must set ``self.word_lst`` (an iterable of words); reading
    ``level`` on an instance without it raises ``AttributeError``.
    """

    # Word -> difficulty lookup table consulted by ``level``.
    _test = _TEST_MOCK

    @property
    def level(self):
        """Return the estimated level for ``self.word_lst``.

        Only words that are keys of ``_test`` (exact match) are counted.
        The result is the sum of their difficulties, plus a bonus of
        ``valid_count ** 2 / 100`` when that sum is non-zero, divided by the
        number of counted words.  Returns ``0`` when no word is counted.
        """
        scores = [self._test[word] for word in self.word_lst if word in self._test]
        valid_count = len(scores)
        if valid_count == 0:
            return 0

        total = float(sum(scores))
        # Diagnostic only: kept out of stdout so reading the property has no
        # visible side effect for callers.
        logger.debug('valid_count: %s, total: %s', valid_count, total)

        # The bonus grows with the number of recognised words; it is skipped
        # when every recognised word has difficulty 0.
        if total != 0:
            total += (valid_count * valid_count) / _BONUS_DIVISOR
        return total / valid_count


class UserVocabularyLevel(VocabularyLevelEstimator):
    """Estimate a user's level from a mapping whose keys are words."""

    def __init__(self, d):
        """
        :param d: mapping keyed by word (the values are not used here);
                  ``None`` is treated as an empty vocabulary.
        """
        self.d = d
        self.word_lst = list(d.keys()) if d is not None else []
        # TODO: just look at the most recently-added words
        # (currently every key is used)


class ArticleVocabularyLevel(VocabularyLevelEstimator):
    """Estimate an article's difficulty from its text."""

    def __init__(self, content):
        """
        :param content: article text; it is lower-cased and split on
                        whitespace.  ``None`` or an empty string yields an
                        empty word list.
        """
        self.content = content
        self.word_lst = content.lower().split() if content else []
        # TODO: select the 10 most difficult words
        # (currently every token is used)


if __name__ == '__main__':
    # Demo run; expects the pickle file to exist in the working directory.
    d = load_record('frequency_mrlan85.pickle')
    print(d)
    user = UserVocabularyLevel(d)
    print(user.level)  # level is a property
    article = ArticleVocabularyLevel('This is an interesting article')
    print(article.level)