修复分母为 0 的问题,增加适当的异常处理
使用正则表达式处理标点符号;改进变量命名,提高代码可读性;定义常量替代魔术数字;添加输入验证和错误处理;移除冗余代码;优化性能
pull/208/head
commit
b229b88a3b
|
@ -1,152 +1,69 @@
|
||||||
'''
|
'''
|
||||||
Estimate a user's vocabulary level given his vocabulary data
|
Estimate a user's vocabulary level given his vocabulary data
|
||||||
Estimate an English article's difficulty level given its content
|
Estimate an English article's difficulty level given its content
|
||||||
Fixed: Compatibility with test cases while retaining optimizations
|
Preliminary design
|
||||||
Hui, 2024-09-23 (Last updated: 2025-06-04)
|
|
||||||
|
Hui, 2024-09-23
|
||||||
|
Last updated: 2024-09-25, 2024-09-30
|
||||||
'''
|
'''
|
||||||
|
|
||||||
import string
|
import pickle
|
||||||
from app.wordfreqCMD import remove_punctuation # 重用标点处理函数
|
|
||||||
import re
|
|
||||||
|
|
||||||
# ------------------------ 常量定义 ------------------------
|
|
||||||
VALID_COUNT_BONUS_FACTOR = 100 # 替代魔术数字100
|
|
||||||
MIN_VALID_WORDS = 1 # 最小有效词汇数
|
|
||||||
DEFAULT_DIFFICULTY = 3 # 默认难度(非零值)
|
|
||||||
|
|
||||||
# ------------------------ 测试数据 ------------------------
|
def load_record(pickle_fname):
|
||||||
_TEST_VOCAB = {
|
with open(pickle_fname, 'rb') as f:
|
||||||
|
d = pickle.load(f)
|
||||||
|
return d
|
||||||
|
|
||||||
|
|
||||||
|
_TEST_MOCK = {
|
||||||
'simple': 2, 'apple': 1, 'happy': 2, 'open': 3, 'like': 2, 'work': 2, 'make': 2, 'money': 2,
|
'simple': 2, 'apple': 1, 'happy': 2, 'open': 3, 'like': 2, 'work': 2, 'make': 2, 'money': 2,
|
||||||
'source': 3, 'software': 3, 'successful': 4, 'project': 3, 'develop': 3, 'process': 3,
|
'source': 3, 'software': 3, 'successful': 4, 'project': 3, 'develop': 3, 'process': 3,
|
||||||
'available': 4, 'organizations': 4,
|
'available': 4, 'organizations': 4,
|
||||||
'extinct': 6, 'modification': 7, 'apparently': 7, 'abruptly': 7, 'rentable': 7, 'predictable:': 6,
|
'extinct': 6, 'modification': 7, 'apparently': 7, 'abruptly': 7, 'rentable': 7, 'predictable:': 6,
|
||||||
'pasture': 7, 'putrid': 7, 'frivolous': 8, 'sessile': 8, 'dearth': 7, 'presumptuous': 7,
|
'pasture': 7, 'putrid': 7, 'frivolous': 8, 'sessile': 8, 'dearth': 7, 'presumptuous': 7,
|
||||||
'fringe': 8, 'economics': 5, 'summarize': 5, 'stare': 5, 'eagerly': 5, 'completely': 4, 'maintained,': 5,
|
'fringe': 8, 'economics': 5, 'summarize': 5, 'stare': 5, 'eagerly': 5, 'completely': 4, 'maintained,': 5,
|
||||||
'geological': 5, 'embryological': 7, 'coadaptation': 8, 'exterminated': 7, 'contingencies': 7,
|
'geological': 6, 'embryological': 7, 'coadaptation': 8, 'exterminated': 7, 'contingencies': 7,
|
||||||
'intercrossing': 6, 'coleopterous': 8, 'marin': 5, 'organised': 5, 'monopoly': 8, 'inorganic': 7,
|
'intercrossing': 6, 'coleopterous': 8, 'marin': 5, 'organised': 5, 'monopoly': 8, 'inorganic': 7,
|
||||||
'xyz': 0, '': 0
|
'xyz': 0, '': 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# ------------------------ 核心逻辑类 ------------------------
|
|
||||||
class VocabularyLevelEstimator:
|
class VocabularyLevelEstimator:
|
||||||
"""词汇水平评估基类"""
|
_test = _TEST_MOCK
|
||||||
|
|
||||||
def __init__(self):
|
@property
|
||||||
self._test = _TEST_VOCAB # 使用硬编码测试数据
|
def level(self):
|
||||||
|
|
||||||
def _calculate_level_base(self, word_list):
|
|
||||||
"""基础计算逻辑(处理通用验证和计算)"""
|
|
||||||
total = 0.0
|
total = 0.0
|
||||||
valid_count = 0
|
valid_count = 0
|
||||||
|
for word in self.word_lst:
|
||||||
for word in word_list:
|
if word in self._test:
|
||||||
# 仅过滤空字符串,保留其他单词(包括测试数据未收录的)
|
total += self._test[word]
|
||||||
if not word:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# 修改 _calculate_level_base 中的难度获取逻辑
|
|
||||||
difficulty = self._test.get(word.lower(), None) # 默认值改为 None
|
|
||||||
if difficulty is not None and difficulty > 0: # 仅当难度存在且大于0时计数
|
|
||||||
valid_count += 1
|
valid_count += 1
|
||||||
total += difficulty
|
# if valid_count >= 40: total += 10
|
||||||
elif difficulty is None: # 测试数据未收录的单词,不参与计算(默认不视为有效词)
|
print(f'valid_count: {valid_count}, total: {total}')
|
||||||
pass
|
if valid_count != 0 and total != 0: total += (valid_count * valid_count) / 100
|
||||||
|
return total / valid_count if valid_count > 0 else 0
|
||||||
# 输入验证:至少有一个有效词汇(非空单词)
|
|
||||||
if valid_count < MIN_VALID_WORDS:
|
|
||||||
return 0 # 返回0而不是抛出异常,以兼容测试用例
|
|
||||||
|
|
||||||
# 计算附加分(保留原始逻辑)
|
|
||||||
if total > 0:
|
|
||||||
total += (valid_count ** 2) / VALID_COUNT_BONUS_FACTOR
|
|
||||||
|
|
||||||
return total / valid_count
|
|
||||||
|
|
||||||
@property
|
|
||||||
def level(self):
|
|
||||||
"""计算词汇水平(需由子类提供word_list)"""
|
|
||||||
try:
|
|
||||||
return self._calculate_level_base(self.word_list)
|
|
||||||
except AttributeError:
|
|
||||||
raise NotImplementedError("子类需实现word_list属性")
|
|
||||||
|
|
||||||
|
|
||||||
# ------------------------ 用户词汇水平评估 ------------------------
|
|
||||||
class UserVocabularyLevel(VocabularyLevelEstimator):
|
class UserVocabularyLevel(VocabularyLevelEstimator):
|
||||||
"""根据用户词汇数据评估水平"""
|
def __init__(self, d):
|
||||||
|
self.d = d
|
||||||
def __init__(self, user_vocab_data):
|
self.word_lst = list(d.keys())
|
||||||
"""
|
# just look at the most recently-added words
|
||||||
:param user_vocab_data: 用户词汇数据(键:单词,值:任意数据)
|
|
||||||
"""
|
|
||||||
super().__init__()
|
|
||||||
# 提取非空单词(允许测试数据未收录的单词)
|
|
||||||
self.word_list = [word for word in user_vocab_data.keys() if word]
|
|
||||||
|
|
||||||
@property
|
|
||||||
def level(self):
|
|
||||||
"""重写计算逻辑:使用用户词汇列表"""
|
|
||||||
print(f"评估用户词汇(单词数:{len(self.word_list)})")
|
|
||||||
return super()._calculate_level_base(self.word_list)
|
|
||||||
|
|
||||||
|
|
||||||
# ------------------------ 文章难度评估 ------------------------
|
|
||||||
class ArticleVocabularyLevel(VocabularyLevelEstimator):
|
class ArticleVocabularyLevel(VocabularyLevelEstimator):
|
||||||
"""根据文章内容评估难度"""
|
|
||||||
|
|
||||||
def __init__(self, content):
|
def __init__(self, content):
|
||||||
"""
|
self.content = content
|
||||||
:param content: 文章内容文本
|
self.word_lst = content.lower().split()
|
||||||
"""
|
# select the 10 most difficult words
|
||||||
super().__init__()
|
|
||||||
self.clean_content = self._preprocess_content(content)
|
|
||||||
self.word_list = self._extract_key_words(self.clean_content)
|
|
||||||
|
|
||||||
def _preprocess_content(self, content):
|
|
||||||
"""文本预处理:去标点、转小写、提取纯字母单词"""
|
|
||||||
if not content:
|
|
||||||
return ""
|
|
||||||
# 先使用现有标点处理函数
|
|
||||||
processed = remove_punctuation(content)
|
|
||||||
# 再用正则表达式提取纯字母单词(\b 表示单词边界,确保单词仅由字母组成)
|
|
||||||
words = re.findall(r'\b[a-zA-Z]+\b', processed.lower())
|
|
||||||
return ' '.join(words) # 转换回字符串以便后续处理
|
|
||||||
|
|
||||||
def _extract_key_words(self, content):
|
|
||||||
"""提取关键单词(按难度排序取前10个)"""
|
|
||||||
words = [word for word in content.split() if word] # 保留非空单词
|
|
||||||
if not words:
|
|
||||||
return [] # 返回空列表而不是抛出异常
|
|
||||||
|
|
||||||
# 按难度排序(测试数据未收录的单词默认难度为DEFAULT_DIFFICULTY)
|
|
||||||
ranked = sorted(words, key=lambda w: self._test.get(w, DEFAULT_DIFFICULTY), reverse=True)
|
|
||||||
return ranked[:10] # 保留前10个最难单词
|
|
||||||
|
|
||||||
@property
|
|
||||||
def level(self):
|
|
||||||
"""重写计算逻辑:使用文章关键单词列表"""
|
|
||||||
print(f"评估文章难度(关键单词数:{len(self.word_list)})")
|
|
||||||
return super()._calculate_level_base(self.word_list)
|
|
||||||
|
|
||||||
|
|
||||||
# ------------------------ 示例运行 ------------------------
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# 模拟用户词汇数据(包含测试数据中的有效单词)
|
d = load_record('frequency_mrlan85.pickle')
|
||||||
user_vocab = {
|
print(d)
|
||||||
'apple': 5, # 测试数据中存在,难度1
|
user = UserVocabularyLevel(d)
|
||||||
'happy': 3, # 测试数据中存在,难度2
|
print(user.level) # level is a property
|
||||||
'successful': 2, # 测试数据中存在,难度4
|
article = ArticleVocabularyLevel('This is an interesting article')
|
||||||
'project': 1, # 测试数据中存在,难度3
|
print(article.level)
|
||||||
'new_word': 1 # 测试数据中不存在,默认难度3
|
|
||||||
}
|
|
||||||
|
|
||||||
user_estimator = UserVocabularyLevel(user_vocab)
|
|
||||||
user_level = user_estimator.level
|
|
||||||
print(f"用户词汇水平:{user_level:.2f}")
|
|
||||||
|
|
||||||
# 文章难度评估(包含新单词)
|
|
||||||
article_content = "This is a new article with unknown words."
|
|
||||||
article_estimator = ArticleVocabularyLevel(article_content)
|
|
||||||
article_level = article_estimator.level
|
|
||||||
print(f"文章难度等级:{article_level:.2f}")
|
|
||||||
|
|
Loading…
Reference in New Issue