EnglishPal/app/vocabulary.py

123 lines
3.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

'''
Estimate a user's vocabulary level given his vocabulary data
Estimate an English article's difficulty level given its content
Preliminary design
Hui, 2024-09-23
Last updated: 2024-09-25, 2024-09-30
'''
import pickle
import re
from collections import defaultdict
def load_record(pickle_fname):
    """Read a pickle file and return the object stored in it.

    Args:
        pickle_fname: Path of the pickle file to open (read in binary mode).

    Returns:
        The object reconstructed by ``pickle.load``.

    Note:
        Unpickling can execute arbitrary code; only pass files that come
        from a trusted source.
    """
    with open(pickle_fname, 'rb') as stream:
        return pickle.load(stream)
class VocabularyLevelEstimator:
    """Base class that scores a word by the sources it is listed under.

    The word table is loaded from ``words_and_tests.p`` when the class body
    is executed, and is shared by every instance and subclass.
    """

    # Mapping from a word to the sources it appears in.
    _test = load_record('words_and_tests.p')

    # Mapping from a source name to its difficulty score.
    _source_levels = {
        'BBC': 1,
        'CET4': 2,
        'CET6': 3,
        'GRADUATE': 4,
        'OXFORD3000': 1,
        'TOEFL': 5,
        'IELTS': 5,
        'GRE': 7
    }

    def get_word_level(self, word):
        """Return the difficulty score of ``word``.

        Args:
            word: The word to look up in the word table.

        Returns:
            The highest score among the word's sources that have an entry in
            ``_source_levels``; 0 when the word is unknown or none of its
            sources has a score.
        """
        # Unknown words have difficulty 0.
        if word not in self._test:
            return 0
        scores = [
            self._source_levels[source]
            for source in self._test[word]
            if source in self._source_levels
        ]
        # Use the highest score; fall back to 0 when no source is recognised.
        return max(scores, default=0)
class UserVocabularyLevel(VocabularyLevelEstimator):
    """Estimate a user's vocabulary level from the most recent words.

    Args:
        d: Mapping from a word to a collection of mutually comparable
            values (timestamps, according to the original comments). The
            largest value of each collection decides how recent the word is.
        recent_count: How many of the most recent words to keep (default 3).

    Attributes set:
        d: The mapping passed in, stored unchanged.
        word_lst: The ``recent_count`` most recent words, newest first.
    """

    def __init__(self, d, recent_count=3):
        self.d = d

        def recency(item):
            stamps = item[1]
            # max() raises ValueError on an empty collection, which used to
            # abort construction. Words without any timestamp are ranked
            # behind every dated word instead; the tuple's first element
            # decides that without ever comparing against a timestamp.
            return (1, max(stamps)) if stamps else (0,)

        # Sort by timestamp, newest first.
        newest_first = sorted(d.items(), key=recency, reverse=True)
        # Keep only the most recent words (3 by default).
        self.word_lst = [word for word, _ in newest_first[:recent_count]]

    @property
    def level(self):
        """Highest difficulty score among the recent words.

        Returns:
            0.0 when there are no recent words; otherwise the largest value
            returned by ``get_word_level`` for those words, never below 0.
        """
        if not self.word_lst:
            return 0.0
        scores = [self.get_word_level(word) for word in self.word_lst]
        # Use the highest score, with 0 as the floor.
        return max(0, *scores)
class ArticleVocabularyLevel(VocabularyLevelEstimator):
    """Estimate the difficulty level of an article from its text.

    Args:
        content: The article text (a string).

    Attributes set:
        content: The text passed in, stored unchanged.
        difficult_words: Scores of the top-ranked words, ordered by
            descending (frequency * score).
    """

    def __init__(self, content):
        self.content = content
        # Tokenise the lower-cased text into runs of word characters/hyphens.
        tokens = re.findall(r'\b[\w-]+\b', content.lower())

        # Count each word and look up its score once.
        # NOTE(review): only purely alphabetic tokens are kept, so tokens
        # that contain a hyphen, digit or underscore are discarded here --
        # confirm this is intended given the hyphen-aware pattern above.
        counts = {}
        levels = {}
        for token in tokens:
            if not token.isalpha():
                continue
            counts[token] = counts.get(token, 0) + 1
            if token not in levels:
                levels[token] = self.get_word_level(token)

        # Weighted score is frequency * score; words scoring 0 are ignored.
        # Ranking is by descending (weighted score, score, word).
        ranked = sorted(
            (
                (level * counts[token], level, token)
                for token, level in levels.items()
                if level > 0
            ),
            reverse=True,
        )

        # Nothing scored: there are no difficult words.
        if not ranked:
            self.difficult_words = []
            return

        # Keep a fifth of the scored words, clamped to between 5 and 15.
        keep = max(5, min(15, len(ranked) // 5))
        self.difficult_words = [level for _, level, _ in ranked[:keep]]

    @property
    def level(self):
        """Highest score among ``difficult_words``, or 0.0 when it is empty."""
        return max(self.difficult_words) if self.difficult_words else 0.0
if __name__ == '__main__':
    # Manual smoke test: print a stored user record, that user's level,
    # and the level of a short sample article.
    record = load_record('frequency_mrlan85.pickle')
    print(record)
    print(UserVocabularyLevel(record).level)  # level is a property
    print(ArticleVocabularyLevel('This is an interesting article').level)