123 lines
3.4 KiB
Python
123 lines
3.4 KiB
Python
|
'''
|
|||
|
Estimate a user's vocabulary level given their vocabulary data
|
|||
|
Estimate an English article's difficulty level given its content
|
|||
|
Preliminary design
|
|||
|
|
|||
|
Hui, 2024-09-23
|
|||
|
Last updated: 2024-09-25, 2024-09-30
|
|||
|
'''
|
|||
|
|
|||
|
import pickle
|
|||
|
import re
|
|||
|
from collections import defaultdict
|
|||
|
|
|||
|
|
|||
|
def load_record(pickle_fname):
    """Return the object stored in the pickle file *pickle_fname*.

    The file is opened in binary mode and closed again before returning.

    NOTE(review): unpickling can execute arbitrary code, so this should only
    be pointed at files the application itself produced.
    """
    with open(pickle_fname, 'rb') as fh:
        return pickle.load(fh)
|
|||
|
|
|||
|
|
|||
|
class VocabularyLevelEstimator:
    """Base class that scores a word by the sources it is listed under.

    Both tables below are class attributes shared by every instance and
    subclass. ``_test`` is read from ``words_and_tests.p`` while the class
    body executes, i.e. as soon as this module is imported.
    """

    # Mapping from a word to the sources it appears in.
    _test = load_record('words_and_tests.p')

    # Mapping from a source name to its difficulty score.
    _source_levels = {
        'BBC': 1,
        'CET4': 2,
        'CET6': 3,
        'GRADUATE': 4,
        'OXFORD3000': 1,
        'TOEFL': 5,
        'IELTS': 5,
        'GRE': 7,
    }

    def get_word_level(self, word):
        """Return the difficulty score of *word*.

        The score is the highest level among the word's sources that appear
        in ``_source_levels``. A word that is not in ``_test``, or whose
        sources are all unrecognised, scores 0.
        """
        if word not in self._test:
            return 0  # unknown words have difficulty 0

        levels = self._source_levels
        # The hardest recognised source decides the score; ``default``
        # covers the case where none of the sources is recognised.
        return max(
            (levels[src] for src in self._test[word] if src in levels),
            default=0,
        )
|
|||
|
|
|||
|
|
|||
|
class UserVocabularyLevel(VocabularyLevelEstimator):
    """Estimate a user's vocabulary level from their most recent words.

    Args:
        d: Mapping from a word to a collection of timestamps; the largest
            timestamp of each word is used as its recency.
            (Presumably the per-user frequency record loaded from a pickle
            file -- confirm against the caller.)
        recent_count: How many of the most recent words to consider.

    Attributes:
        d: The mapping passed in, stored unchanged.
        word_lst: The ``recent_count`` most recent words, newest first.
            Words with no timestamps cannot be ranked by recency and are
            left out.
    """

    def __init__(self, d, recent_count=3):
        self.d = d
        # A word with an empty timestamp collection would make ``max`` raise
        # ValueError and abort construction, so such words are skipped.
        dated = [(word, stamps) for word, stamps in d.items() if stamps]
        # Newest first, judged by each word's latest timestamp.
        dated.sort(key=lambda item: max(item[1]), reverse=True)
        self.word_lst = [word for word, _ in dated[:recent_count]]

    @property
    def level(self):
        """Highest difficulty score among ``word_lst``; 0.0 if it is empty."""
        if not self.word_lst:
            return 0.0
        # The outer ``max`` keeps the original floor of 0.
        return max(0, max(self.get_word_level(word) for word in self.word_lst))
|
|||
|
|
|||
|
|
|||
|
class ArticleVocabularyLevel(VocabularyLevelEstimator):
    """Estimate an article's difficulty from the words it contains.

    Args:
        content: The article text.

    Attributes:
        content: The text passed in, stored unchanged.
        difficult_words: Difficulty scores of the top-ranked words, ordered
            by descending ``score * frequency``; empty when no word in the
            article has a positive score.
    """

    def __init__(self, content):
        self.content = content

        # Tokenise the lower-cased text. The pattern also matches hyphens,
        # digits and underscores, but only purely alphabetic tokens are kept.
        tokens = [
            token
            for token in re.findall(r'\b[\w-]+\b', content.lower())
            if token.isalpha()
        ]

        # Frequency of every distinct word, in first-occurrence order.
        occurrences = {}
        for token in tokens:
            occurrences[token] = occurrences.get(token, 0) + 1

        # Weighted score (score * frequency) for every word that has a
        # positive difficulty score; each distinct word is scored once.
        ranked = []
        for token, count in occurrences.items():
            score = self.get_word_level(token)
            if score > 0:
                ranked.append((score * count, score, token))

        # Highest weighted score first; ties fall back to the remaining
        # tuple fields (score, then word).
        ranked.sort(reverse=True)

        # Keep the top 20% of the words, but at least 5 and at most 15.
        # With no scored words the slice is empty, so the list is too.
        keep = max(5, min(15, len(ranked) // 5))
        self.difficult_words = [score for _, score, _ in ranked[:keep]]

    @property
    def level(self):
        """Highest score among ``difficult_words``; 0.0 if there are none."""
        return max(self.difficult_words) if self.difficult_words else 0.0
|
|||
|
|
|||
|
if __name__ == '__main__':
    # Small demo: print a stored user record and its estimated level, then
    # the estimated level of a short sample sentence.
    record = load_record('frequency_mrlan85.pickle')
    print(record)
    print(UserVocabularyLevel(record).level)  # level is a property
    print(ArticleVocabularyLevel('This is an interesting article').level)
|
|||
|
|
|||
|
|
|||
|
|