EnglishPal/app/vocabulary.py

92 lines
2.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

'''
Estimate a user's vocabulary level given his vocabulary data
Estimate an English article's difficulty level given its content
Preliminary design
Hui, 2024-09-23
Last updated: 2024-09-25, 2024-09-30
'''
import pickle
import nltk
# Difficulty score assigned to each word source (word list / exam).
# A higher score means the source is treated as harder.
DIFFICULTY_MAPPING = {
    'BBC': 2,         # basic vocabulary
    'CET4': 3,        # CET-4 (College English Test, band 4)
    'CET6': 4,        # CET-6 (College English Test, band 6)
    'GRADUATE': 5,    # graduate entrance exam vocabulary
    'IELTS': 6,       # IELTS
    'OXFORD3000': 4,  # Oxford 3000 core words
    'OXFORD5000': 7,  # Oxford 5000 words
}
def load_record(pickle_fname):
    """Load and return the object stored in a pickle file.

    pickle_fname: path of the pickle file to read (opened in binary mode).

    Returns whatever object was pickled into the file.
    Raises OSError if the file cannot be opened, and the usual pickle
    errors if its content cannot be unpickled.
    """
    # SECURITY NOTE: unpickling can execute arbitrary code embedded in the
    # file. Only call this on pickle files produced by this application,
    # never on files uploaded or supplied by an untrusted party.
    with open(pickle_fname, 'rb') as f:
        return pickle.load(f)
class VocabularyLevelEstimator:
    """Base class that turns a list of words into an average difficulty level.

    This class does not set ``word_lst`` itself; ``level`` reads
    ``self.word_lst``, so a subclass has to assign it before ``level``
    is accessed.
    """

    # Maps a word to the sources where it appears.
    # NOTE: loaded once, when the class body is executed (i.e. at import
    # time), from a path relative to the current working directory.
    _test = load_record('words_and_tests.p')

    @property
    def level(self):
        """Return the mean difficulty of the known words in ``word_lst``.

        Each word found in ``_test`` is scored with the highest
        ``DIFFICULTY_MAPPING`` value among its sources (sources missing
        from the mapping count as 0). Words not found in ``_test`` are
        ignored. Returns 0.0 when ``word_lst`` is empty or none of its
        words are known.
        """
        if not self.word_lst:  # no usable words at all
            return 0.0

        scores = [
            # default=0 keeps a word with an empty source collection from
            # raising ValueError inside max().
            max((DIFFICULTY_MAPPING.get(src, 0) for src in self._test[word]),
                default=0)
            for word in self.word_lst
            if word in self._test
        ]
        if not scores:
            return 0.0
        return sum(scores) / len(scores)
class UserVocabularyLevel(VocabularyLevelEstimator):
    """Vocabulary level of a user, estimated from the most recent words."""

    def __init__(self, d, recent_n=3):
        """Select the ``recent_n`` most recent words from ``d``.

        d: mapping from a word to a sequence whose LAST element is used as
           the recency key (presumably timestamps in chronological order --
           TODO confirm against the code that writes this record).
        recent_n: how many of the most recent words to keep; values below
           zero are treated as 0.
        """
        self.d = d
        self.recent_n = recent_n

        # A word with an empty history has no last entry; skip it instead of
        # letting d[word][-1] raise IndexError.
        dated_words = [word for word in d if d[word]]

        # Sort by the last recorded entry, newest first, and keep the head.
        newest_first = sorted(
            dated_words,
            key=lambda word: d[word][-1],
            reverse=True,
        )
        # Clamp so a negative recent_n does not turn into "all but the last n".
        self.word_lst = newest_first[:max(recent_n, 0)]
class ArticleVocabularyLevel(VocabularyLevelEstimator):
    """Difficulty level of an article, estimated from its hardest words."""

    # Cache of the English stop-word set, filled on first use so the NLTK
    # corpus is read (and, if needed, downloaded) at most once per process.
    _stop_words = None

    def __init__(self, content, top_n=10):
        """Tokenize ``content`` and keep its ``top_n`` most difficult words.

        content: article text (a str).
        top_n: number of highest-difficulty words kept in ``word_lst``;
           defaults to 10, values below zero are treated as 0.
        """
        import re

        self.content = content

        # Pre-processing: lowercase, split into \w+ tokens (letters, digits
        # and underscores; punctuation is dropped), remove stop words.
        stop_words = self._load_stop_words()
        tokens = re.findall(r'\b\w+\b', content.lower())
        candidates = [word for word in tokens if word not in stop_words]

        # Keep the hardest words; sorted() is stable, so equally difficult
        # words keep their order of appearance in the article.
        ranked = sorted(candidates, key=self._get_difficulty, reverse=True)
        self.word_lst = ranked[:max(top_n, 0)]

    @classmethod
    def _load_stop_words(cls):
        """Return the English stop words, downloading the corpus only if missing."""
        if cls._stop_words is None:
            from nltk.corpus import stopwords
            try:
                words = stopwords.words('english')
            except LookupError:
                # Corpus not installed yet: fetch it once, then retry.
                nltk.download('stopwords')
                words = stopwords.words('english')
            cls._stop_words = frozenset(words)
        return cls._stop_words

    def _get_difficulty(self, word):
        """Return the highest difficulty among the word's sources, or 0.

        Unknown words, sources missing from DIFFICULTY_MAPPING, and words
        with an empty source collection all score 0.
        """
        if word not in self._test:
            return 0
        return max(
            (DIFFICULTY_MAPPING.get(src, 0) for src in self._test[word]),
            default=0,
        )
if __name__ == '__main__':
    # Manual smoke test: print a user's record and level, then the level
    # of a short sample article.
    separator = "======================================================"

    record = load_record('frequency_mrlan85.pickle')
    print(record)
    print(separator)

    user_estimate = UserVocabularyLevel(record)
    print(user_estimate.level)  # level is a property, not a method
    print(separator)

    article_estimate = ArticleVocabularyLevel('This is an interesting article')
    print(article_estimate.level)