请老师看我们的vocabulary.py文件 #195
|
@ -0,0 +1,91 @@
|
||||||
|
'''
|
||||||
|
Estimate a user's vocabulary level given his vocabulary data
|
||||||
|
Estimate an English article's difficulty level given its content
|
||||||
|
Preliminary design
|
||||||
|
|
||||||
|
Hui, 2024-09-23
|
||||||
|
Last updated: 2024-09-25, 2024-09-30
|
||||||
|
'''
|
||||||
|
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
import nltk
|
||||||
|
|
||||||
|
# Difficulty score attached to each word-list source name.
# The estimators in this file take the largest score among a word's sources.
DIFFICULTY_MAPPING = dict(
    BBC=2,         # basic vocabulary
    CET4=3,        # CET Band 4 (College English)
    CET6=4,        # CET Band 6
    GRADUATE=5,    # postgraduate entrance exam vocabulary
    IELTS=6,       # IELTS
    OXFORD3000=4,  # Oxford 3000 core words
    OXFORD5000=7,  # Oxford 5000 words
)
|
||||||
|
|
||||||
|
|
||||||
|
def load_record(pickle_fname):
    """Return the object stored in the pickle file *pickle_fname*.

    The file is opened in binary mode and is closed again before the
    function returns.

    NOTE: unpickling can execute arbitrary code, so only pass files that
    come from a trusted source.
    """
    with open(pickle_fname, 'rb') as stream:
        return pickle.load(stream)
|
||||||
|
|
||||||
|
|
||||||
|
class VocabularyLevelEstimator:
    """Base class that scores a list of words by average difficulty.

    Subclasses are expected to set ``self.word_lst`` (the words to score)
    before ``level`` is read; this class does not define it itself.
    """

    # Loaded once, when the class body is executed.
    _test = load_record('words_and_tests.p')  # map a word to the sources where it appears

    @property
    def level(self):
        """Return the mean difficulty of the words in ``self.word_lst``.

        Only words present in ``_test`` are counted.  A word's difficulty is
        the highest ``DIFFICULTY_MAPPING`` score among its sources; sources
        missing from the mapping score 0.

        Returns:
            The average as a float, or 0.0 when ``word_lst`` is empty or
            none of its words is present in ``_test``.
        """
        if not self.word_lst:  # nothing to score
            return 0.0

        scores = []
        for word in self.word_lst:
            if word not in self._test:
                continue
            # default=0 keeps a word with an empty source collection from
            # raising ValueError in max(); it is scored as 0 instead.
            scores.append(
                max(
                    (DIFFICULTY_MAPPING.get(src, 0) for src in self._test[word]),
                    default=0,
                )
            )

        return sum(scores) / len(scores) if scores else 0.0
|
||||||
|
|
||||||
|
|
||||||
|
class UserVocabularyLevel(VocabularyLevelEstimator):
    """Estimate a user's vocabulary level from the most recent words."""

    def __init__(self, d, recent_n=3):
        """Select the ``recent_n`` most recent words from *d*.

        Args:
            d: Mapping from a word to a sequence whose last element is used
                as the sort key (presumably time stamps, oldest first --
                TODO confirm against the code that writes the pickle).
            recent_n: Number of words to keep after sorting by that key in
                descending order.
        """
        self.d = d
        self.recent_n = recent_n
        # A word with an empty history has no last entry to sort on; skip it
        # rather than let d[word][-1] raise IndexError.
        dated_words = [word for word in d if len(d[word]) > 0]
        # Sort descending by the last entry, then keep the first recent_n.
        dated_words.sort(key=lambda word: d[word][-1], reverse=True)
        self.word_lst = dated_words[:recent_n]
|
||||||
|
|
||||||
|
|
||||||
|
class ArticleVocabularyLevel(VocabularyLevelEstimator):
    """Estimate an article's difficulty from its hardest words."""

    def __init__(self, content, top_n=10):
        """Tokenize *content* and keep its ``top_n`` most difficult words.

        Args:
            content: The article text.
            top_n: How many of the highest-scoring words to keep in
                ``word_lst`` (default 10).
        """
        import re
        from nltk.corpus import stopwords

        self.content = content
        self.top_n = top_n

        # Download the stop-word corpus only when it is missing, instead of
        # calling nltk.download() on every instantiation.
        try:
            stop_words = set(stopwords.words('english'))
        except LookupError:
            nltk.download('stopwords')
            stop_words = set(stopwords.words('english'))

        # Preprocess: lowercase, split into \w+ tokens (this drops
        # punctuation), then remove stop words.
        words = re.findall(r'\b\w+\b', content.lower())
        candidates = [word for word in words if word not in stop_words]

        # Keep the top_n highest-scoring words.
        # NOTE(review): repeated words are not de-duplicated, so one hard
        # word can fill several slots -- confirm this is intended.
        self.word_lst = sorted(
            candidates,
            key=self._get_difficulty,
            reverse=True,
        )[:top_n]

    def _get_difficulty(self, word):
        """Return the highest difficulty score among the sources of *word*.

        Words absent from ``_test`` score 0, as do sources missing from
        ``DIFFICULTY_MAPPING``.
        """
        if word not in self._test:
            return 0
        # default=0 avoids ValueError when a word has no sources.
        return max(
            (DIFFICULTY_MAPPING.get(src, 0) for src in self._test[word]),
            default=0,
        )
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Demo: print a user's record, the level estimated from it, and the
    # level of a short sample article.
    separator = "======================================================"

    record = load_record('frequency_mrlan85.pickle')
    print(record)
    print(separator)
    print(UserVocabularyLevel(record).level)  # level is a property
    print(separator)
    print(ArticleVocabularyLevel('This is an interesting article').level)
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue