Please review our vocabulary.py file #195

'''
   Estimate a user's vocabulary level given their vocabulary data
   Estimate an English article's difficulty level given its content
   Preliminary design

   Hui, 2024-09-23
   Last updated: 2024-09-25, 2024-09-30
'''

import pickle

import nltk

DIFFICULTY_MAPPING = {
    'BBC': 2,        # basic vocabulary
    'CET4': 3,       # CET-4 (college English)
    'CET6': 4,       # CET-6
    'GRADUATE': 5,   # postgraduate entrance exam vocabulary
    'IELTS': 6,      # IELTS
    'OXFORD3000': 4, # Oxford 3000 core words
    'OXFORD5000': 7  # Oxford 5000 words
}

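# --- Illustrative sketch (editor's addition, not part of the submitted file) ---
# The scoring below assumes 'words_and_tests.p' maps each word to the collection
# of test/word-list names it appears in; a word's difficulty is then the highest
# DIFFICULTY_MAPPING score among those sources. The sample data here is made up.
_EXAMPLE_WORD_SOURCES = {
    'comprehensive': {'CET6', 'IELTS'},  # hardest source wins: IELTS -> 6
    'cat': {'BBC'},                      # basic vocabulary -> 2
}

def _example_word_difficulty(word, sources_map=_EXAMPLE_WORD_SOURCES):
    # Highest score among the sources a word appears in; 0 for unknown words.
    sources = sources_map.get(word, ())
    return max((DIFFICULTY_MAPPING.get(src, 0) for src in sources), default=0)
# e.g. _example_word_difficulty('comprehensive') == 6
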
def load_record(pickle_fname):
    with open(pickle_fname, 'rb') as f:
        d = pickle.load(f)
    return d


class VocabularyLevelEstimator:
    _test = load_record('words_and_tests.p')  # map a word to the sources where it appears

    @property
    def level(self):
        if not self.word_lst:  # no valid words to score
            return 0.0  # default level for an empty word list
        total = 0.0
        valid_words = 0
        for word in self.word_lst:
            if word in self._test:
                sources = self._test[word]
                total += max(DIFFICULTY_MAPPING.get(src, 0) for src in sources)
                valid_words += 1
        return total / valid_words if valid_words > 0 else 0.0

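# --- Illustrative sketch (editor's addition, not part of the submitted file) ---
# level only needs self.word_lst, so any subclass that fills word_lst inherits
# the scoring; words missing from _test are skipped rather than counted as zero.
class _ExampleWordListLevel(VocabularyLevelEstimator):
    def __init__(self, word_lst):
        self.word_lst = word_lst
# e.g. _ExampleWordListLevel(['comprehensive', 'banana']).level averages the
# scores of whichever of those words appear in _test.
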
class UserVocabularyLevel(VocabularyLevelEstimator):
    def __init__(self, d, recent_n=3):
        self.d = d
        self.recent_n = recent_n
        # Sort words by timestamp in descending order and keep the recent_n most recent ones
        sorted_words = sorted(d.keys(), key=lambda word: d[word][-1], reverse=True)
        self.word_lst = sorted_words[:recent_n]

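# --- Illustrative sketch (editor's addition, not part of the submitted file) ---
# UserVocabularyLevel only relies on d[word][-1] being sortable, so the user
# record is assumed to look roughly like word -> [..., latest timestamp];
# the exact field layout and the values below are guesses for illustration.
def _example_user_level():
    fake_record = {
        'sophisticated': [1, '2024-09-25'],
        'cat': [3, '2024-09-01'],
        'paradigm': [1, '2024-09-30'],
        'dog': [2, '2024-08-15'],
    }
    user = UserVocabularyLevel(fake_record, recent_n=3)
    # word_lst holds the 3 most recently seen words:
    # ['paradigm', 'sophisticated', 'cat']; .level averages their difficulties.
    return user.level
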
class ArticleVocabularyLevel(VocabularyLevelEstimator):
    def __init__(self, content):
        self.content = content
        # Preprocess: tokenize, lowercase, strip punctuation, drop stopwords
        import re
        from nltk.corpus import stopwords
        nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))
        words = re.findall(r'\b\w+\b', content.lower())
        self.word_lst = [word for word in words if word not in stop_words]
        # Keep only the 10 hardest words, ranked by difficulty score
        self.word_lst = sorted(
            self.word_lst,
            key=lambda w: self._get_difficulty(w),
            reverse=True
        )[:10]

    def _get_difficulty(self, word):
        if word in self._test:
            return max(DIFFICULTY_MAPPING.get(src, 0) for src in self._test[word])
        return 0

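# --- Possible refinement (editor's sketch, not part of the submitted file) ---
# ArticleVocabularyLevel.__init__ calls nltk.download('stopwords') on every
# construction. A one-off guarded download, as sketched below, avoids repeating
# that check; whether to adopt it is up to the authors.
def _ensure_stopwords_downloaded():
    from nltk.corpus import stopwords
    try:
        stopwords.words('english')   # raises LookupError if the corpus is absent
    except LookupError:
        nltk.download('stopwords')
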
if __name__ == '__main__':
    d = load_record('frequency_mrlan85.pickle')
    print(d)
    print("======================================================")
    user = UserVocabularyLevel(d)
    print(user.level)  # level is a property
    print("======================================================")
    article = ArticleVocabularyLevel('This is an interesting article')
    print(article.level)