请老师看我们的vocabulary.py文件 #195
			
				
			
		
		
		
	| 
						 | 
					@ -0,0 +1,91 @@
 | 
				
			||||||
 | 
					'''
 | 
				
			||||||
 | 
					   Estimate a user's vocabulary level given his vocabulary data
 | 
				
			||||||
 | 
					   Estimate an English article's difficulty level given its content
 | 
				
			||||||
 | 
					   Preliminary design
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					   Hui, 2024-09-23
 | 
				
			||||||
 | 
					   Last updated: 2024-09-25, 2024-09-30
 | 
				
			||||||
 | 
					'''
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import pickle
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					import nltk
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Difficulty score assigned to each word source (word list / exam).
					DIFFICULTY_MAPPING = {
					    'BBC': 2,         # basic vocabulary
					    'CET4': 3,        # CET-4 (College English Test)
					    'CET6': 4,        # CET-6
					    'GRADUATE': 5,    # postgraduate entrance exam vocabulary
					    'IELTS': 6,       # IELTS
					    'OXFORD3000': 4,  # Oxford 3000 core words
					    'OXFORD5000': 7,  # Oxford 5000 words
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def load_record(pickle_fname):
					    """Return the object stored in the pickle file ``pickle_fname``.

					    The file is opened in binary mode and closed again before returning.
					    """
					    # NOTE(review): unpickling can execute arbitrary code, so this must
					    # only be pointed at files the application itself produced.
					    with open(pickle_fname, 'rb') as stream:
					        return pickle.load(stream)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class VocabularyLevelEstimator:
					    """Base class that turns a list of words into a difficulty level.

					    Subclasses must assign ``self.word_lst`` (the words to score) before
					    ``level`` is read; this class does not set it itself.
					    """

					    # Map a word to the sources where it appears. Loaded once, when the
					    # class body executes, from a path relative to the working directory.
					    _test = load_record('words_and_tests.p')

					    @property
					    def level(self):
					        """Return the average difficulty of the known words in ``word_lst``.

					        A word counts as known when it is a key of ``_test``. Its score is
					        the highest ``DIFFICULTY_MAPPING`` value among its sources; sources
					        missing from the mapping score 0. Unknown words are ignored.

					        Returns:
					            float: the mean score, or 0.0 when ``word_lst`` is empty or
					            contains no known word.
					        """
					        if not self.word_lst:  # no words to score
					            return 0.0
					        total = 0.0
					        valid_words = 0
					        for word in self.word_lst:
					            if word in self._test:
					                sources = self._test[word]
					                # default=0 keeps a known word with no recorded sources
					                # from raising ValueError on an empty max().
					                total += max(
					                    (DIFFICULTY_MAPPING.get(src, 0) for src in sources),
					                    default=0,
					                )
					                valid_words += 1
					        return total / valid_words if valid_words > 0 else 0.0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class UserVocabularyLevel(VocabularyLevelEstimator):
					    """Estimate a user's level from the words they recorded most recently."""

					    def __init__(self, d, recent_n=3):
					        """Select the ``recent_n`` most recent words from ``d``.

					        Args:
					            d: mapping from a word to an indexable record whose last
					                element is used as the sort key (presumably the latest
					                timestamp; confirm against the code that writes the
					                pickle).
					            recent_n: number of most recent words to keep.
					        """
					        self.d = d
					        self.recent_n = recent_n
					        # A word with an empty record has no last entry to sort by; skip it
					        # instead of letting d[word][-1] raise IndexError.
					        dated_words = [word for word in d if d[word]]
					        # Sort by the last entry in descending order, keep the first recent_n.
					        dated_words.sort(key=lambda word: d[word][-1], reverse=True)
					        self.word_lst = dated_words[:recent_n]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class ArticleVocabularyLevel(VocabularyLevelEstimator):
					    """Estimate an article's level from its ten most difficult words."""

					    def __init__(self, content):
					        """Tokenize ``content`` and keep its ten highest-scoring words.

					        The text is lower-cased, split into ``\\w+`` tokens, and English
					        stop words are removed. Repeated words are kept, so one word can
					        fill several of the ten slots.

					        Args:
					            content: the article text.
					        """
					        import re
					        from nltk.corpus import stopwords

					        self.content = content
					        try:
					            stop_words = set(stopwords.words('english'))
					        except LookupError:
					            # Corpus not installed yet: download it once and retry, rather
					            # than calling nltk.download() on every instantiation.
					            nltk.download('stopwords')
					            stop_words = set(stopwords.words('english'))
					        # Preprocess: tokenize, lower-case, drop punctuation and stop words.
					        words = re.findall(r'\b\w+\b', content.lower())
					        candidates = [word for word in words if word not in stop_words]
					        # Keep the ten hardest words; sorted() is stable, so ties stay in
					        # article order.
					        self.word_lst = sorted(
					            candidates,
					            key=self._get_difficulty,
					            reverse=True,
					        )[:10]

					    def _get_difficulty(self, word):
					        """Return the highest mapped difficulty among ``word``'s sources.

					        Returns 0 when the word is not in ``_test``, has no sources, or none
					        of its sources appears in ``DIFFICULTY_MAPPING``.
					        """
					        if word in self._test:
					            # default=0 avoids ValueError when the source collection is empty.
					            return max(
					                (DIFFICULTY_MAPPING.get(src, 0) for src in self._test[word]),
					                default=0,
					            )
					        return 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if __name__ == '__main__':
					    # Manual demo: print the raw record, the user's level, then the level
					    # of a short sample article, separated by rule lines.
					    rule = "======================================================"
					    records = load_record('frequency_mrlan85.pickle')
					    print(records)
					    print(rule)
					    learner = UserVocabularyLevel(records)
					    print(learner.level)  # level is a property
					    print(rule)
					    sample = ArticleVocabularyLevel('This is an interesting article')
					    print(sample.level)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue