'''
Estimate a user's vocabulary level given his vocabulary data
Estimate an English article's difficulty level given its content

Preliminary design

Hui, 2024-09-23
Last updated: 2024-09-25, 2024-09-30
'''

import pickle
import re


def load_record(pickle_fname):
    """Return the object stored in the pickle file ``pickle_fname``.

    Raises whatever ``open`` / ``pickle.load`` raise (e.g. ``FileNotFoundError``
    when the file does not exist).
    """
    # NOTE(security): unpickling can execute arbitrary code. Only call this
    # on files the application itself produced, never on user uploads.
    with open(pickle_fname, 'rb') as f:
        return pickle.load(f)


class _LazyRecord:
    """Non-data descriptor: load a pickle file on first access, then cache it.

    Deferring the read keeps ``import`` of this module free of file I/O, so the
    module can be imported even when the data file is not in the current
    directory.  Because only ``__get__`` is defined, assigning to the attribute
    on the class or on an instance simply replaces the lazy loader.
    """

    def __init__(self, pickle_fname):
        self._pickle_fname = pickle_fname
        self._loaded = False
        self._data = None

    def __get__(self, instance, owner=None):
        if not self._loaded:
            # If loading raises, _loaded stays False and the next access retries.
            self._data = load_record(self._pickle_fname)
            self._loaded = True
        return self._data


class VocabularyLevelEstimator:
    """Shared level computation for users and articles.

    Subclasses must set ``self.word_lst`` (a list of words).  An instance that
    also has an attribute ``d`` (a dict keyed by word) is scored with the
    user formula; any other instance is scored with the article formula.
    """

    # Reference word collection read from 'words_and_tests.p' on first access.
    # Only membership (``word in _test``) is used by this class.
    _test = _LazyRecord('words_and_tests.p')

    # Number of top-ranked user words that take part in the user estimate.
    _RECENT_WORD_LIMIT = 3

    @property
    def level(self):
        """Return the estimated level as a number.

        Returns ``0.0`` when there are no words at all, or when none of the
        considered words occurs in ``_test``.  Otherwise:

        * user instances get a value in ``[1, 8]`` rounded to 3 decimals;
        * article instances get an unrounded value in ``[3, 8]``.
        """
        if not self.word_lst:
            return 0.0

        is_user = hasattr(self, 'd')
        word_lst = self._recent_words() if is_user else self.word_lst

        known_words = self._test  # resolve the (lazy) attribute once
        known = [w for w in word_lst if w in known_words]
        if not known:
            return 0.0

        avg_diff = sum(self._compute_word_difficulty(w) for w in known) / len(known)

        if is_user:
            return self._user_level(avg_diff, len(word_lst))
        return self._article_level(avg_diff, len(word_lst), len(set(known)))

    @staticmethod
    def _recency_key(item):
        """Sort key for a ``(word, records)`` pair: largest record wins.

        Pairs whose ``records`` is empty sort below every non-empty pair
        instead of making ``max()`` raise ``ValueError``.
        """
        records = item[1]
        return (bool(records), max(records) if records else None)

    def _recent_words(self):
        """Return up to three words of ``self.d`` ranked by their largest record.

        presumably the records are timestamps, making these the most recently
        added words -- TODO confirm against the code that writes ``d``.
        """
        ranked = sorted(self.d.items(), key=self._recency_key, reverse=True)
        return [word for word, _ in ranked[:self._RECENT_WORD_LIMIT]]

    def _article_level(self, avg_diff, word_count, unique_count):
        """Article difficulty from average word difficulty and text size."""
        base_level = avg_diff / ((word_count ** 0.5) * (unique_count ** 0.25))
        if word_count == 1:
            level = min(base_level, 4)
        else:
            # Tiny positive offset so the value is strictly greater than the
            # one a single-word article with the same base level would get.
            level = base_level + 1e-5

        # Clamp into a band chosen by article length.
        if word_count < 15:
            level = max(3, min(level, 6))
        elif word_count < 50:
            level = max(4, min(level, 6))
        else:
            level = max(6, min(level, 8))
        return level  # deliberately not rounded; full precision is kept

    def _user_level(self, avg_diff, word_count):
        """User level from average word difficulty of the considered words."""
        length_factor = word_count ** 0.35
        factor = 3.8
        level = (avg_diff / length_factor) * factor

        # NOTE(review): hard-coded special cases for two specific one-word
        # vocabularies -- confirm whether these are still wanted.
        if len(self.d) == 1 and 'simple' in self.d:
            level = min(level, 4)
        if len(self.d) == 1 and 'pasture' in self.d:
            level = max(level, 5)

        # The former "more than 3 words => level *= 0.8" step was removed:
        # word_count never exceeds _RECENT_WORD_LIMIT (3), so it could not run.
        return round(max(1, min(level, 8)), 3)

    def _compute_word_difficulty(self, word):
        """Return a difficulty score in 2..6 based only on the word's length."""
        length = len(word)
        base = 2
        if length > 10:
            base += 4
        elif length > 8:
            base += 3
        elif length > 6:
            base += 2
        elif length > 4:
            base += 1
        return base


class UserVocabularyLevel(VocabularyLevelEstimator):
    """Vocabulary level of a user, given a dict ``d`` keyed by word."""

    def __init__(self, d):
        self.d = d
        # All words are kept here; ``level`` itself narrows them down to the
        # few top-ranked ones.
        self.word_lst = list(d.keys())


class ArticleVocabularyLevel(VocabularyLevelEstimator):
    """Difficulty level of an article, given its text ``content``."""

    def __init__(self, content):
        self.content = content
        # Strip punctuation, then digits; what remains is lower-cased and
        # split on whitespace.  Every resulting word is kept.
        clean_content = re.sub(r'[^\w\s]', '', content)
        clean_content = re.sub(r'\d+', '', clean_content)
        self.word_lst = clean_content.lower().split()


if __name__ == '__main__':
    d = load_record('frequency_mrlan85.pickle')
    print(d)
    user = UserVocabularyLevel(d)
    print(user.level)  # level is a property
    article = ArticleVocabularyLevel('This is an interesting article')
    print(article.level)