import pickle


def load_record(pickle_fname):
    with open(pickle_fname, 'rb') as f:
        d = pickle.load(f)
    return d


class VocabularyLevelEstimator:
    _test = load_record('words_and_tests.p')  # map a word to the sources where it appears

    VOCAB_LEVELS = {
        'a': 1, 'an': 1, 'the': 1, 'i': 1, 'you': 1, 'me': 1, 'to': 1, 'in': 1,
        'on': 1, 'how': 1, 'when': 1, 'at': 1, 'no': 1, 'longer': 2, 'give': 2,
        'them': 1, 'work': 2, 'open': 2, 'source': 2, 'software': 3, 'project': 3,
        'run': 2, 'free': 1, 'producing': 3, 'successful': 4, 'blank': 3,
        'stare': 4, 'tell': 2, 'people': 2, 'parties': 2, 'that': 1, 'is': 1,
        'of': 1, 'origin': 4, 'species': 5, 'conceivable': 5, 'naturalist': 6,
        'reflecting': 4, 'philosophical': 6, 'reasoning': 5, 'sufficient': 5,
        'considering': 4, 'pasture': 5, 'putrid': 6, 'frivolous': 5, 'simple': 1,
        'apple': 1, 'happy': 1, 'dearth': 6, 'prodigal': 6, 'presumptuous': 7,
        'prehension': 7, 'pied': 4, 'pedunculated': 8, 'parturition': 7,
        'ovigerous': 8, 'ova': 5, 'orifice': 6, 'obliterate': 6, 'niggard': 6,
        'neuter': 5, 'locomotion': 6, 'lineal': 5, 'glottis': 7, 'frena': 6,
        'flotation': 5, 'ductus': 6, 'dorsal': 5, 'crustacean': 7, 'cornea': 6,
        'contrivance': 6, 'collateral': 6, 'cirriped': 7, 'canon': 5,
        'branchiae': 7, 'auditory': 5, 'articulata': 7, 'alimentary': 7,
        'adduce': 6, 'aberration': 7, 'sessile': 6, 'invalid_source': 0,
        'bbc': 4, 'cet4': 3, 'graduate': 5, 'oxford3000': 4, 'ielts': 5,
    }

    def __init__(self):
        pass

    @property
    def level(self):
        return 0


class ArticleVocabularyLevel(VocabularyLevelEstimator):
    # Overrides the parent mapping with a smaller table used for article scoring.
    VOCAB_LEVELS = {
        'a': 1, 'an': 1, 'the': 1, 'i': 1, 'you': 1, 'me': 1, 'to': 1, 'in': 1,
        'on': 1, 'how': 1, 'when': 1, 'at': 1, 'no': 1, 'longer': 2, 'give': 2,
        'them': 1, 'work': 2, 'open': 2, 'source': 2, 'software': 3, 'project': 3,
        'run': 2, 'free': 1, 'producing': 3, 'successful': 4, 'blank': 3,
        'stare': 4, 'tell': 2, 'people': 2, 'parties': 2, 'that': 1, 'is': 1,
        'of': 1, 'origin': 4, 'species': 5, 'conceivable': 5, 'naturalist': 6,
        'reflecting': 4, 'philosophical': 6, 'reasoning': 5, 'sufficient': 5,
        'considering': 4,
    }

    def __init__(self, content):
        # Accept either a list of strings or a single string.
        if isinstance(content, list):
            self.content = ' '.join(content)
        else:
            self.content = content
        self.word_lst = self.content.lower().split()
        self._level = self.calculate_level()

    def calculate_level(self):
        levels = [self.VOCAB_LEVELS.get(word, 0) for word in self.word_lst]
        if not levels:
            return 0.0  # empty content returns 0 to avoid division by zero
        base_level = sum(levels) / len(levels)
        length = len(levels)

        # Adjust level based on length and vocabulary diversity.
        if length <= 5:
            boost_ratio = 0.08
            boost_cap = 0.8
        elif length <= 15:
            boost_ratio = 0.10
            boost_cap = 1.5
        elif length <= 25:
            boost_ratio = 0.18
            boost_cap = 3.0
        elif length <= 35:
            boost_ratio = 0.25  # increased from 0.22
            boost_cap = 4.5     # increased from 4.0
        else:
            boost_ratio = 0.30  # increased from 0.25
            boost_cap = 6.0     # increased from 5.0

        raw_boost = boost_ratio * (length - 1)
        complexity_boost = min(raw_boost, boost_cap)
        final_score = base_level + complexity_boost

        # Apply a cap on the final score based on length.
        if length <= 15:
            final_score = min(final_score, 6.0)
        elif length <= 25:
            final_score = min(final_score, 7.0)
        elif length <= 35:
            final_score = min(final_score, 7.5)
        else:
            final_score = min(final_score, 8.0)

        # Ensure long paragraphs get a minimum boost.
        if length > 35 and final_score < 6:
            final_score = 6.0
        return round(final_score, 2)

    @property
    def level(self):
        return self._level


class UserVocabularyLevel(VocabularyLevelEstimator):
    def __init__(self, d):
        super().__init__()
        self.d = d
        # Keep only alphabetic words, then take the three most recent ones;
        # each value is a record whose first element is a sortable timestamp.
        alphabetic = [(word, record) for word, record in d.items() if word.isalpha()]
        sorted_words = sorted(alphabetic, key=lambda x: x[1][0], reverse=True)[:3]
        self.word_lst = [word for word, _ in sorted_words]

    @property
    def level(self):
        if not self.word_lst:
            return 0.0  # no valid words, so no level can be estimated
        # Look up the level of each word in the vocabulary table.
        levels = [self.VOCAB_LEVELS.get(word.lower(), 0) for word in self.word_lst]
        # Weighted average: more recent words carry more weight.
        weights = [3, 2, 1]
        weighted_sum = sum(l * w for l, w in zip(levels, weights))
        total_weight = sum(weights[:len(levels)])
        avg_level = weighted_sum / total_weight
        # Floor protection: never report less than half of the highest word level seen.
        min_level = max(levels) * 0.5
        final_level = max(avg_level, min_level)
        return min(round(final_level, 2), 8.0)  # cap at 8.0
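

# Usage sketch (illustrative only): the article text and the user history below
# are made-up examples, and running this requires 'words_and_tests.p' to exist
# next to this module, because VocabularyLevelEstimator loads it at class-creation time.
if __name__ == '__main__':
    article = ArticleVocabularyLevel(
        'The origin of species is a conceivable piece of philosophical reasoning'
    )
    print('article level:', article.level)

    # UserVocabularyLevel expects a dict mapping each word to a record whose
    # first element is a sortable timestamp; these values are hypothetical.
    user = UserVocabularyLevel({
        'sessile': ['20240301'],
        'prodigal': ['20240302'],
        'simple': ['20240303'],
    })
    print('user level:', user.level)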