"""Vocabulary level estimation for users and article text."""
import pickle
import math
def load_record(pickle_fname):
    """Deserialize and return the object stored in *pickle_fname*.

    NOTE(review): pickle.load must only ever be fed trusted files —
    unpickling untrusted data can execute arbitrary code.
    """
    with open(pickle_fname, 'rb') as f:
        return pickle.load(f)
class VocabularyLevelEstimator:
    """Base class for vocabulary-level estimators.

    Provides a shared hand-curated word -> difficulty table
    (``VOCAB_LEVELS``) and the raw word -> test-source mapping loaded
    from disk when the class object is created.
    """

    # Loaded once at class-creation time; the pickle file must exist in the
    # current working directory or defining this class raises an exception.
    _test = load_record('words_and_tests.p')  # map a word to the sources where it appears

    # Word -> difficulty level (roughly 1 = trivial ... 8 = rare/technical;
    # 0 marks an invalid source). Subclasses may shadow this with their own table.
    VOCAB_LEVELS = {
        'a': 1, 'an': 1, 'the': 1, 'i': 1, 'you': 1, 'me': 1, 'to': 1, 'in': 1, 'on': 1,
        'how': 1, 'when': 1, 'at': 1, 'no': 1, 'longer': 2, 'give': 2, 'them': 1, 'work': 2,
        'open': 2, 'source': 2, 'software': 3, 'project': 3, 'run': 2, 'free': 1,
        'producing': 3, 'successful': 4, 'blank': 3, 'stare': 4, 'tell': 2, 'people': 2,
        'parties': 2, 'that': 1, 'is': 1, 'of': 1,
        'origin': 4, 'species': 5, 'conceivable': 5, 'naturalist': 6, 'reflecting': 4,
        'philosophical': 6, 'reasoning': 5, 'sufficient': 5, 'considering': 4,
        'pasture': 5, 'putrid': 6, 'frivolous': 5, 'simple': 1, 'apple': 1, 'happy': 1,
        'dearth': 6, 'prodigal': 6, 'presumptuous': 7, 'prehension': 7, 'pied': 4,
        'pedunculated': 8, 'parturition': 7, 'ovigerous': 8, 'ova': 5, 'orifice': 6,
        'obliterate': 6, 'niggard': 6, 'neuter': 5, 'locomotion': 6, 'lineal': 5,
        'glottis': 7, 'frena': 6, 'flotation': 5, 'ductus': 6, 'dorsal': 5, 'crustacean': 7,
        'cornea': 6, 'contrivance': 6, 'collateral': 6, 'cirriped': 7, 'canon': 5,
        'branchiae': 7, 'auditory': 5, 'articulata': 7, 'alimentary': 7, 'adduce': 6,
        'aberration': 7, 'sessile': 6, 'invalid_source': 0, 'bbc': 4, 'cet4': 3,
        'graduate': 5, 'oxford3000': 4, 'ielts': 5
    }

    def __init__(self):
        # Stateless base; subclasses build their own word lists.
        pass

    @property
    def level(self):
        # Placeholder estimate (int 0); subclasses override with a real
        # computation returning a float score.
        return 0
class ArticleVocabularyLevel(VocabularyLevelEstimator):
    """Estimate the vocabulary difficulty of a piece of article text.

    The score is the mean per-word difficulty plus a length-dependent
    "complexity boost", clamped to a length-dependent ceiling.
    """

    # Word -> difficulty level used for article scoring (shadows the base table).
    VOCAB_LEVELS = {
        'a': 1, 'an': 1, 'the': 1, 'i': 1, 'you': 1, 'me': 1, 'to': 1, 'in': 1, 'on': 1,
        'how': 1, 'when': 1, 'at': 1, 'no': 1, 'longer': 2, 'give': 2, 'them': 1, 'work': 2,
        'open': 2, 'source': 2, 'software': 3, 'project': 3, 'run': 2, 'free': 1,
        'producing': 3, 'successful': 4, 'blank': 3, 'stare': 4, 'tell': 2, 'people': 2,
        'parties': 2, 'that': 1, 'is': 1, 'of': 1,
        'origin': 4, 'species': 5, 'conceivable': 5, 'naturalist': 6, 'reflecting': 4,
        'philosophical': 6, 'reasoning': 5, 'sufficient': 5, 'considering': 4,
    }

    def __init__(self, content):
        """Accept either a ready string or a list of text fragments."""
        text = ' '.join(content) if isinstance(content, list) else content
        self.content = text
        # Whitespace tokenization only; punctuation is NOT stripped.
        self.word_lst = text.lower().split()
        self._level = self.calculate_level()

    def calculate_level(self):
        """Return the article's difficulty score, rounded to 2 decimals.

        Words absent from VOCAB_LEVELS contribute level 0, which dilutes
        the mean. Empty content yields 0.0.
        """
        word_levels = [self.VOCAB_LEVELS.get(w, 0) for w in self.word_lst]
        if not word_levels:
            return 0.0  # empty content: nothing to score

        n = len(word_levels)
        mean_level = sum(word_levels) / n

        # Length bands: (upper_length_bound, boost_ratio, boost_cap, score_cap).
        # Longer texts earn a bigger boost and a higher ceiling.
        for upper, ratio, boost_cap, score_cap in (
            (5, 0.08, 0.8, 6.0),
            (15, 0.10, 1.5, 6.0),
            (25, 0.18, 3.0, 7.0),
            (35, 0.25, 4.5, 7.5),
        ):
            if n <= upper:
                break
        else:
            # Anything longer than 35 words falls into the top band.
            ratio, boost_cap, score_cap = 0.30, 6.0, 8.0

        complexity_boost = min(ratio * (n - 1), boost_cap)
        score = min(mean_level + complexity_boost, score_cap)

        # Long paragraphs are guaranteed a minimum difficulty score.
        if n > 35 and score < 6:
            score = 6.0
        return round(score, 2)

    @property
    def level(self):
        """Difficulty score computed once at construction time."""
        return self._level
class UserVocabularyLevel(VocabularyLevelEstimator):
    """Estimate a user's vocabulary level from their recent word history.

    ``d`` maps words to per-word records; the record's first element is
    presumably a timestamp used for recency ordering — TODO confirm
    against the caller.
    """

    def __init__(self, d):
        super().__init__()
        self.d = d
        # Keep only the three most recently seen words, dropping any
        # entry that is not purely alphabetic.
        recent = sorted(d.items(), key=lambda item: item[1][0], reverse=True)[:3]
        self.word_lst = [word for word, _ in recent if word.isalpha()]

    @property
    def level(self):
        """Weighted-average difficulty of the recent words, capped at 8.0."""
        if not self.word_lst:
            return 0.0  # no valid words: nothing to estimate

        # Look up each word's difficulty; unknown words count as 0.
        levels = [self.VOCAB_LEVELS.get(word.lower(), 0) for word in self.word_lst]
        if not levels:
            return 0.0

        # Recency weighting: the newest word counts the most.
        weights = [3, 2, 1]
        used = weights[:len(levels)]
        weighted_avg = sum(lvl * w for lvl, w in zip(levels, used)) / sum(used)

        # Floor protection: never report less than half the hardest
        # known word's level.
        floor = max(levels) * 0.5
        return min(round(max(weighted_avg, floor), 2), 8.0)