EnglishPal/app/vocabulary.py

145 lines
5.7 KiB
Python
Raw Permalink Normal View History

2025-05-29 23:32:47 +08:00
import pickle
import math
def load_record(pickle_fname):
    """Return the object unpickled from the file at ``pickle_fname``.

    The file is opened in binary mode and is closed again before the
    function returns.
    """
    with open(pickle_fname, 'rb') as handle:
        return pickle.load(handle)
class VocabularyLevelEstimator:
    """Base class for vocabulary-level estimators.

    Holds the shared word -> difficulty table (``VOCAB_LEVELS``) and exposes a
    ``level`` property, which is always 0 here; subclasses override it.
    """

    # Loaded once, when the class body is executed. Per the original note,
    # this maps a word to the sources where it appears.
    _test = load_record('words_and_tests.p')

    # Difficulty level for each known word; the values present range 0-8.
    VOCAB_LEVELS = {
        'a': 1, 'an': 1, 'the': 1, 'i': 1, 'you': 1, 'me': 1, 'to': 1, 'in': 1, 'on': 1,
        'how': 1, 'when': 1, 'at': 1, 'no': 1, 'longer': 2, 'give': 2, 'them': 1, 'work': 2,
        'open': 2, 'source': 2, 'software': 3, 'project': 3, 'run': 2, 'free': 1,
        'producing': 3, 'successful': 4, 'blank': 3, 'stare': 4, 'tell': 2, 'people': 2,
        'parties': 2, 'that': 1, 'is': 1, 'of': 1,
        'origin': 4, 'species': 5, 'conceivable': 5, 'naturalist': 6, 'reflecting': 4,
        'philosophical': 6, 'reasoning': 5, 'sufficient': 5, 'considering': 4,
        'pasture': 5, 'putrid': 6, 'frivolous': 5, 'simple': 1, 'apple': 1, 'happy': 1,
        'dearth': 6, 'prodigal': 6, 'presumptuous': 7, 'prehension': 7, 'pied': 4,
        'pedunculated': 8, 'parturition': 7, 'ovigerous': 8, 'ova': 5, 'orifice': 6,
        'obliterate': 6, 'niggard': 6, 'neuter': 5, 'locomotion': 6, 'lineal': 5,
        'glottis': 7, 'frena': 6, 'flotation': 5, 'ductus': 6, 'dorsal': 5, 'crustacean': 7,
        'cornea': 6, 'contrivance': 6, 'collateral': 6, 'cirriped': 7, 'canon': 5,
        'branchiae': 7, 'auditory': 5, 'articulata': 7, 'alimentary': 7, 'adduce': 6,
        'aberration': 7, 'sessile': 6, 'invalid_source': 0, 'bbc': 4, 'cet4': 3,
        'graduate': 5, 'oxford3000': 4, 'ielts': 5,
    }

    def __init__(self):
        """Create an estimator; the base class keeps no per-instance state."""

    @property
    def level(self):
        """Estimated vocabulary level; the base implementation returns 0."""
        return 0
class ArticleVocabularyLevel(VocabularyLevelEstimator):
    """Estimate the vocabulary level of a piece of text.

    The score is the mean per-word level from ``VOCAB_LEVELS`` (unknown words
    count as 0) plus a boost that grows with the number of words, capped by a
    ceiling that depends on the text length.

    Words are obtained by lower-casing the text and splitting on whitespace.
    Punctuation is not stripped, so a token such as ``"software."`` is looked
    up as-is and does not match ``"software"``.
    """

    # This table replaces the one inherited from the base class, so only the
    # words listed here score above 0 for articles.
    VOCAB_LEVELS = {
        'a': 1, 'an': 1, 'the': 1, 'i': 1, 'you': 1, 'me': 1, 'to': 1, 'in': 1, 'on': 1,
        'how': 1, 'when': 1, 'at': 1, 'no': 1, 'longer': 2, 'give': 2, 'them': 1, 'work': 2,
        'open': 2, 'source': 2, 'software': 3, 'project': 3, 'run': 2, 'free': 1,
        'producing': 3, 'successful': 4, 'blank': 3, 'stare': 4, 'tell': 2, 'people': 2,
        'parties': 2, 'that': 1, 'is': 1, 'of': 1,
        'origin': 4, 'species': 5, 'conceivable': 5, 'naturalist': 6, 'reflecting': 4,
        'philosophical': 6, 'reasoning': 5, 'sufficient': 5, 'considering': 4,
    }

    # (max word count, boost per extra word, boost ceiling, score ceiling),
    # checked in order; the first row whose max word count fits is used.
    _LENGTH_TIERS = (
        (5, 0.08, 0.8, 6.0),
        (15, 0.10, 1.5, 6.0),
        (25, 0.18, 3.0, 7.0),
        (35, 0.25, 4.5, 7.5),
    )
    # (boost per extra word, boost ceiling, score ceiling) for longer texts.
    _LONG_TEXT_TIER = (0.30, 6.0, 8.0)
    # Lowest score a text longer than the last tier may receive.
    _LONG_TEXT_FLOOR = 6.0

    def __init__(self, content):
        """Tokenise ``content`` and compute its level once.

        Args:
            content: The text as a string, or a list/tuple of strings that is
                joined with single spaces. ``None`` is treated as empty text.
        """
        super().__init__()
        if content is None:
            # Previously raised AttributeError on ``None.lower()``.
            content = ''
        elif isinstance(content, (list, tuple)):
            content = ' '.join(content)
        self.content = content
        self.word_lst = self.content.lower().split()
        self._level = self.calculate_level()

    def calculate_level(self):
        """Return the level of ``self.word_lst`` rounded to two decimals.

        Returns:
            0.0 when there are no words; otherwise the mean word level plus the
            length-based boost, limited by the ceiling of the matching tier.
        """
        length = len(self.word_lst)
        if length == 0:
            return 0.0  # Empty content: avoid dividing by zero.

        levels = [self.VOCAB_LEVELS.get(word, 0) for word in self.word_lst]
        base_level = sum(levels) / length

        # Pick the first tier that fits; fall back to the long-text tier.
        boost_ratio, boost_cap, score_cap = self._LONG_TEXT_TIER
        for max_length, *tier in self._LENGTH_TIERS:
            if length <= max_length:
                boost_ratio, boost_cap, score_cap = tier
                break

        complexity_boost = min(boost_ratio * (length - 1), boost_cap)
        final_score = min(base_level + complexity_boost, score_cap)

        # Long texts never score below the floor. With the tiers above and
        # non-negative word levels the boost alone already reaches it, so this
        # only matters if the tables are changed.
        longest_tier = self._LENGTH_TIERS[-1][0]
        if length > longest_tier and final_score < self._LONG_TEXT_FLOOR:
            final_score = self._LONG_TEXT_FLOOR

        return round(final_score, 2)

    @property
    def level(self):
        """The level computed when the instance was created."""
        return self._level
class UserVocabularyLevel(VocabularyLevelEstimator):
    """Estimate a user's vocabulary level from their three newest words.

    Words are ranked by the first element of their record, newest first, and
    the top three are kept. Any of those three that is not purely alphabetic
    is then dropped, so fewer than three words may remain.
    """

    # Weights for the newest, second-newest and third-newest word.
    _RECENCY_WEIGHTS = (3, 2, 1)

    def __init__(self, d):
        """Select the words used for the estimate.

        Args:
            d: Mapping of word -> sequence whose first element is the sort key
                (presumably a timestamp; verify against the caller). ``None``
                or an empty mapping yields no words.
        """
        super().__init__()
        self.d = d
        records = d.items() if d else ()
        # A word with an empty record cannot be ranked by recency; skip it
        # instead of raising IndexError in the sort key.
        dated = [(word, stamps) for word, stamps in records if stamps]
        # Keep the three newest words (sorted by timestamp, newest first).
        newest = sorted(dated, key=lambda item: item[1][0], reverse=True)[:3]
        self.word_lst = [word for word, _ in newest if word.isalpha()]

    @property
    def level(self):
        """Weighted level of the selected words, rounded to two decimals.

        Returns:
            0.0 when no valid word was selected; otherwise the larger of the
            recency-weighted mean level and half the highest word level,
            capped at 8.0. Words missing from ``VOCAB_LEVELS`` count as 0.
        """
        if not self.word_lst:
            return 0.0  # No valid words.

        # Look up each word's level in the vocabulary table.
        levels = [self.VOCAB_LEVELS.get(word.lower(), 0) for word in self.word_lst]

        # Weighted mean in which more recent words weigh more.
        weights = self._RECENCY_WEIGHTS[:len(levels)]
        weighted_sum = sum(lvl * weight for lvl, weight in zip(levels, weights))
        avg_level = weighted_sum / sum(weights)

        # Floor: never report less than half of the hardest word's level.
        min_level = max(levels) * 0.5
        final_level = max(avg_level, min_level)
        return min(round(final_level, 2), 8.0)  # Capped at 8.0.