EnglishPal/app/vocabulary.py

145 lines
5.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import pickle
import math
def load_record(pickle_fname):
    """Load and return the object stored in a pickle file.

    Args:
        pickle_fname: Path of the pickle file to read. The file is opened in
            binary mode and closed again before this function returns.

    Returns:
        Whatever object was pickled into the file.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the file content is not a valid pickle.
    """
    # SECURITY NOTE: unpickling can execute arbitrary code embedded in the
    # file, so only call this on files the application itself produced.
    with open(pickle_fname, 'rb') as f:
        return pickle.load(f)
class VocabularyLevelEstimator:
    """Base class for vocabulary-level estimators.

    Holds the lookup data shared by the concrete estimators and exposes a
    read-only ``level`` property, which subclasses override. The base
    implementation always reports level 0.
    """

    # NOTE: this runs when the class body is executed (i.e. when the module is
    # imported) and the path is resolved against the current working
    # directory, so importing the module fails if the file is not there.
    _test = load_record('words_and_tests.p')  # map a word to the sources where it appears

    # Lower-case token -> integer level. In this table the values range from
    # 0 ('invalid_source') and 1 (e.g. 'a', 'the') up to 8 (e.g.
    # 'pedunculated'). Tokens missing from the table are treated as level 0 by
    # the estimators that look words up with ``.get(word, 0)``.
    VOCAB_LEVELS = {
        'a': 1, 'an': 1, 'the': 1, 'i': 1, 'you': 1, 'me': 1, 'to': 1, 'in': 1, 'on': 1,
        'how': 1, 'when': 1, 'at': 1, 'no': 1, 'longer': 2, 'give': 2, 'them': 1, 'work': 2,
        'open': 2, 'source': 2, 'software': 3, 'project': 3, 'run': 2, 'free': 1,
        'producing': 3, 'successful': 4, 'blank': 3, 'stare': 4, 'tell': 2, 'people': 2,
        'parties': 2, 'that': 1, 'is': 1, 'of': 1,
        'origin': 4, 'species': 5, 'conceivable': 5, 'naturalist': 6, 'reflecting': 4,
        'philosophical': 6, 'reasoning': 5, 'sufficient': 5, 'considering': 4,
        'pasture': 5, 'putrid': 6, 'frivolous': 5, 'simple': 1, 'apple': 1, 'happy': 1,
        'dearth': 6, 'prodigal': 6, 'presumptuous': 7, 'prehension': 7, 'pied': 4,
        'pedunculated': 8, 'parturition': 7, 'ovigerous': 8, 'ova': 5, 'orifice': 6,
        'obliterate': 6, 'niggard': 6, 'neuter': 5, 'locomotion': 6, 'lineal': 5,
        'glottis': 7, 'frena': 6, 'flotation': 5, 'ductus': 6, 'dorsal': 5, 'crustacean': 7,
        'cornea': 6, 'contrivance': 6, 'collateral': 6, 'cirriped': 7, 'canon': 5,
        'branchiae': 7, 'auditory': 5, 'articulata': 7, 'alimentary': 7, 'adduce': 6,
        'aberration': 7, 'sessile': 6, 'invalid_source': 0, 'bbc': 4, 'cet4': 3,
        'graduate': 5, 'oxford3000': 4, 'ielts': 5
    }

    def __init__(self):
        """Create an estimator; the base class keeps no per-instance state."""

    @property
    def level(self):
        """Return the estimated level; the base class always returns 0."""
        return 0
class ArticleVocabularyLevel(VocabularyLevelEstimator):
    """Estimate the vocabulary level of a piece of text.

    The text is lower-cased and split on whitespace. Each token is looked up
    in ``VOCAB_LEVELS`` (unknown tokens count as 0), the levels are averaged,
    and a boost that grows with the number of tokens is added. The result is
    computed once in the constructor and exposed through ``level``.

    Note that tokens are not stripped of punctuation, so ``"source."`` is
    looked up verbatim and will not match the ``'source'`` entry.
    """

    # This table replaces the one on the base class for article scoring, so a
    # word that appears only in the base table (e.g. 'pasture') scores 0 here.
    VOCAB_LEVELS = {
        'a': 1, 'an': 1, 'the': 1, 'i': 1, 'you': 1, 'me': 1, 'to': 1, 'in': 1, 'on': 1,
        'how': 1, 'when': 1, 'at': 1, 'no': 1, 'longer': 2, 'give': 2, 'them': 1, 'work': 2,
        'open': 2, 'source': 2, 'software': 3, 'project': 3, 'run': 2, 'free': 1,
        'producing': 3, 'successful': 4, 'blank': 3, 'stare': 4, 'tell': 2, 'people': 2,
        'parties': 2, 'that': 1, 'is': 1, 'of': 1,
        'origin': 4, 'species': 5, 'conceivable': 5, 'naturalist': 6, 'reflecting': 4,
        'philosophical': 6, 'reasoning': 5, 'sufficient': 5, 'considering': 4,
    }

    # Scoring parameters per text length, checked in order:
    # (max token count, boost per token beyond the first, boost cap, score cap)
    _LENGTH_BANDS = (
        (5, 0.08, 0.8, 6.0),
        (15, 0.10, 1.5, 6.0),
        (25, 0.18, 3.0, 7.0),
        (35, 0.25, 4.5, 7.5),
    )
    # Parameters for texts longer than the last band:
    # (boost per token beyond the first, boost cap, score cap)
    _LONG_TEXT_BAND = (0.30, 6.0, 8.0)
    # Texts longer than the last band never score below this value.
    _LONG_TEXT_MIN_SCORE = 6.0

    def __init__(self, content):
        """Tokenise ``content`` and compute its level.

        Args:
            content: Either a string, or a list of strings which is joined
                with single spaces before tokenising.
        """
        super().__init__()
        if isinstance(content, list):
            self.content = ' '.join(content)
        else:
            self.content = content
        self.word_lst = self.content.lower().split()
        self._level = self.calculate_level()

    def calculate_level(self):
        """Return the level of ``self.word_lst``, rounded to two decimals.

        Returns:
            0.0 when there are no tokens. Otherwise the mean token level plus
            a length-dependent boost, clamped to the score cap of the
            matching length band.
        """
        levels = [self.VOCAB_LEVELS.get(word, 0) for word in self.word_lst]
        if not levels:
            return 0.0  # Empty content: avoid dividing by zero below.

        length = len(levels)
        base_level = sum(levels) / length

        long_text = length > self._LENGTH_BANDS[-1][0]
        if long_text:
            boost_ratio, boost_cap, score_cap = self._LONG_TEXT_BAND
        else:
            # length is at most the last band's limit, so a band always matches.
            boost_ratio, boost_cap, score_cap = next(
                band[1:] for band in self._LENGTH_BANDS if length <= band[0]
            )

        # The boost depends only on the token count, not on which words occur.
        complexity_boost = min(boost_ratio * (length - 1), boost_cap)
        final_score = min(base_level + complexity_boost, score_cap)

        # Safety net for long texts. With the non-negative levels in the table
        # the boost alone already reaches the cap of 6.0 for any long text
        # (0.30 * 35 > 6.0), so this only matters if negative levels are added.
        if long_text and final_score < self._LONG_TEXT_MIN_SCORE:
            final_score = self._LONG_TEXT_MIN_SCORE

        return round(final_score, 2)

    @property
    def level(self):
        """Return the level computed when the instance was created."""
        return self._level
class UserVocabularyLevel(VocabularyLevelEstimator):
    """Estimate a user's vocabulary level from their most recent words.

    Only the three most recent entries of the user's word record are
    considered; more recent words weigh more in the average.
    """

    # Weights for the most recent, second and third most recent word.
    _RECENCY_WEIGHTS = (3, 2, 1)

    def __init__(self, d):
        """Select the words used for the estimate.

        Args:
            d: Mapping of word -> indexable value whose first element is used
                as the sort key. Presumably that element is a timestamp (the
                original comment says the words are sorted by timestamp);
                verify against the caller. An empty value raises IndexError.
        """
        super().__init__()
        self.d = d
        # Take the three most recent words (largest sort key first).
        sorted_words = sorted(d.items(), key=lambda x: x[1][0], reverse=True)[:3]
        # Non-alphabetic entries are dropped after the cut to three, so fewer
        # than three words may remain.
        self.word_lst = [word for word, _ in sorted_words if word.isalpha()]

    @property
    def level(self):
        """Return the weighted level of the selected words, at most 8.0.

        Returns:
            0.0 when no valid word was selected. Otherwise the weighted mean
            of the word levels (words missing from ``VOCAB_LEVELS`` count as
            0), raised to at least half of the highest level found, rounded
            to two decimals and capped at 8.0.
        """
        if not self.word_lst:
            return 0.0  # No valid words to score.

        # Look up the level of each word in the vocabulary table.
        levels = [self.VOCAB_LEVELS.get(word.lower(), 0) for word in self.word_lst]

        # Weighted average: weights are assigned by position among the
        # remaining words, so the first one always gets the largest weight.
        weights = self._RECENCY_WEIGHTS[:len(levels)]
        weighted_sum = sum(l * w for l, w in zip(levels, weights))
        avg_level = weighted_sum / sum(weights)

        # Floor: never report less than half of the hardest word's level, so
        # a few easy words cannot pull the estimate down too far.
        min_level = max(levels) * 0.5
        final_level = max(avg_level, min_level)

        return min(round(final_level, 2), 8.0)  # Never exceed 8.0.