119 lines
3.3 KiB
Python
119 lines
3.3 KiB
Python
'''
|
|
Estimate a user's vocabulary level given his vocabulary data
|
|
Estimate an English article's difficulty level given its content
|
|
Preliminary design
|
|
|
|
Hui, 2024-09-23
|
|
Last updated: 2024-09-25, 2024-09-30
|
|
'''
|
|
|
|
import pickle
|
|
import re
|
|
|
|
|
|
def load_record(pickle_fname):
    """Read one pickled object from *pickle_fname* and return it.

    The file is opened in binary mode and closed again before returning.

    NOTE: unpickling can execute arbitrary code, so this must only be
    pointed at files that come from a trusted source.
    """
    with open(pickle_fname, 'rb') as source:
        return pickle.load(source)
|
|
|
|
|
|
class VocabularyLevelEstimator:
    """Shared level computation for user vocabularies and article texts.

    Subclasses are expected to set ``self.word_lst`` (a list of words).
    An instance that additionally has a ``self.d`` attribute is scored as
    a user vocabulary; an instance without it is scored as an article.
    """

    # Collection of known words, loaded once when the class body executes.
    # Only membership (``word in _test``) is used by this class.
    # NOTE(review): the path is relative to the current working directory,
    # so the class can only be created where that file is reachable.
    _test = load_record('words_and_tests.p')

    @property
    def level(self):
        """Return the estimated level for this instance.

        Returns 0.0 when ``word_lst`` is empty or when none of the words
        under consideration occurs in ``_test``.  Otherwise the average
        per-word difficulty of the known words is scaled and clamped by
        either the user formula or the article formula.
        """
        if not self.word_lst:
            return 0.0

        is_user = hasattr(self, 'd')
        words = self._most_recent_words() if is_user else self.word_lst

        # Words missing from the known-word collection do not contribute to
        # the average, but they still count towards ``len(words)`` below.
        known = [w for w in words if w in self._test]
        if not known:
            return 0.0

        total = sum((self._compute_word_difficulty(w) for w in known), 0.0)
        avg_diff = total / len(known)

        if is_user:
            return self._user_level(avg_diff, len(words))
        return self._article_level(avg_diff, len(words), len(set(known)))

    def _most_recent_words(self, limit=3):
        """Return up to *limit* keys of ``self.d``, largest ``max(value)`` first.

        Presumably each value is a collection of timestamps, so this picks
        the most recently added words -- TODO confirm against the data.
        """
        def rank(item):
            values = item[1]
            # max() raises ValueError on an empty collection; rank such
            # entries below every non-empty one instead of crashing.
            return (1, max(values)) if values else (0,)

        ranked = sorted(self.d.items(), key=rank, reverse=True)
        return [word for word, _ in ranked[:limit]]

    @staticmethod
    def _article_level(avg_diff, word_count, unique_count):
        """Scale *avg_diff* by text size and clamp it to a size-dependent band."""
        level = avg_diff / ((word_count ** 0.5) * (unique_count ** 0.25))
        if word_count == 1:
            level = min(level, 4)
        else:
            # Tiny positive offset so that a multi-word article scores
            # strictly above a single-word one.
            level += 1e-5

        if word_count < 15:
            low, high = 3, 6
        elif word_count < 50:
            low, high = 4, 6
        else:
            low, high = 6, 8

        # Deliberately not rounded: the fractional precision is kept.
        return max(low, min(level, high))

    def _user_level(self, avg_diff, word_count):
        """Scale *avg_diff* for a user vocabulary and clamp it to [1, 8]."""
        level = (avg_diff / (word_count ** 0.35)) * 3.8

        # NOTE(review): hard-coded adjustments for two specific one-word
        # vocabularies; kept as-is because callers may rely on them.
        if len(self.d) == 1 and 'simple' in self.d:
            level = min(level, 4)
        if len(self.d) == 1 and 'pasture' in self.d:
            level = max(level, 5)

        # NOTE: a former ``level *= 0.8`` step guarded by "more than three
        # words" was removed; it could never run because at most three
        # words are selected for a user vocabulary.
        return round(max(1, min(level, 8)), 3)

    def _compute_word_difficulty(self, word):
        """Return a difficulty score from 2 to 6 based only on word length."""
        length = len(word)
        if length > 10:
            return 6
        if length > 8:
            return 5
        if length > 6:
            return 4
        if length > 4:
            return 3
        return 2
|
|
|
|
|
|
class UserVocabularyLevel(VocabularyLevelEstimator):
    """Level estimator backed by a user's word record.

    The presence of the ``d`` attribute is what makes the inherited
    ``level`` property treat the instance as a user vocabulary, where only
    the most recently added words are looked at.
    """

    def __init__(self, d):
        """Store the record *d* and expose its keys as ``word_lst``."""
        self.word_lst = [*d.keys()]
        self.d = d
|
|
|
|
|
|
class ArticleVocabularyLevel(VocabularyLevelEstimator):
    """Level estimator backed by the raw text of an article."""

    def __init__(self, content):
        """Keep *content* and derive ``word_lst`` from it.

        Characters that are neither word characters nor whitespace are
        dropped first, then runs of digits; what remains is lower-cased and
        split on whitespace.

        NOTE(review): an earlier comment here mentioned selecting the 10
        most difficult words, but no such selection is performed.
        """
        self.content = content
        # Strip punctuation, then digits, before tokenising.
        without_punctuation = re.sub(r'[^\w\s]', '', content)
        letters_only = re.sub(r'\d+', '', without_punctuation)
        self.word_lst = letters_only.lower().split()
|
|
|
|
|
|
if __name__ == '__main__':
    # Small manual demo: score one stored user record and one short text.
    record = load_record('frequency_mrlan85.pickle')
    print(record)

    print(UserVocabularyLevel(record).level)  # level is a property

    print(ArticleVocabularyLevel('This is an interesting article').level)