Add test_vocabulary_output_2025_06_05.txt
parent
7a15563d9d
commit
219fdbc4ea
|
@ -0,0 +1,84 @@
|
|||
import pickle
|
||||
import re
|
||||
from collections import defaultdict
|
||||
|
||||
def load_record(pickle_fname):
    """Read a pickle file and return the object stored in it.

    Args:
        pickle_fname: Path of the pickle file; it is opened in binary mode.

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.
    """
    with open(pickle_fname, 'rb') as stream:
        return pickle.load(stream)
|
||||
|
||||
class VocabularyLevelEstimator:
    """Base estimator that scores the words held in ``self.word_lst``.

    Subclasses decide which words go into ``word_lst``; this class supplies
    the per-word scoring and the averaged ``level`` property.
    """

    # Read once, when the class body is executed. Maps a word to the sources
    # where it appears.
    _test = load_record('words_and_tests.p')

    def __init__(self):
        """Start with an empty word list."""
        self.word_lst = []

    def calculate_level(self, word):
        """Calculate the difficulty level of a single word.

        Args:
            word: The word to look up in the class-level ``_test`` mapping.

        Returns:
            0 when the word is not in the mapping; otherwise the score of the
            first matching source tag, checked in the order IELTS (6),
            BBC (5), CET6 (4), CET4 (3), OXFORD3000 (2); 1 when the word is
            known but carries none of these tags.
        """
        if word not in self._test:
            return 0

        sources = self._test[word]
        # Ordered from hardest to easiest: the first tag found wins.
        tag_scores = (
            ('IELTS', 6),
            ('BBC', 5),
            ('CET6', 4),
            ('CET4', 3),
            ('OXFORD3000', 2),
        )
        for tag, score in tag_scores:
            if tag in sources:
                return score
        return 1

    @property
    def level(self):
        """Mean per-word difficulty over ``word_lst``; 0.0 for an empty list."""
        words = self.word_lst
        if not words:
            return 0.0

        return sum(map(self.calculate_level, words)) / len(words)
|
||||
|
||||
class UserVocabularyLevel(VocabularyLevelEstimator):
    """Vocabulary level of a user, computed from the keys of a word record."""

    def __init__(self, d):
        """Keep the record and use its keys as the word list.

        Args:
            d: Mapping whose keys are the user's words. The values are stored
                with ``d`` but are not read by this class.
        """
        super().__init__()
        self.d = d
        self.word_lst = [*d.keys()]

    @property
    def level(self):
        """Mean difficulty of at most three words; 0.0 when there are none.

        NOTE(review): the slice takes the FIRST three keys in the record's
        iteration order. Whether those are the user's most recent words
        depends on how the caller orders ``d`` -- confirm.
        """
        sample = self.word_lst[:3]
        if not sample:
            return 0.0

        scores = [self.calculate_level(entry) for entry in sample]
        return sum(scores) / len(sample)
|
||||
|
||||
class ArticleVocabularyLevel(VocabularyLevelEstimator):
    """Vocabulary level of an article, based on its ten hardest distinct words."""

    def __init__(self, content):
        """Tokenise ``content`` and keep the ten highest-scoring words.

        Args:
            content: Article text. It is lower-cased and only runs of ASCII
                letters are kept as words.
        """
        super().__init__()
        self.content = content

        # Lower-case first, then pull out alphabetic runs; punctuation and
        # digits are dropped by the pattern.
        tokens = re.findall(r'\b[a-zA-Z]+\b', content.lower())

        # dict.fromkeys de-duplicates while keeping first-occurrence order.
        distinct = dict.fromkeys(tokens)

        # Stable descending sort: words with equal scores stay in the order
        # they first appeared in the text.
        ranked = sorted(distinct, key=self.calculate_level, reverse=True)

        self.word_lst = ranked[:10]
|
||||
|
||||
if __name__ == '__main__':
    # Example usage (kept commented out, so running this module directly
    # does nothing):
    #
    #     d = load_record('frequency_mrlan85.pickle')
    #     print(d)
    #     user = UserVocabularyLevel(d)
    #     print(user.level)  # level is a property
    #     article = ArticleVocabularyLevel('This is an interesting article')
    #     print(article.level)
    pass
|
Loading…
Reference in New Issue