84 lines
2.5 KiB
Plaintext
84 lines
2.5 KiB
Plaintext
import pickle
|
|
import re
|
|
from collections import defaultdict
|
|
|
|
def load_record(pickle_fname):
    """Load and return the object stored in a pickle file.

    Args:
        pickle_fname: Path of the pickle file; it is opened in binary mode.

    Returns:
        The object reconstructed by ``pickle.load`` from the file contents.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        pickle.UnpicklingError: If the contents cannot be unpickled
            (``pickle.load`` may also raise other exceptions).
    """
    # NOTE(review): unpickling can execute arbitrary code, so this must only
    # be given files that come from a trusted source.
    with open(pickle_fname, 'rb') as f:
        return pickle.load(f)
|
|
|
|
class VocabularyLevelEstimator:
    """Estimate vocabulary difficulty from the sources each word appears in.

    ``word_lst`` starts empty; subclasses are expected to fill it. ``level``
    is the average per-word difficulty over that list.
    """

    # Loaded once, when the class body executes (i.e. when the module is
    # imported), and shared by every instance and subclass.
    _test = load_record('words_and_tests.p')  # map a word to the sources where it appears

    # Source tags ordered from hardest to easiest. The first tag found among
    # a word's sources decides its level.
    _LEVEL_BY_SOURCE = (
        ('IELTS', 6),
        ('BBC', 5),
        ('CET6', 4),
        ('CET4', 3),
        ('OXFORD3000', 2),
    )

    def __init__(self):
        """Create an estimator with an empty word list."""
        self.word_lst = []

    def calculate_level(self, word):
        """Calculate difficulty level for a single word.

        Args:
            word: The word to look up in ``_test``.

        Returns:
            0 if the word is not in ``_test``; otherwise the level of the
            hardest source tag listed for it (IELTS 6, BBC 5, CET6 4,
            CET4 3, OXFORD3000 2), or 1 if none of those tags is present.
        """
        if word not in self._test:
            return 0

        sources = self._test[word]  # look the word up once, not per tag
        for tag, difficulty in self._LEVEL_BY_SOURCE:
            if tag in sources:
                return difficulty
        return 1

    @property
    def level(self):
        """Average difficulty of the words in ``word_lst`` (0.0 when empty)."""
        if not self.word_lst:
            # Guard against dividing by zero for an empty word list.
            return 0.0

        # Calculate average difficulty of the words
        total = sum(self.calculate_level(word) for word in self.word_lst)
        return total / len(self.word_lst)
|
|
|
|
class UserVocabularyLevel(VocabularyLevelEstimator):
    """Vocabulary level of a user, computed from the keys of a word mapping."""

    def __init__(self, d):
        """Store ``d`` and use its keys as the word list.

        Args:
            d: Mapping whose keys are the user's words. The values are kept
                on ``self.d`` but are not read by this class.
        """
        super().__init__()
        self.d = d
        self.word_lst = list(d.keys())

    @property
    def level(self):
        """Average difficulty of the first three words (0.0 when empty).

        Returns:
            The mean of ``calculate_level`` over at most the first three
            entries of ``word_lst``, or 0.0 if the list is empty.
        """
        if not self.word_lst:
            return 0.0

        # Only the first three words are considered for a user.
        # NOTE(review): the original comment called these the "most recent 3
        # words", but the slice takes the FIRST three keys in d's iteration
        # order. Whether those are the most recent depends on how the caller
        # orders d -- confirm against the caller.
        recent_words = self.word_lst[:3]

        # Calculate average difficulty of the selected words
        total = sum(self.calculate_level(word) for word in recent_words)
        return total / len(recent_words)
|
|
|
|
class ArticleVocabularyLevel(VocabularyLevelEstimator):
    """Vocabulary level of an article, based on its ten hardest words."""

    def __init__(self, content):
        """Extract the word list from the article text.

        Args:
            content: The article text as a string.
        """
        super().__init__()
        self.content = content

        # Preprocess content: lowercase it and keep only runs of ASCII
        # letters, which drops punctuation and digits.
        words = re.findall(r'\b[a-zA-Z]+\b', content.lower())

        # Remove duplicates (keeping first-appearance order) and sort by
        # difficulty, hardest first. list.sort is stable even with
        # reverse=True, so equally difficult words keep their article order.
        unique_words = list(dict.fromkeys(words))
        unique_words.sort(key=self.calculate_level, reverse=True)

        # Select top 10 difficult words
        self.word_lst = unique_words[:10]
|
|
|
|
if __name__ == '__main__':
    # Example usage
    # d = load_record('frequency_mrlan85.pickle')
    # print(d)
    # user = UserVocabularyLevel(d)
    # print(user.level)  # level is a property
    # article = ArticleVocabularyLevel('This is an interesting article')
    # print(article.level)
    pass