"""Estimate vocabulary difficulty levels for word lists, users and articles."""

# Provenance (taken from the git patch header this module was extracted from):
#   commit  219fdbc4eafe821dc26de9fb75d92692b9b8834b
#   From:   =?UTF-8?q?=E9=92=9F=E7=82=BD=E7=8E=AE?=
#           <14937682+zhong-chiwei@user.noreply.gitee.com>
#   Date:   Sat, 14 Jun 2025 18:45:36 +0800
#   Subject: [PATCH] Add test_vocabulary_output_2025_06_05.txt

import pickle
import re
from collections import defaultdict

# Compiled once: runs of ASCII letters delimited by word boundaries.
_WORD_RE = re.compile(r'\b[a-zA-Z]+\b')


def load_record(pickle_fname):
    """Load and return the object stored in the pickle file *pickle_fname*.

    Raises whatever ``open`` / ``pickle.load`` raise (e.g. ``FileNotFoundError``
    when the file does not exist).
    """
    # SECURITY: pickle.load can execute arbitrary code while deserialising.
    # Only call this on files that come from a trusted source.
    with open(pickle_fname, 'rb') as f:
        return pickle.load(f)


class _LazyRecord:
    """Non-data descriptor that loads a pickle record on first access.

    Deferring the load keeps ``import`` of this module free of file I/O, so
    the module can be imported even when the pickle file is not present.
    The loaded object is cached and shared by every class/instance access.
    """

    def __init__(self, pickle_fname):
        self._pickle_fname = pickle_fname
        self._record = None

    def __get__(self, instance, owner=None):
        if self._record is None:
            self._record = load_record(self._pickle_fname)
        return self._record


class VocabularyLevelEstimator:
    """Base estimator: averages the difficulty level of ``self.word_lst``."""

    # Maps a word to the sources where it appears. Loaded lazily from
    # 'words_and_tests.p' the first time it is read.
    _test = _LazyRecord('words_and_tests.p')

    # (source marker, level) pairs in priority order: the first marker found
    # in a word's sources decides its level.
    _SOURCE_LEVELS = (
        ('IELTS', 6),
        ('BBC', 5),
        ('CET6', 4),
        ('CET4', 3),
        ('OXFORD3000', 2),
    )

    def __init__(self):
        self.word_lst = []

    def calculate_level(self, word):
        """Return the difficulty level (0-6) of a single *word*.

        0 means the word is not in the record at all; 1 means it is in the
        record but under none of the known source markers; 2-6 follow
        ``_SOURCE_LEVELS``.
        """
        record = self._test
        if word not in record:
            return 0
        sources = record[word]
        for marker, level in self._SOURCE_LEVELS:
            if marker in sources:
                return level
        return 1

    def _average_level(self, words):
        """Return the mean level of *words*, or 0.0 when *words* is empty."""
        if not words:
            return 0.0
        return sum(self.calculate_level(word) for word in words) / len(words)

    @property
    def level(self):
        """Average difficulty level of all words in ``self.word_lst``."""
        return self._average_level(self.word_lst)


class UserVocabularyLevel(VocabularyLevelEstimator):
    """Estimate a user's level from the words recorded in a dict."""

    # How many leading words of ``word_lst`` contribute to ``level``.
    RECENT_WORD_COUNT = 3

    def __init__(self, d):
        """Store *d* and use its keys, in iteration order, as the word list."""
        super().__init__()
        self.d = d
        self.word_lst = list(d)

    @property
    def level(self):
        """Average level of the first ``RECENT_WORD_COUNT`` words."""
        # NOTE(review): the original comment calls these the "most recent"
        # words, but the code simply takes the first keys in dict order.
        # Confirm that callers build `d` newest-first, or sort by the
        # recorded values here if they carry timestamps.
        return self._average_level(self.word_lst[:self.RECENT_WORD_COUNT])


class ArticleVocabularyLevel(VocabularyLevelEstimator):
    """Estimate an article's level from its most difficult distinct words."""

    # How many of the hardest distinct words are kept in ``word_lst``.
    TOP_WORD_COUNT = 10

    def __init__(self, content):
        """Tokenise *content* and keep its ``TOP_WORD_COUNT`` hardest words."""
        super().__init__()
        self.content = content

        # Lower-case the text and keep alphabetic tokens only.
        words = _WORD_RE.findall(content.lower())

        # Drop duplicates while preserving first-occurrence order, then rank
        # by difficulty (hardest first). The sort is stable, so words of
        # equal level stay in order of appearance.
        unique_words = list(dict.fromkeys(words))
        unique_words.sort(key=self.calculate_level, reverse=True)

        self.word_lst = unique_words[:self.TOP_WORD_COUNT]


if __name__ == '__main__':
    # Example usage:
    # d = load_record('frequency_mrlan85.pickle')
    # print(d)
    # user = UserVocabularyLevel(d)
    # print(user.level)  # level is a property
    # article = ArticleVocabularyLevel('This is an interesting article')
    # print(article.level)
    pass