Add test_vocabulary_output_2025_06_05.txt

2025-06-14 18:45:36 +08:00 · 2025-06-14 18:45:36 +08:00 · 219fdbc4ea
parent 7a15563d9d
commit 219fdbc4ea
1 changed files with 84 additions and 0 deletions
--- a/test_vocabulary_output_2025_06_05.txt
+++ b/test_vocabulary_output_2025_06_05.txt
@ -0,0 +1,84 @@
+import pickle
+import re
+from collections import defaultdict
+
+def load_record(pickle_fname):
+    with open(pickle_fname, 'rb') as f:
+        d = pickle.load(f)
+    return d
+
+class VocabularyLevelEstimator:
+    _test = load_record('words_and_tests.p')  # map a word to the sources where it appears
+
+    def __init__(self):
+        self.word_lst = []
+
+    def calculate_level(self, word):
+        """Calculate difficulty level for a single word"""
+        if word in self._test:
+            if 'IELTS' in self._test[word]:
+                return 6
+            elif 'BBC' in self._test[word]:
+                return 5
+            elif 'CET6' in self._test[word]:
+                return 4
+            elif 'CET4' in self._test[word]:
+                return 3
+            elif 'OXFORD3000' in self._test[word]:
+                return 2
+            else:
+                return 1
+        else:
+            return 0
+
+    @property
+    def level(self):
+        if not self.word_lst:
+            return 0.0
+
+        # Calculate average difficulty of the words
+        total = sum(self.calculate_level(word) for word in self.word_lst)
+        return total / len(self.word_lst)
+
+class UserVocabularyLevel(VocabularyLevelEstimator):
+    def __init__(self, d):
+        super().__init__()
+        self.d = d
+        self.word_lst = list(d.keys())
+
+    @property
+    def level(self):
+        if not self.word_lst:
+            return 0.0
+
+        # Only consider the most recent 3 words for user
+        recent_words = self.word_lst[:3]
+
+        # Calculate average difficulty of the recent words
+        total = sum(self.calculate_level(word) for word in recent_words)
+        return total / len(recent_words)
+
+class ArticleVocabularyLevel(VocabularyLevelEstimator):
+    def __init__(self, content):
+        super().__init__()
+        self.content = content
+
+        # Preprocess content: remove punctuation and split into words
+        words = re.findall(r'\b[a-zA-Z]+\b', content.lower())
+
+        # Remove duplicates and sort by difficulty (descending)
+        unique_words = list(dict.fromkeys(words))
+        unique_words.sort(key=lambda w: self.calculate_level(w), reverse=True)
+
+        # Select top 10 difficult words
+        self.word_lst = unique_words[:10]
+
+if __name__ == '__main__':
+    # Example usage (left commented out; this block itself only runs `pass`):
+    # d = load_record('frequency_mrlan85.pickle')
+    # print(d)
+    # user = UserVocabularyLevel(d)
+    # print(user.level)  # level is a property
+    # article = ArticleVocabularyLevel('This is an interesting article')
+    # print(article.level)
+    pass