Please review our vocabulary.py file #195

'''
   Estimate a user's vocabulary level given their vocabulary data
   Estimate an English article's difficulty level given its content
   Preliminary design

   Hui, 2024-09-23
   Last updated: 2024-09-25, 2024-09-30
'''

import pickle

import nltk

DIFFICULTY_MAPPING = {
    'BBC': 2,        # basic vocabulary
    'CET4': 3,       # CET-4 (college English)
    'CET6': 4,       # CET-6
    'GRADUATE': 5,   # postgraduate entrance exam vocabulary
    'IELTS': 6,      # IELTS
    'OXFORD3000': 4, # Oxford 3000 core words
    'OXFORD5000': 7  # Oxford 5000 words
}

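# --- Illustrative sketch (editor's addition, not part of the submitted file) ---
# The scoring below assumes 'words_and_tests.p' maps each word to the collection
# of test/word-list names it appears in; a word's difficulty is then the highest
# DIFFICULTY_MAPPING score among those sources. The sample data here is made up.
_EXAMPLE_WORD_SOURCES = {
    'comprehensive': {'CET6', 'IELTS'},  # hardest source wins: IELTS -> 6
    'cat': {'BBC'},                      # basic vocabulary -> 2
}

def _example_word_difficulty(word, sources_map=_EXAMPLE_WORD_SOURCES):
    # Highest score among the sources a word appears in; 0 for unknown words.
    sources = sources_map.get(word, ())
    return max((DIFFICULTY_MAPPING.get(src, 0) for src in sources), default=0)
# e.g. _example_word_difficulty('comprehensive') == 6
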
def load_record(pickle_fname):
    with open(pickle_fname, 'rb') as f:
        d = pickle.load(f)
    return d


class VocabularyLevelEstimator:
    _test = load_record('words_and_tests.p')  # map a word to the sources where it appears

    @property
    def level(self):
        if not self.word_lst:  # no valid words to score
            return 0.0  # default level for an empty word list
        total = 0.0
        valid_words = 0
        for word in self.word_lst:
            if word in self._test:
                sources = self._test[word]
                total += max(DIFFICULTY_MAPPING.get(src, 0) for src in sources)
                valid_words += 1
        return total / valid_words if valid_words > 0 else 0.0

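# --- Illustrative sketch (editor's addition, not part of the submitted file) ---
# level only needs self.word_lst, so any subclass that fills word_lst inherits
# the scoring; words missing from _test are skipped rather than counted as zero.
class _ExampleWordListLevel(VocabularyLevelEstimator):
    def __init__(self, word_lst):
        self.word_lst = word_lst
# e.g. _ExampleWordListLevel(['comprehensive', 'banana']).level averages the
# scores of whichever of those words appear in _test.
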
class UserVocabularyLevel(VocabularyLevelEstimator):
    def __init__(self, d, recent_n=3):
        self.d = d
        self.recent_n = recent_n
        # Sort words by timestamp in descending order and keep the recent_n most recent ones
        sorted_words = sorted(d.keys(), key=lambda word: d[word][-1], reverse=True)
        self.word_lst = sorted_words[:recent_n]

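# --- Illustrative sketch (editor's addition, not part of the submitted file) ---
# UserVocabularyLevel only relies on d[word][-1] being sortable, so the user
# record is assumed to look roughly like word -> [..., latest timestamp];
# the exact field layout and the values below are guesses for illustration.
def _example_user_level():
    fake_record = {
        'sophisticated': [1, '2024-09-25'],
        'cat': [3, '2024-09-01'],
        'paradigm': [1, '2024-09-30'],
        'dog': [2, '2024-08-15'],
    }
    user = UserVocabularyLevel(fake_record, recent_n=3)
    # word_lst holds the 3 most recently seen words:
    # ['paradigm', 'sophisticated', 'cat']; .level averages their difficulties.
    return user.level
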
class ArticleVocabularyLevel(VocabularyLevelEstimator):
    def __init__(self, content):
        self.content = content
        # Preprocess: tokenize, lowercase, strip punctuation, drop stopwords
        import re
        from nltk.corpus import stopwords
        nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))
        words = re.findall(r'\b\w+\b', content.lower())
        self.word_lst = [word for word in words if word not in stop_words]
        # Keep only the 10 hardest words, ranked by difficulty score
        self.word_lst = sorted(
            self.word_lst,
            key=lambda w: self._get_difficulty(w),
            reverse=True
        )[:10]

    def _get_difficulty(self, word):
        if word in self._test:
            return max(DIFFICULTY_MAPPING.get(src, 0) for src in self._test[word])
        return 0

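# --- Possible refinement (editor's sketch, not part of the submitted file) ---
# ArticleVocabularyLevel.__init__ calls nltk.download('stopwords') on every
# construction. A one-off guarded download, as sketched below, avoids repeating
# that check; whether to adopt it is up to the authors.
def _ensure_stopwords_downloaded():
    from nltk.corpus import stopwords
    try:
        stopwords.words('english')   # raises LookupError if the corpus is absent
    except LookupError:
        nltk.download('stopwords')
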
if __name__ == '__main__':
    d = load_record('frequency_mrlan85.pickle')
    print(d)
    print("======================================================")
    user = UserVocabularyLevel(d)
    print(user.level)  # level is a property
    print("======================================================")
    article = ArticleVocabularyLevel('This is an interesting article')
    print(article.level)