"""Vocabulary-level estimation for users and articles.

Reconstructed from a whitespace-mangled ``git format-patch`` email
(commit cea015f18a164d37f563e9291928844e230e0ea9, wanglulu, 2025-06-09,
"更新测试错误article中的提示信息并回退vocabulary版本").  This module is the
post-image of ``app/vocabulary.py`` from that patch.  The patch's only
other change — in ``app/Article.py`` — replaced the default-article
answer string with
``"It's a default article for testing.Please contact the administrator"``
(NOTE(review): missing space after the period in that string — confirm
with the author before fixing, it is user-visible text).

Review fixes applied relative to the patched code:

1. ``VocabularyLevelEstimator.level`` never accumulated into ``total``
   and therefore always returned 0.0.
2. ``load_record('static\\words_and_tests.p')`` used a Windows-only
   backslash path (and an invalid ``\\w`` escape sequence); the path is
   now built portably.
3. ``get_word_level`` fell off the end and returned ``None`` when a
   word's source list matched none of the known exam tags, which would
   crash the ``total += ...`` in the ``level`` properties.
4. ``filter_user_frequency`` compared the *unstemmed* word against a
   list of *stems*, so its duplicate check never fired.
"""
import os
import pickle
import random
import re
from collections import defaultdict
from datetime import datetime, timedelta

import snowballstemmer
from flask import session


def load_record(pickle_fname):
    """Load and return the object pickled in *pickle_fname*.

    Raises FileNotFoundError / pickle.UnpicklingError to the caller —
    a missing vocabulary file is a deployment error, not a runtime one.
    NOTE(review): pickle.load on untrusted files is unsafe; this assumes
    the file ships with the application.
    """
    with open(pickle_fname, 'rb') as f:
        return pickle.load(f)


def is_english_word(word):
    """Return True if *word* consists solely of ASCII letters."""
    return bool(re.match(r'^[a-zA-Z]+$', word))


def is_valid_datetime_string(date_string, format='%Y%m%d%H%M'):
    """Return True if *date_string* parses under *format* (default YYYYMMDDHHMM)."""
    try:
        datetime.strptime(date_string, format)
        return True
    except ValueError:
        return False


def remove_non_words(input_string):
    """Strip every character that is not a letter or whitespace and
    collapse runs of whitespace to single spaces."""
    cleaned_string = re.sub(r'[^a-zA-Z\s]', '', input_string)
    return ' '.join(cleaned_string.split())


class VocabularyLevelEstimator:
    """Base estimator: maps a word to a difficulty level (3-8) using the
    exam word lists (CET4/CET6, Oxford, IELTS, BBC, ...) it appears in."""

    # word -> list of exam sources (e.g. ["CET4", "IELTS"]).
    # BUGFIX: the patch used the Windows-only, invalid-escape path
    # 'static\words_and_tests.p'; build it portably instead.
    _test = load_record(os.path.join('static', 'words_and_tests.p'))

    @property
    def level(self):
        """Average difficulty of ``self.word_lst`` (0.0 for an empty list).

        BUGFIX: the patched version printed each word but never added
        anything to ``total``, so it always returned 0.0.  It now
        accumulates ``get_word_level`` like the subclass overrides do.
        ``word_lst`` is supplied by subclasses' ``__init__``.
        """
        total = 0.0
        num = 0
        for word in self.word_lst:
            num += 1
            if word in self._test:
                print(f'{word} : {self._test[word]}')  # debug: word and its sources
            else:
                print(f'{word}')  # debug: word with no known source
            total += self.get_word_level(word)
        return total / num if num else 0.0

    def get_word_level(self, word):
        """Return an integer difficulty level (3-8) for *word*.

        Unknown words and common irregular forms score 3; otherwise the
        hardest applicable exam tag decides the level.
        """
        # Common irregular verb forms / high-frequency words, treated as
        # easy regardless of which exam lists contain them.
        # NOTE(review): 'got ' carries a trailing space (kept verbatim
        # from the patch) and can never match a split token — confirm.
        other = ['went', 'heard', 'i', 'feet', 'got', 'been', 'gone', 'done', 'had', 'said', 'seen', 'made',
                 'taken', 'come', 'gotten', 'got ', 'ran', 'eaten', 'written', 'found', 'given', 'told',
                 'brought', 'kept', 'stood', 'sat', 'won', 'bought', 'caught', 'begun', 'drank', 'rang', 'sang',
                 'swam', 'blew', 'drew', 'flew', 'grew', 'knew', 'threw', 'shown', 'broken', 'chosen', 'forgotten',
                 'spoke', 'woke', 'woken', 'driven', 'fell', 'given', 'risen', 'mistaken', 'ridden', 'lain', 'lied']

        if word not in self._test:
            return 3  # not in any exam list: assume mid-level
        if word in other:
            return 3  # common irregular form: easy

        sources = self._test[word]
        # Hardest-first would be safer, but keep the patch's precedence:
        # CET4 < OXFORD3000 < CET6/GRADUATE < OXFORD5000/IELTS < BBC.
        if 'CET4' in sources:
            return 4
        if 'OXFORD3000' in sources:
            return 5
        if 'CET6' in sources or 'GRADUATE' in sources:
            return 6
        if 'OXFORD5000' in sources or 'IELTS' in sources:
            return 7
        if 'BBC' in sources:
            return 8
        # BUGFIX: the patched version fell off the end here and returned
        # None, crashing the arithmetic in the level properties.
        return 3


class UserVocabularyLevel(VocabularyLevelEstimator):
    """Estimate a user's vocabulary level from the new words they
    recorded during the last week."""

    # Class-level default so `level` is safe even when __init__ received
    # an empty dict and never ran the filter.
    filtered_frequency = []

    def __init__(self, d):
        # d: user's new-word store; presumably word -> [timestamp, ...]
        # (only element [0], a '%Y%m%d%H%M' string, is read here).
        if d:
            self.d = d
            self.word_lst = list(d.keys())
            self.filter_user_frequency()

    def filter_user_frequency(self):
        """Collect the stems of valid English words added within the
        last 7 days into ``self.filtered_frequency``."""
        stemmer = snowballstemmer.stemmer('english')
        # Timestamps are zero-padded fixed-width strings, so string
        # comparison orders them chronologically.
        range_datetime = (datetime.now() - timedelta(days=7)).strftime('%Y%m%d%H%M')

        self.filtered_frequency = []
        for word in self.d:
            if is_english_word(word) and is_valid_datetime_string(self.d[word][0]):
                if self.d[word][0] > range_datetime:
                    # BUGFIX: the patch checked the *unstemmed* word
                    # against this list of *stems*, so the dedup check
                    # never matched; stem first, then test membership.
                    stem = stemmer.stemWord(word)
                    if stem not in self.filtered_frequency:
                        self.filtered_frequency.append(stem)

    @property
    def level(self):
        """Average difficulty of the filtered recent words (0.0 if none)."""
        if not self.filtered_frequency:
            return 0.0
        total = sum(self.get_word_level(w) for w in self.filtered_frequency)
        return total / len(self.filtered_frequency)


class ArticleVocabularyLevel(VocabularyLevelEstimator):
    """Estimate an article's difficulty from its hardest words."""

    # Class-level default so `level` is safe for empty content.
    difficulty_word = dict()

    def __init__(self, content):
        # content: raw article text; punctuation is stripped before use.
        if content:
            self.content = remove_non_words(content)
            self.word_lst = self.content.lower().split()
            self.select_difficulty_word()

    def select_difficulty_word(self, n=10):
        """Keep the *n* highest-level stems of the article in
        ``self.difficulty_word`` (stem -> level)."""
        stemmer = snowballstemmer.stemmer('english')
        self.difficulty_word = {}
        for token in self.word_lst:
            stem = stemmer.stemWord(token)
            self.difficulty_word[stem] = self.get_word_level(stem)

        if self.difficulty_word:
            ranked = sorted(self.difficulty_word.items(),
                            key=lambda item: item[1], reverse=True)
            self.difficulty_word = dict(ranked[:n])

    @property
    def level(self):
        """Average level of the selected hardest words (0.0 if none)."""
        if not self.difficulty_word:
            return 0.0
        return sum(self.difficulty_word.values()) / len(self.difficulty_word)


if __name__ == '__main__':
    d = load_record('static/frequency/frequency_sb.pickle')  # sample user data
    print(d)

    user = UserVocabularyLevel(d)
    print('用户词汇水平:')
    print(user.level)  # user's estimated vocabulary level

    s = """Energetic = haze dynamic = vigorous = animated Such is Love , Plain like Water
    port him to stand up. She scolded him for not having waken her up. He said that he could manage. A serious quarrel was about to burst out again.
    I called them from Zhuhai, the beautiful city of relaxation and exciting views. I wanted to depict to them how pretty a city Zhuhai is."""

    article = ArticleVocabularyLevel(s)
    print('文章词汇难度:')
    print(article.level)  # article's estimated difficulty

    # Round-trip the sample article through pickle as a smoke test.
    with open('test/article_test.p', 'wb') as file:
        pickle.dump(s, file)

    with open('test/article_test.p', 'rb') as file:
        loaded_data = pickle.load(file)
        print(loaded_data)