"""Vocabulary-level estimation for users and articles.

Reconstructed from a whitespace-mangled ``git format-patch`` email
(commit cea015f18a164d37f563e9291928844e230e0ea9, wanglulu, 2025-06-09,
"更新测试错误article中的提示信息并回退vocabulary版本").  This module is the
post-image of ``app/vocabulary.py`` from that patch.  The patch's only
other change — in ``app/Article.py`` — replaced the default-article
answer string with
``"It's a default article for testing.Please contact the administrator"``
(NOTE(review): missing space after the period in that string — confirm
with the author before fixing, it is user-visible text).

Review fixes applied relative to the patched code:

1. ``VocabularyLevelEstimator.level`` never accumulated into ``total``
   and therefore always returned 0.0.
2. ``load_record('static\\words_and_tests.p')`` used a Windows-only
   backslash path (and an invalid ``\\w`` escape sequence); the path is
   now built portably.
3. ``get_word_level`` fell off the end and returned ``None`` when a
   word's source list matched none of the known exam tags, which would
   crash the ``total += ...`` in the ``level`` properties.
4. ``filter_user_frequency`` compared the *unstemmed* word against a
   list of *stems*, so its duplicate check never fired.
"""
import os
import pickle
import random
import re
from collections import defaultdict
from datetime import datetime, timedelta

import snowballstemmer
from flask import session


def load_record(pickle_fname):
    """Load and return the object pickled in *pickle_fname*.

    Raises FileNotFoundError / pickle.UnpicklingError to the caller —
    a missing vocabulary file is a deployment error, not a runtime one.
    NOTE(review): pickle.load on untrusted files is unsafe; this assumes
    the file ships with the application.
    """
    with open(pickle_fname, 'rb') as f:
        return pickle.load(f)


def is_english_word(word):
    """Return True if *word* consists solely of ASCII letters."""
    return bool(re.match(r'^[a-zA-Z]+$', word))


def is_valid_datetime_string(date_string, format='%Y%m%d%H%M'):
    """Return True if *date_string* parses under *format* (default YYYYMMDDHHMM)."""
    try:
        datetime.strptime(date_string, format)
        return True
    except ValueError:
        return False


def remove_non_words(input_string):
    """Strip every character that is not a letter or whitespace and
    collapse runs of whitespace to single spaces."""
    cleaned_string = re.sub(r'[^a-zA-Z\s]', '', input_string)
    return ' '.join(cleaned_string.split())


class VocabularyLevelEstimator:
    """Base estimator: maps a word to a difficulty level (3-8) using the
    exam word lists (CET4/CET6, Oxford, IELTS, BBC, ...) it appears in."""

    # word -> list of exam sources (e.g. ["CET4", "IELTS"]).
    # BUGFIX: the patch used the Windows-only, invalid-escape path
    # 'static\words_and_tests.p'; build it portably instead.
    _test = load_record(os.path.join('static', 'words_and_tests.p'))

    @property
    def level(self):
        """Average difficulty of ``self.word_lst`` (0.0 for an empty list).

        BUGFIX: the patched version printed each word but never added
        anything to ``total``, so it always returned 0.0.  It now
        accumulates ``get_word_level`` like the subclass overrides do.
        ``word_lst`` is supplied by subclasses' ``__init__``.
        """
        total = 0.0
        num = 0
        for word in self.word_lst:
            num += 1
            if word in self._test:
                print(f'{word} : {self._test[word]}')  # debug: word and its sources
            else:
                print(f'{word}')  # debug: word with no known source
            total += self.get_word_level(word)
        return total / num if num else 0.0

    def get_word_level(self, word):
        """Return an integer difficulty level (3-8) for *word*.

        Unknown words and common irregular forms score 3; otherwise the
        hardest applicable exam tag decides the level.
        """
        # Common irregular verb forms / high-frequency words, treated as
        # easy regardless of which exam lists contain them.
        # NOTE(review): 'got ' carries a trailing space (kept verbatim
        # from the patch) and can never match a split token — confirm.
        other = ['went', 'heard', 'i', 'feet', 'got', 'been', 'gone', 'done', 'had', 'said', 'seen', 'made',
                 'taken', 'come', 'gotten', 'got ', 'ran', 'eaten', 'written', 'found', 'given', 'told',
                 'brought', 'kept', 'stood', 'sat', 'won', 'bought', 'caught', 'begun', 'drank', 'rang', 'sang',
                 'swam', 'blew', 'drew', 'flew', 'grew', 'knew', 'threw', 'shown', 'broken', 'chosen', 'forgotten',
                 'spoke', 'woke', 'woken', 'driven', 'fell', 'given', 'risen', 'mistaken', 'ridden', 'lain', 'lied']

        if word not in self._test:
            return 3  # not in any exam list: assume mid-level
        if word in other:
            return 3  # common irregular form: easy

        sources = self._test[word]
        # Hardest-first would be safer, but keep the patch's precedence:
        # CET4 < OXFORD3000 < CET6/GRADUATE < OXFORD5000/IELTS < BBC.
        if 'CET4' in sources:
            return 4
        if 'OXFORD3000' in sources:
            return 5
        if 'CET6' in sources or 'GRADUATE' in sources:
            return 6
        if 'OXFORD5000' in sources or 'IELTS' in sources:
            return 7
        if 'BBC' in sources:
            return 8
        # BUGFIX: the patched version fell off the end here and returned
        # None, crashing the arithmetic in the level properties.
        return 3


class UserVocabularyLevel(VocabularyLevelEstimator):
    """Estimate a user's vocabulary level from the new words they
    recorded during the last week."""

    # Class-level default so `level` is safe even when __init__ received
    # an empty dict and never ran the filter.
    filtered_frequency = []

    def __init__(self, d):
        # d: user's new-word store; presumably word -> [timestamp, ...]
        # (only element [0], a '%Y%m%d%H%M' string, is read here).
        if d:
            self.d = d
            self.word_lst = list(d.keys())
            self.filter_user_frequency()

    def filter_user_frequency(self):
        """Collect the stems of valid English words added within the
        last 7 days into ``self.filtered_frequency``."""
        stemmer = snowballstemmer.stemmer('english')
        # Timestamps are zero-padded fixed-width strings, so string
        # comparison orders them chronologically.
        range_datetime = (datetime.now() - timedelta(days=7)).strftime('%Y%m%d%H%M')

        self.filtered_frequency = []
        for word in self.d:
            if is_english_word(word) and is_valid_datetime_string(self.d[word][0]):
                if self.d[word][0] > range_datetime:
                    # BUGFIX: the patch checked the *unstemmed* word
                    # against this list of *stems*, so the dedup check
                    # never matched; stem first, then test membership.
                    stem = stemmer.stemWord(word)
                    if stem not in self.filtered_frequency:
                        self.filtered_frequency.append(stem)

    @property
    def level(self):
        """Average difficulty of the filtered recent words (0.0 if none)."""
        if not self.filtered_frequency:
            return 0.0
        total = sum(self.get_word_level(w) for w in self.filtered_frequency)
        return total / len(self.filtered_frequency)


class ArticleVocabularyLevel(VocabularyLevelEstimator):
    """Estimate an article's difficulty from its hardest words."""

    # Class-level default so `level` is safe for empty content.
    difficulty_word = dict()

    def __init__(self, content):
        # content: raw article text; punctuation is stripped before use.
        if content:
            self.content = remove_non_words(content)
            self.word_lst = self.content.lower().split()
            self.select_difficulty_word()

    def select_difficulty_word(self, n=10):
        """Keep the *n* highest-level stems of the article in
        ``self.difficulty_word`` (stem -> level)."""
        stemmer = snowballstemmer.stemmer('english')
        self.difficulty_word = {}
        for token in self.word_lst:
            stem = stemmer.stemWord(token)
            self.difficulty_word[stem] = self.get_word_level(stem)

        if self.difficulty_word:
            ranked = sorted(self.difficulty_word.items(),
                            key=lambda item: item[1], reverse=True)
            self.difficulty_word = dict(ranked[:n])

    @property
    def level(self):
        """Average level of the selected hardest words (0.0 if none)."""
        if not self.difficulty_word:
            return 0.0
        return sum(self.difficulty_word.values()) / len(self.difficulty_word)


if __name__ == '__main__':
    d = load_record('static/frequency/frequency_sb.pickle')  # sample user data
    print(d)

    user = UserVocabularyLevel(d)
    print('用户词汇水平:')
    print(user.level)  # user's estimated vocabulary level

    s = """Energetic = haze dynamic = vigorous = animated Such is Love , Plain like Water
    port him to stand up. She scolded him for not having waken her up. He said that he could manage. A serious quarrel was about to burst out again.
    I called them from Zhuhai, the beautiful city of relaxation and exciting views. I wanted to depict to them how pretty a city Zhuhai is."""

    article = ArticleVocabularyLevel(s)
    print('文章词汇难度:')
    print(article.level)  # article's estimated difficulty

    # Round-trip the sample article through pickle as a smoke test.
    with open('test/article_test.p', 'wb') as file:
        pickle.dump(s, file)

    with open('test/article_test.p', 'rb') as file:
        loaded_data = pickle.load(file)
        print(loaded_data)