"""Estimate vocabulary difficulty levels for a user's word list or an article."""

import pickle
import re

# Mock test data used as a fallback when the pickle file is missing; in real
# use the table should be loaded from a file.
_TEST_MOCK = {
    'simple': 2, 'apple': 1, 'happy': 2, 'open': 3, 'like': 2, 'work': 2, 'make': 2, 'money': 2,
    'source': 3, 'software': 3, 'successful': 4, 'project': 3, 'develop': 3, 'process': 3,
    'available': 4, 'organizations': 4,
    'extinct': 6, 'modification': 7, 'apparently': 7, 'abruptly': 7, 'rentable': 7, 'predictable': 6,
    'pasture': 7, 'putrid': 7, 'frivolous': 8, 'sessile': 8, 'dearth': 7, 'presumptuous': 7,
    'fringe': 8, 'economics': 5, 'summarize': 5, 'stare': 5, 'eagerly': 5, 'completely': 4, 'maintained': 5,
    'geological': 6, 'embryological': 7, 'coadaptation': 8, 'exterminated': 7, 'contingencies': 7,
    'intercrossing': 6, 'coleopterous': 8, 'marin': 5, 'organised': 5, 'monopoly': 8, 'inorganic': 7,
    'xyz': 0, '': 0,
}

# Runs of ASCII letters delimited by word boundaries. A token glued to digits
# or to non-ASCII letters (e.g. "abc123") has no boundary and is skipped whole.
_WORD_RE = re.compile(r'\b[a-zA-Z]+\b')


def load_record(pickle_fname):
    """Return the object pickled in *pickle_fname*.

    Falls back to the module-level ``_TEST_MOCK`` table (the same dict
    object, not a copy) when the file does not exist. Any other error, such
    as a permission problem or a corrupt pickle, propagates to the caller.

    SECURITY: ``pickle.load`` can execute arbitrary code while loading, so
    only call this on files that come from a trusted source.
    """
    try:
        with open(pickle_fname, 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError:
        return _TEST_MOCK


class VocabularyLevelEstimator:
    """Score a list of words against a word-to-level lookup table.

    Subclasses must set ``self.word_lst`` (an iterable of words) before
    ``level`` is read.
    """

    # Lookup table, loaded once when the class is defined; the mock table is
    # used if the file is missing. ``level`` adds the values together, so it
    # assumes they are numbers -- TODO confirm against the real
    # words_and_tests.p.
    _test = load_record('words_and_tests.p')

    @property
    def level(self):
        """Return the estimated level of ``self.word_lst`` as a float.

        Only words present in the lookup table count. The result is the mean
        of their scores plus ``n / 100``, where ``n`` is the number of
        counted words; the bonus is skipped when the scores sum to zero.
        Returns ``0.0`` when no word is found in the table.
        """
        scores = [self._test[word] for word in self.word_lst
                  if word in self._test]
        valid_count = len(scores)
        if valid_count == 0:
            return 0.0

        total = float(sum(scores))
        if total != 0:
            # Adds n*n/100 to the sum, i.e. n/100 to the final mean.
            # NOTE(review): the rationale for this bonus is not visible here.
            total += (valid_count * valid_count) / 100
        return total / valid_count


class UserVocabularyLevel(VocabularyLevelEstimator):
    """Vocabulary level of a user, computed from the keys of a word dict."""

    def __init__(self, d):
        """Store *d* and use its keys as the word list.

        Raises:
            TypeError: if *d* is not a dict.
        """
        if not isinstance(d, dict):
            raise TypeError("Input must be a dictionary")
        self.d = d
        # Every key is used as-is: no recency filtering, stemming or case
        # folding is applied.
        self.word_lst = list(d)


class ArticleVocabularyLevel(VocabularyLevelEstimator):
    """Vocabulary level of an article, computed from its lowercased words."""

    def __init__(self, content):
        """Store *content* and extract its lowercase ASCII-letter words.

        Raises:
            TypeError: if *content* is not a str.
        """
        if not isinstance(content, str):
            raise TypeError("Content must be a string")
        self.content = content
        self.word_lst = _WORD_RE.findall(content.lower())


if __name__ == '__main__':
    d = load_record('frequency_mrlan85.pickle')
    print(d)
    # ``user`` must be created before its level is printed.
    user = UserVocabularyLevel(d)
    print(user.level)  # level is a property
    article = ArticleVocabularyLevel('This is an interesting article')
    print(article.level)