import os
import pickle
import random
import re
from collections import defaultdict
from datetime import datetime, timedelta
import snowballstemmer
from flask import session
# Load a pickled vocabulary record
def load_record(pickle_fname):
    with open(pickle_fname, 'rb') as f:
        d = pickle.load(f)
    return d
# Check whether a token consists only of English letters
def is_english_word(word):
    pattern = re.compile(r'^[a-zA-Z]+$')
    return bool(pattern.match(word))
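# For example, is_english_word('apple') is True, while is_english_word('apple3')
# and is_english_word('naïve') are False because they contain non-letter characters.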
# Strip characters that are not letters or whitespace and normalise spacing
def remove_non_words(input_string):
    cleaned_string = re.sub(r'[^a-zA-Z\s]', '', input_string)
    words = cleaned_string.split()
    return ' '.join(words)
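# For example, remove_non_words('Hello, world! 2024') returns 'Hello world'.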
# Main class: vocabulary level estimator
class VocabularyLevelEstimator:
    # Vocabulary table: word -> list of the tests it appears in (e.g. IELTS, GAOKAO, ...)
    _test = load_record('static/words_and_tests.p')  # mapping from word to test sources
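    # A sketch of the assumed shape of this mapping (the real keys and tags live in
    # static/words_and_tests.p, so the entries here are only an illustration):
    #     {'analysis': ['OXFORD3000'], 'negotiate': ['IELTS', 'CET6'], ...}
    # get_word_level() below only checks which test names appear in each value.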
    @property
    def level(self):
        total = 0.0
        num = 0
        for word in self.word_lst:
            if word in self._test:
                total += self.get_word_level(word)  # per-word difficulty, not the raw tag list
                num += 1
        return total / num if num > 0 else 0.0
    def get_word_level(self, word):
        # Common high-frequency words (mostly irregular verb forms)
        other = ['went', 'heard', 'i', 'feet', 'got', 'been', 'gone', 'done', 'had', 'said', 'seen', 'made',
                 'taken', 'come', 'gotten', 'ran', 'eaten', 'written', 'found', 'given', 'told',
                 'brought', 'kept', 'stood', 'sat', 'won', 'bought', 'caught', 'begun', 'drank', 'rang', 'sang',
                 'swam', 'blew', 'drew', 'flew', 'grew', 'knew', 'threw', 'shown', 'broken', 'chosen', 'forgotten',
                 'spoke', 'woke', 'woken', 'driven', 'fell', 'risen', 'mistaken', 'ridden', 'lain', 'lied']

        if word not in self._test: return 3  # words missing from the test data default to level 3
        if word in other: return 3  # common high-frequency words are treated as level 3

        k = self._test[word]
        # Assign a level according to the word's test sources
        if 'CET4' in k:
            return 4
        elif 'OXFORD3000' in k:
            return 5
        elif 'CET6' in k or 'GRADUATE' in k:
            return 6
        elif 'OXFORD5000' in k or 'IELTS' in k:
            return 7
        elif 'BBC' in k:
            return 8
        return 3  # fall back to the default level for unrecognised sources
# User vocabulary level class
class UserVocabularyLevel(VocabularyLevelEstimator):
    # Filtered list of the user's new words (stemmed and de-duplicated)
    filtered_frequency = []
    def __init__(self, d):
        if d:
            self.d = d  # the user's new-word record
            self.word_lst = list(d.keys())
            self.filter_user_frequency()
        else:
            self.d = {}
            self.word_lst = []
            self.filtered_frequency = []
    def filter_user_frequency(self):
        stemmer = snowballstemmer.stemmer('english')
        self.filtered_frequency = []
        for word in self.d:
            if is_english_word(word):
                stemmed = stemmer.stemWord(word)
                if stemmed not in self.filtered_frequency:  # de-duplicate on the stemmed form
                    self.filtered_frequency.append(stemmed)
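    # Note: snowballstemmer maps inflected forms to a shared stem (e.g. 'apples' and
    # 'apple' produce the same stem), so each stem appears at most once in the list.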
    @property
    def level(self):
        total = 0.0
        num = 0
        if not self.filtered_frequency: return 0.0
        for word in self.filtered_frequency:
            num += 1
            total += self.get_word_level(word)
        return total / num if num else 0.0
# Article vocabulary difficulty class
class ArticleVocabularyLevel(VocabularyLevelEstimator):
    # The most difficult words in the article, mapped to their levels
    difficulty_word = dict()
    def __init__(self, content):
        if content:
            self.content = remove_non_words(content)
            self.word_lst = self.content.lower().split()
            self.select_difficulty_word()
        else:
            self.content = ''
            self.word_lst = []
            self.difficulty_word = {}
    def select_difficulty_word(self, n=10):
        self.difficulty_word = {}
        stemmer = snowballstemmer.stemmer('english')
        for word in self.word_lst:
            stemmed_word = stemmer.stemWord(word)
            self.difficulty_word[stemmed_word] = self.get_word_level(stemmed_word)

        if self.difficulty_word:
            # keep only the n most difficult words
            sorted_words = sorted(self.difficulty_word.items(), key=lambda item: item[1], reverse=True)
            top_words = sorted_words[:n]
            self.difficulty_word = {word: difficulty for word, difficulty in top_words}
    @property
    def level(self):
        total = 0.0
        num = 0
        if not self.difficulty_word: return 0.0
        for word in self.difficulty_word:
            num += 1
            total += self.difficulty_word[word]
        return total / num if num else 0.0
if __name__ == '__main__':
    d = load_record('static/frequency/frequency_sb.pickle')  # load the user's word-frequency record
    print(d)

    user = UserVocabularyLevel(d)
    print('User vocabulary level:')
    print(user.level)  # print the user's vocabulary level
    s = """Energetic = haze dynamic = vigorous = animated Such is Love , Plain like Water
port him to stand up. She scolded him for not having waken her up. He said that he could manage. A serious quarrel was about to burst out again.
I called them from Zhuhai, the beautiful city of relaxation and exciting views. I wanted to depict to them how pretty a city Zhuhai is."""

    article = ArticleVocabularyLevel(s)
    print('Article vocabulary difficulty:')
    print(article.level)  # print the article's vocabulary difficulty
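    # A minimal extra comparison (illustrative only: the absolute scores depend on the
    # tags stored in static/words_and_tests.p, so only a rough ordering is expected):
    simple = ArticleVocabularyLevel('I went to school')
    dense = ArticleVocabularyLevel('The committee endorsed a comprehensive sustainability framework')
    print(simple.level, dense.level)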
    # Save and reload a test article to verify pickling round-trips correctly
    with open('test/article_test.p', 'wb') as file:
        pickle.dump(s, file)

    with open('test/article_test.p', 'rb') as file:
        loaded_data = pickle.load(file)
        print(loaded_data)
|
2025-06-09 17:32:14 +08:00
|
|
|
|
|
|
|
|
|
article1 = ArticleVocabularyLevel('source')
|
|
|
|
|
article2 = ArticleVocabularyLevel('open source')
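    # Print their levels as well (illustrative; the values depend on how 'open' and
    # 'source' are tagged, if at all, in static/words_and_tests.p):
    print(article1.level)
    print(article2.level)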