1
0
Fork 0
EnglishPal/app/AAA_VocabularyLevelEstimato...

123 lines
5.8 KiB
Python
Raw Normal View History

2025-05-26 19:20:32 +08:00
import pickle
import re
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from gensim.models import KeyedVectors
from wordfreqCMD import remove_punctuation, freq
from gensim.models import KeyedVectors
class VocabularyLevelEstimator:
    """Estimate vocabulary difficulty for single words, whole texts and users.

    A word's difficulty is derived from its average embedding similarity to a
    handful of very common English words: the less similar a word is to them,
    the harder it is assumed to be. When the embeddings are unavailable, or a
    word is missing from them, a neutral default difficulty is used instead.
    """

    # Same pattern as before, compiled once: words made only of ASCII letters.
    _ALPHABETIC_WORD = re.compile(r'^[a-zA-Z]+$')
    # Very common words used as the "easy" reference points for similarity.
    _REFERENCE_WORDS = ('the', 'is', 'at', 'which', 'on')
    # Neutral score for alphabetic words that cannot be scored via embeddings.
    _DEFAULT_DIFFICULTY = 5.0
    # Score for tokens that are not purely alphabetic (digits, symbols, ...).
    _NON_WORD_DIFFICULTY = 0.0

    def __init__(self, word_vectors_path='./wiki-news-300d-1M.vec', words_and_tests_pickle='words_and_tests.p'):
        """Load the word vectors and the pickled word/test records.

        :param word_vectors_path: path to a text-format word2vec file.
        :param words_and_tests_pickle: path to the pickle file read by
            :meth:`load_record`.
        :raises OSError: if the pickle file cannot be opened (not caught here).

        A failure to load the word vectors is reported on stdout and leaves
        ``self.word_vectors`` as ``None``; every alphabetic word then gets the
        default difficulty.
        """
        print(f"Loading word vectors from: {word_vectors_path}")  # echo the path so it can be verified
        try:
            # load_word2vec_format reads the text (non-binary) vector format.
            self.word_vectors = KeyedVectors.load_word2vec_format(word_vectors_path, binary=False)
        except Exception as e:
            # Best effort: keep the estimator usable without embeddings.
            print(f"Failed to load word vectors with error: {e}")
            self.word_vectors = None
        self.words_and_tests = self.load_record(words_and_tests_pickle)
        self.difficulty_cache = {}  # word -> previously computed difficulty
        self.stop_words = set()
        self._initialize_stop_words()

    def _initialize_stop_words(self):
        """Add the built-in list of common English words to ``self.stop_words``."""
        stop_words_list = [
            'the', 'and', 'of', 'to', 'what', 'in', 'there', 'when', 'them', 'would', 'will', 'out',
            'his', 'mr', 'that', 'up', 'more', 'your', 'it', 'now', 'very', 'then', 'could', 'he',
            'any', 'some', 'with', 'into', 'you', 'our', 'man', 'other', 'time', 'was', 'than',
            'know', 'about', 'only', 'like', 'how', 'see', 'is', 'before', 'such', 'little', 'two',
            'its', 'as', 'these', 'may', 'much', 'down', 'for', 'well', 'should', 'those', 'after',
            'same', 'must', 'say', 'first', 'again', 'us', 'great', 'where', 'being', 'come', 'over',
            'good', 'himself', 'am', 'never', 'on', 'old', 'here', 'way', 'at', 'go', 'upon', 'have',
            'had', 'without', 'my', 'day', 'be', 'but', 'though', 'from', 'not', 'too', 'another',
            'this', 'even', 'still', 'her', 'yet', 'under', 'by', 'let', 'just', 'all', 'because',
            'we', 'always', 'off', 'yes', 'so', 'while', 'why', 'which', 'me', 'are', 'or', 'no',
            'if', 'an', 'also', 'thus', 'who', 'cannot', 'she', 'whether'
        ]
        self.stop_words.update(stop_words_list)

    def load_record(self, pickle_fname):
        """Return the object stored in the pickle file ``pickle_fname``.

        :raises OSError: if the file cannot be opened.
        """
        # NOTE(security): unpickling can execute arbitrary code. Only load
        # pickle files that come from a trusted source.
        with open(pickle_fname, 'rb') as f:
            return pickle.load(f)

    def estimate_word_difficulty(self, word):
        """Return the estimated difficulty of ``word`` as a float.

        * Tokens that are not purely ASCII-alphabetic score ``0.0``.
        * Alphabetic words score ``5.0`` when the word vectors are unavailable,
          the word is not in them, or the similarity lookup fails.
        * Otherwise the score is ``round(5 - 2 * avg_similarity)`` clamped to
          ``[1.0, 10.0]``, where ``avg_similarity`` is the mean similarity to
          the reference common words. For similarities in ``[-1, 1]`` this
          yields values between 3.0 and 7.0.

        Results are memoised in ``self.difficulty_cache``.
        """
        if word in self.difficulty_cache:
            return self.difficulty_cache[word]

        if not self._ALPHABETIC_WORD.match(word):
            difficulty = self._NON_WORD_DIFFICULTY
        else:
            difficulty = self._DEFAULT_DIFFICULTY
            # word_vectors is None when loading failed in __init__; testing
            # membership on None would raise TypeError, so guard explicitly.
            if self.word_vectors is not None and word in self.word_vectors:
                try:
                    similarity_scores = [
                        self.word_vectors.similarity(word, reference)
                        for reference in self._REFERENCE_WORDS
                    ]
                    avg_similarity = sum(similarity_scores) / len(similarity_scores)
                    # round() returns an int; coerce so every path yields a float.
                    difficulty = float(min(10.0, max(1.0, round(5.0 - avg_similarity * 2))))
                except Exception as e:
                    # Best effort: a failed lookup falls back to the default.
                    print(f'Error estimating difficulty for word {word}: {e}')
                    difficulty = self._DEFAULT_DIFFICULTY

        self.difficulty_cache[word] = difficulty
        return difficulty

    def estimate_text_difficulty(self, text):
        """Return the average difficulty of the non-stop-words in ``text``.

        :param text: English article text.
        :return: mean word difficulty as a float, or ``0.0`` when no word
            could be evaluated.

        The text is lower-cased and passed through ``remove_punctuation`` and
        ``freq``; ``freq`` is assumed to yield ``(word, count)`` pairs, of
        which only the word is used, so each distinct word counts once.
        """
        cleaned_text = remove_punctuation(text.lower())
        words_frequency = freq(cleaned_text)
        difficulties = []
        with ThreadPoolExecutor() as executor:
            # Score every remaining word on the thread pool.
            future_to_word = {
                executor.submit(self.estimate_word_difficulty, word): word
                for word, _ in words_frequency
                if word not in self.stop_words
            }
            for future, word in future_to_word.items():
                try:
                    difficulties.append(float(future.result()))
                except Exception as exc:
                    # Skip words that failed instead of aborting the whole text.
                    print(f'Error estimating difficulty for word {word}: {exc}')
        if not difficulties:
            return 0.0
        return float(sum(difficulties) / len(difficulties))

    def estimate_user_vocabulary_level(self, user_unknown_words):
        """Return the average difficulty of the words a user does not know.

        :param user_unknown_words: iterable of words unknown to the user.
        :return: mean difficulty of those words (stop words excluded) as a
            float, or ``0.0`` when there is nothing to evaluate.
        """
        unknown_words_difficulty = [
            self.estimate_word_difficulty(word)
            for word in user_unknown_words
            if word not in self.stop_words
        ]
        if not unknown_words_difficulty:
            return 0.0
        return sum(unknown_words_difficulty) / len(unknown_words_difficulty)
# Example usage
if __name__ == '__main__':
    vocab_estimator = VocabularyLevelEstimator(word_vectors_path='path_to_pretrained_word_vectors')

    # Estimate a user's level from a sample list of words they do not know.
    sample_unknown_words = ['abandon', 'pursuit', 'knowledge']
    user_level = vocab_estimator.estimate_user_vocabulary_level(sample_unknown_words)
    print("User vocabulary level:", user_level)

    # Read a text from disk and estimate its difficulty.
    with open('wordlist.txt', 'r') as wordlist_file:
        article_text = wordlist_file.read()
    text_level = vocab_estimator.estimate_text_difficulty(article_text)
    print("Text difficulty level:", text_level)