1
0
Fork 0
EnglishPal/app/AAA_VocabularyLevelEstimato...

123 lines
5.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import pickle
import re
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from gensim.models import KeyedVectors
from wordfreqCMD import remove_punctuation, freq
from gensim.models import KeyedVectors
class VocabularyLevelEstimator:
    """Estimate vocabulary difficulty for single words, texts and users.

    Difficulty is derived from how similar a word's embedding is to a few
    very common English words: the more similar, the lower the score.

    Attributes:
        word_vectors: The loaded gensim ``KeyedVectors`` model, or ``None``
            when loading failed (every alphabetic word then receives the
            default difficulty).
        words_and_tests: Whatever object was unpickled from
            ``words_and_tests_pickle``; it is only stored, not used here.
        difficulty_cache: Maps word -> difficulty that was already computed.
        stop_words: Words ignored by the text- and user-level estimates.
    """

    # Anchor words that stand in for "easy" vocabulary.
    _COMMON_WORDS = ('the', 'is', 'at', 'which', 'on')
    # Score used when a word cannot be rated from the vectors.
    _DEFAULT_DIFFICULTY = 5.0
    # Only purely alphabetic ASCII tokens are rated; compiled once.
    _ALPHA_WORD = re.compile(r'^[a-zA-Z]+$')

    def __init__(self, word_vectors_path='./wiki-news-300d-1M.vec', words_and_tests_pickle='words_and_tests.p'):
        """Load the word vectors and the pickled record.

        Args:
            word_vectors_path: Path to a text-format word2vec file.
            words_and_tests_pickle: Path to the pickle read by ``load_record``.

        Raises:
            OSError: If ``words_and_tests_pickle`` cannot be opened. A failure
                to load the word vectors is reported on stdout and tolerated.
        """
        print(f"Loading word vectors from: {word_vectors_path}")  # echo the path so it can be checked
        try:
            # The vectors are stored in the textual word2vec format.
            self.word_vectors = KeyedVectors.load_word2vec_format(word_vectors_path, binary=False)
        except Exception as e:
            # Deliberate best effort: keep going without vectors; the
            # estimators fall back to the default difficulty.
            print(f"Failed to load word vectors with error: {e}")
            self.word_vectors = None
        self.words_and_tests = self.load_record(words_and_tests_pickle)
        self.difficulty_cache = {}
        self.stop_words = set()
        self._initialize_stop_words()

    def _initialize_stop_words(self):
        """Fill ``self.stop_words`` with the built-in list of common words."""
        stop_words_list = [
            'the', 'and', 'of', 'to', 'what', 'in', 'there', 'when', 'them', 'would', 'will', 'out',
            'his', 'mr', 'that', 'up', 'more', 'your', 'it', 'now', 'very', 'then', 'could', 'he',
            'any', 'some', 'with', 'into', 'you', 'our', 'man', 'other', 'time', 'was', 'than',
            'know', 'about', 'only', 'like', 'how', 'see', 'is', 'before', 'such', 'little', 'two',
            'its', 'as', 'these', 'may', 'much', 'down', 'for', 'well', 'should', 'those', 'after',
            'same', 'must', 'say', 'first', 'again', 'us', 'great', 'where', 'being', 'come', 'over',
            'good', 'himself', 'am', 'never', 'on', 'old', 'here', 'way', 'at', 'go', 'upon', 'have',
            'had', 'without', 'my', 'day', 'be', 'but', 'though', 'from', 'not', 'too', 'another',
            'this', 'even', 'still', 'her', 'yet', 'under', 'by', 'let', 'just', 'all', 'because',
            'we', 'always', 'off', 'yes', 'so', 'while', 'why', 'which', 'me', 'are', 'or', 'no',
            'if', 'an', 'also', 'thus', 'who', 'cannot', 'she', 'whether',
        ]
        self.stop_words.update(stop_words_list)

    def load_record(self, pickle_fname):
        """Return the object stored in the pickle file ``pickle_fname``.

        Raises:
            OSError: If the file cannot be opened.
        """
        # SECURITY: unpickling executes arbitrary code embedded in the file;
        # only point this at files from a trusted source.
        with open(pickle_fname, 'rb') as f:
            return pickle.load(f)

    def estimate_word_difficulty(self, word):
        """Return the difficulty of ``word`` and remember it in the cache.

        Args:
            word: The token to rate.

        Returns:
            ``0.0`` for tokens that are not purely ASCII letters; the default
            ``5.0`` when no vectors are loaded, the word is missing from the
            vectors, or the similarity lookup fails; otherwise
            ``round(5.0 - 2 * mean_similarity)`` clamped to the range 1..10.
        """
        if word in self.difficulty_cache:
            return self.difficulty_cache[word]

        if not self._ALPHA_WORD.match(word):
            difficulty = 0.0
        elif self.word_vectors is None or word not in self.word_vectors:
            # No model (loading failed) or out-of-vocabulary word: there is
            # nothing to compare against, so use the default level.
            difficulty = self._DEFAULT_DIFFICULTY
        else:
            try:
                similarity_scores = [
                    self.word_vectors.similarity(word, common_word)
                    for common_word in self._COMMON_WORDS
                ]
                avg_similarity = sum(similarity_scores) / len(similarity_scores)
                # NOTE(review): assuming similarity is a cosine in [-1, 1],
                # this expression only yields 3..7; the 1..10 clamp is a
                # safety net rather than the effective range.
                difficulty = min(10.0, max(1.0, round(5.0 - avg_similarity * 2)))
            except Exception as e:
                print(f'Error estimating difficulty for word {word}: {e}')
                difficulty = self._DEFAULT_DIFFICULTY

        self.difficulty_cache[word] = difficulty
        return difficulty

    def estimate_text_difficulty(self, text):
        """Return the mean difficulty of the non-stop-words in ``text``.

        Args:
            text: The article text; it is lower-cased before processing.

        Returns:
            The average word difficulty as a float, or ``0.0`` when the text
            contains no word that can be rated.
        """
        cleaned_text = remove_punctuation(text.lower())
        # presumably freq() yields (word, count) pairs -- verify in wordfreqCMD
        words_frequency = freq(cleaned_text)
        candidates = [word for word, _ in words_frequency if word not in self.stop_words]
        if not candidates:
            return 0.0

        difficulties = []
        with ThreadPoolExecutor() as executor:
            # Rate the words concurrently; results are collected in
            # submission order.
            pending = [
                (word, executor.submit(self.estimate_word_difficulty, word))
                for word in candidates
            ]
            for word, future in pending:
                try:
                    difficulties.append(float(future.result()))
                except Exception as exc:
                    # A word that cannot be rated is skipped, not fatal.
                    print(f'Error estimating difficulty for word {word}: {exc}')

        if not difficulties:
            return 0.0
        return float(sum(difficulties) / len(difficulties))

    def estimate_user_vocabulary_level(self, user_unknown_words):
        """Return the mean difficulty of the words a user does not know.

        Args:
            user_unknown_words: Iterable of words unknown to the user; stop
                words are ignored.

        Returns:
            The average difficulty as a float, or ``0.0`` when no word
            remains after removing stop words.
        """
        unknown_words_difficulty = [
            self.estimate_word_difficulty(word)
            for word in user_unknown_words
            if word not in self.stop_words
        ]
        if not unknown_words_difficulty:
            return 0.0
        return float(sum(unknown_words_difficulty) / len(unknown_words_difficulty))
# Example usage
if __name__ == '__main__':
    estimator = VocabularyLevelEstimator(word_vectors_path='path_to_pretrained_word_vectors')

    # Suppose we have a list of words the user does not know.
    user_unknown_words = ['abandon', 'pursuit', 'knowledge']
    print("User vocabulary level:", estimator.estimate_user_vocabulary_level(user_unknown_words))

    # Read a text and estimate its difficulty. The encoding is explicit so
    # the result does not depend on the platform's locale default.
    with open('wordlist.txt', 'r', encoding='utf-8') as f:
        text = f.read()
    print("Text difficulty level:", estimator.estimate_text_difficulty(text))