# forked from mrlan/EnglishPal
import pickle
import re
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor

from gensim.models import KeyedVectors

from wordfreqCMD import remove_punctuation, freq
class VocabularyLevelEstimator:
    """Estimate the difficulty of words/texts and a user's vocabulary level.

    Difficulty scores are derived from a pre-trained word-vector model:
    words whose vectors are similar to those of very common English words
    are rated easier. Stop words are excluded from all averaging.
    """

    def __init__(self, word_vectors_path='./wiki-news-300d-1M.vec', words_and_tests_pickle='words_and_tests.p'):
        """Load the word-vector model and the word/test record.

        :param word_vectors_path: path to a text-format (non-binary)
            word2vec/fastText ``.vec`` file.
        :param words_and_tests_pickle: path to a pickle file of word/test
            records (loaded via :meth:`load_record`).
        """
        print(f"Loading word vectors from: {word_vectors_path}")  # confirm which model file is used

        try:
            # Text word2vec format (binary=False), e.g. fastText .vec files.
            self.word_vectors = KeyedVectors.load_word2vec_format(word_vectors_path, binary=False)
        except Exception as e:
            # Degrade gracefully: with no model, estimate_word_difficulty
            # falls back to its default score instead of crashing.
            print(f"Failed to load word vectors with error: {e}")
            self.word_vectors = None

        self.words_and_tests = self.load_record(words_and_tests_pickle)
        self.difficulty_cache = {}  # word -> previously computed difficulty
        self.stop_words = set()     # very common words excluded from scoring
        self._initialize_stop_words()

    def _initialize_stop_words(self):
        """Populate ``self.stop_words`` with a fixed list of common English words."""
        stop_words_list = [
            'the', 'and', 'of', 'to', 'what', 'in', 'there', 'when', 'them', 'would', 'will', 'out',
            'his', 'mr', 'that', 'up', 'more', 'your', 'it', 'now', 'very', 'then', 'could', 'he',
            'any', 'some', 'with', 'into', 'you', 'our', 'man', 'other', 'time', 'was', 'than',
            'know', 'about', 'only', 'like', 'how', 'see', 'is', 'before', 'such', 'little', 'two',
            'its', 'as', 'these', 'may', 'much', 'down', 'for', 'well', 'should', 'those', 'after',
            'same', 'must', 'say', 'first', 'again', 'us', 'great', 'where', 'being', 'come', 'over',
            'good', 'himself', 'am', 'never', 'on', 'old', 'here', 'way', 'at', 'go', 'upon', 'have',
            'had', 'without', 'my', 'day', 'be', 'but', 'though', 'from', 'not', 'too', 'another',
            'this', 'even', 'still', 'her', 'yet', 'under', 'by', 'let', 'just', 'all', 'because',
            'we', 'always', 'off', 'yes', 'so', 'while', 'why', 'which', 'me', 'are', 'or', 'no',
            'if', 'an', 'also', 'thus', 'who', 'cannot', 'she', 'whether'
        ]
        self.stop_words.update(stop_words_list)

    def load_record(self, pickle_fname):
        """Load and return the object stored in *pickle_fname*.

        NOTE(review): ``pickle.load`` executes arbitrary code from the file —
        only use with trusted local data.
        """
        with open(pickle_fname, 'rb') as f:
            return pickle.load(f)

    def estimate_word_difficulty(self, word):
        """Estimate the difficulty of a single word.

        Non-alphabetic tokens score 0.0; purely alphabetic words default to
        5.0, refined to a 1-10 scale when the word is present in the loaded
        word-vector model (higher similarity to common words -> easier).
        Results are memoized in ``self.difficulty_cache``.
        """
        if word in self.difficulty_cache:
            return self.difficulty_cache[word]

        # Tokens containing digits, punctuation or non-Latin characters are
        # not scored.
        if not re.match(r'^[a-zA-Z]+$', word):
            difficulty = 0.0
        else:
            difficulty = 5.0  # default level for words absent from the model
            # BUG FIX: guard against a failed model load — the original code
            # evaluated `word in None` and raised TypeError.
            if self.word_vectors is not None and word in self.word_vectors:
                common_words = ['the', 'is', 'at', 'which', 'on']
                try:
                    similarity_scores = [self.word_vectors.similarity(word, cw) for cw in common_words]
                    avg_similarity = sum(similarity_scores) / len(similarity_scores)
                    # Map average similarity onto a clamped 1-10 scale
                    # (float() keeps the return type consistent across paths).
                    difficulty = float(min(10.0, max(1.0, round(5.0 - avg_similarity * 2))))
                except Exception as e:
                    print(f'Error estimating difficulty for word {word}: {e}')
                    difficulty = 5.0  # fall back to the default level

        self.difficulty_cache[word] = difficulty
        return difficulty

    def estimate_text_difficulty(self, text):
        """Estimate the vocabulary difficulty of a text.

        :param text: English article text.
        :return: mean word difficulty (float); 0.0 when no scorable words.
        """
        cleaned_text = remove_punctuation(text.lower())
        words_frequency = freq(cleaned_text)
        difficulties = []

        with ThreadPoolExecutor() as executor:
            # Estimate each non-stop-word's difficulty in parallel.
            future_to_word = {executor.submit(self.estimate_word_difficulty, word): word for word, _ in words_frequency if word not in self.stop_words}
            for future in future_to_word:
                try:
                    difficulties.append(float(future.result()))  # ensure float
                except Exception as exc:
                    print(f'Error estimating difficulty for word {future_to_word[future]}: {exc}')

        if difficulties:
            return float(sum(difficulties) / len(difficulties))
        else:
            return 0.0  # nothing scorable in the text

    def estimate_user_vocabulary_level(self, user_unknown_words):
        """Estimate a user's vocabulary level from their unknown words.

        :param user_unknown_words: iterable of words the user does not know.
        :return: mean difficulty of the non-stop-word entries; 0 when empty.
        """
        unknown_words_difficulty = [self.estimate_word_difficulty(word) for word in user_unknown_words if word not in self.stop_words]
        if unknown_words_difficulty:
            return sum(unknown_words_difficulty) / len(unknown_words_difficulty)
        else:
            return 0  # no scorable unknown words
# Example usage
if __name__ == '__main__':
    est = VocabularyLevelEstimator(word_vectors_path='path_to_pretrained_word_vectors')

    # A sample list of words the user reported as unknown.
    unknown = ['abandon', 'pursuit', 'knowledge']
    print("User vocabulary level:", est.estimate_user_vocabulary_level(unknown))

    # Read a text from disk and estimate its difficulty.
    with open('wordlist.txt', 'r') as f:
        sample_text = f.read()
    print("Text difficulty level:", est.estimate_text_difficulty(sample_text))