# forked from mrlan/EnglishPal
|
import pickle
|
|||
|
import re
|
|||
|
from collections import defaultdict
|
|||
|
from concurrent.futures import ThreadPoolExecutor
|
|||
|
from gensim.models import KeyedVectors
|
|||
|
from wordfreqCMD import remove_punctuation, freq
|
|||
|
|
|||
|
from gensim.models import KeyedVectors
|
|||
|
|
|||
|
|
|||
|
class VocabularyLevelEstimator:
    """Estimate vocabulary difficulty for single words, whole texts, and users.

    Difficulty is derived from a pre-trained word-embedding model: words that
    are semantically close to very common English words are treated as easy,
    distant words as hard.  Per-word results are memoized in
    ``self.difficulty_cache``.
    """

    def __init__(self, word_vectors_path='./wiki-news-300d-1M.vec', words_and_tests_pickle='words_and_tests.p'):
        """Load the embedding model and the words/tests record.

        :param word_vectors_path: path to a text-format (word2vec) embedding
            file, e.g. a fastText ``.vec`` file.
        :param words_and_tests_pickle: pickle file mapping words to the tests
            they appear in (produced elsewhere in the project).
        """
        print(f"Loading word vectors from: {word_vectors_path}")

        try:
            # Text (non-binary) word2vec format.
            self.word_vectors = KeyedVectors.load_word2vec_format(word_vectors_path, binary=False)
        except Exception as e:
            # Keep the estimator usable without a model; difficulty then
            # falls back to the defaults in estimate_word_difficulty.
            print(f"Failed to load word vectors with error: {e}")
            self.word_vectors = None

        self.words_and_tests = self.load_record(words_and_tests_pickle)
        self.difficulty_cache = {}  # word -> difficulty, avoids recomputation
        self.stop_words = set()     # very common words, excluded from scoring
        self._initialize_stop_words()

    def _initialize_stop_words(self):
        """Populate ``self.stop_words`` with very common English words."""
        stop_words_list = [
            'the', 'and', 'of', 'to', 'what', 'in', 'there', 'when', 'them', 'would', 'will', 'out',
            'his', 'mr', 'that', 'up', 'more', 'your', 'it', 'now', 'very', 'then', 'could', 'he',
            'any', 'some', 'with', 'into', 'you', 'our', 'man', 'other', 'time', 'was', 'than',
            'know', 'about', 'only', 'like', 'how', 'see', 'is', 'before', 'such', 'little', 'two',
            'its', 'as', 'these', 'may', 'much', 'down', 'for', 'well', 'should', 'those', 'after',
            'same', 'must', 'say', 'first', 'again', 'us', 'great', 'where', 'being', 'come', 'over',
            'good', 'himself', 'am', 'never', 'on', 'old', 'here', 'way', 'at', 'go', 'upon', 'have',
            'had', 'without', 'my', 'day', 'be', 'but', 'though', 'from', 'not', 'too', 'another',
            'this', 'even', 'still', 'her', 'yet', 'under', 'by', 'let', 'just', 'all', 'because',
            'we', 'always', 'off', 'yes', 'so', 'while', 'why', 'which', 'me', 'are', 'or', 'no',
            'if', 'an', 'also', 'thus', 'who', 'cannot', 'she', 'whether'
        ]
        self.stop_words.update(stop_words_list)

    def load_record(self, pickle_fname):
        """Return the object stored in *pickle_fname*.

        NOTE(review): pickle is unsafe on untrusted files; this assumes the
        pickle is produced by this project.  Raises OSError if missing.
        """
        with open(pickle_fname, 'rb') as f:
            return pickle.load(f)

    def estimate_word_difficulty(self, word):
        """Return a difficulty score for *word*.

        Non-alphabetic tokens score 0.0.  Words found in the embedding model
        are mapped to a 1-10 scale based on average similarity to five very
        common words (less similar = harder); anything else gets the default
        5.0.  Results are cached in ``self.difficulty_cache``.
        """
        if word in self.difficulty_cache:
            return self.difficulty_cache[word]

        if not re.match(r'^[a-zA-Z]+$', word):
            difficulty = 0.0  # numbers, punctuation, non-English tokens
        else:
            difficulty = 5.0  # default when the model cannot help
            # Bug fix: guard the membership test — word_vectors is None when
            # loading failed in __init__, and `word in None` raises TypeError.
            if self.word_vectors is not None and word in self.word_vectors:
                common_words = ['the', 'is', 'at', 'which', 'on']
                try:
                    similarity_scores = [self.word_vectors.similarity(word, cw) for cw in common_words]
                    avg_similarity = sum(similarity_scores) / len(similarity_scores)
                    # Map similarity onto 1-10; float() fixes round() yielding
                    # an int, keeping the float contract of this method.
                    difficulty = float(min(10.0, max(1.0, round(5.0 - avg_similarity * 2))))
                except Exception as e:
                    print(f'Error estimating difficulty for word {word}: {e}')
                    difficulty = 5.0  # fall back to the default on model errors

        self.difficulty_cache[word] = difficulty
        return difficulty

    def estimate_text_difficulty(self, text):
        """Estimate the vocabulary difficulty of an English text.

        :param text: raw article text.
        :return: mean difficulty of its non-stop-words, or 0.0 if none.
        """
        cleaned_text = remove_punctuation(text.lower())
        words_frequency = freq(cleaned_text)
        difficulties = []

        # Per-word estimates are independent, so fan them out in parallel.
        with ThreadPoolExecutor() as executor:
            future_to_word = {
                executor.submit(self.estimate_word_difficulty, word): word
                for word, _ in words_frequency
                if word not in self.stop_words
            }
            for future in future_to_word:
                try:
                    difficulties.append(float(future.result()))  # ensure float
                except Exception as exc:
                    print(f'Error estimating difficulty for word {future_to_word[future]}: {exc}')

        if difficulties:
            return float(sum(difficulties) / len(difficulties))
        return 0.0  # no scorable words in the text

    def estimate_user_vocabulary_level(self, user_unknown_words):
        """Estimate a user's vocabulary level from the words they don't know.

        :param user_unknown_words: list of words unknown to the user.
        :return: mean difficulty of those words, or 0.0 if none are scorable.
        """
        unknown_words_difficulty = [
            self.estimate_word_difficulty(word)
            for word in user_unknown_words
            if word not in self.stop_words
        ]
        if unknown_words_difficulty:
            return sum(unknown_words_difficulty) / len(unknown_words_difficulty)
        # Consistency fix: return a float like the sibling estimators
        # (0 == 0.0, so callers are unaffected).
        return 0.0
|
|||
|
|
|||
|
|
|||
|
# Example usage
if __name__ == '__main__':
    estimator = VocabularyLevelEstimator(word_vectors_path='path_to_pretrained_word_vectors')

    # A sample list of words the user does not know.
    unknown_words = ['abandon', 'pursuit', 'knowledge']
    print("User vocabulary level:", estimator.estimate_user_vocabulary_level(unknown_words))

    # Read a text file and estimate its overall difficulty.
    with open('wordlist.txt', 'r') as fh:
        article = fh.read()
    print("Text difficulty level:", estimator.estimate_text_difficulty(article))
|