import pickle
import re
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor

from gensim.models import KeyedVectors

from wordfreqCMD import remove_punctuation, freq


class VocabularyLevelEstimator:
    """Estimate vocabulary difficulty for single words, texts and users.

    A word's difficulty is derived from its average similarity (according to
    the loaded word-vector model) to a handful of very common English words:
    the less similar a word is to them, the higher its score. Scores are
    cached per word in ``self.difficulty_cache``.

    If the word-vector file cannot be loaded, ``self.word_vectors`` is
    ``None`` and every purely alphabetic word receives the default
    difficulty instead of raising.
    """

    # Matches tokens made only of ASCII letters; anything else scores 0.0.
    _ALPHABETIC_WORD = re.compile(r'^[a-zA-Z]+$')
    # Reference words whose similarity to the candidate drives the score.
    _COMMON_WORDS = ('the', 'is', 'at', 'which', 'on')
    # Score used when a word cannot be rated with the model.
    _DEFAULT_DIFFICULTY = 5.0

    def __init__(self, word_vectors_path='./wiki-news-300d-1M.vec', words_and_tests_pickle='words_and_tests.p'):
        """Load the word vectors and the pickled record, and set up caches.

        :param word_vectors_path: path to a text-format word2vec file.
        :param words_and_tests_pickle: path to the pickle file read by
            :meth:`load_record`.
        :raises OSError: if the pickle file cannot be opened (only the
            word-vector loading is best-effort).
        """
        # Print the path so the user can confirm which file is being loaded.
        print(f"Loading word vectors from: {word_vectors_path}")
        try:
            # load_word2vec_format reads the text-format word-vector file.
            self.word_vectors = KeyedVectors.load_word2vec_format(word_vectors_path, binary=False)
        except Exception as e:
            # Best-effort: keep the estimator usable without a model; the
            # estimation methods fall back to the default difficulty.
            print(f"Failed to load word vectors with error: {e}")
            self.word_vectors = None

        self.words_and_tests = self.load_record(words_and_tests_pickle)
        self.difficulty_cache = {}
        self.stop_words = set()  # populated by _initialize_stop_words()
        self._initialize_stop_words()

    def _initialize_stop_words(self):
        """Add the built-in stop words to ``self.stop_words``."""
        stop_words_list = [
            'the', 'and', 'of', 'to', 'what', 'in', 'there', 'when', 'them',
            'would', 'will', 'out', 'his', 'mr', 'that', 'up', 'more', 'your',
            'it', 'now', 'very', 'then', 'could', 'he', 'any', 'some', 'with',
            'into', 'you', 'our', 'man', 'other', 'time', 'was', 'than',
            'know', 'about', 'only', 'like', 'how', 'see', 'is', 'before',
            'such', 'little', 'two', 'its', 'as', 'these', 'may', 'much',
            'down', 'for', 'well', 'should', 'those', 'after', 'same', 'must',
            'say', 'first', 'again', 'us', 'great', 'where', 'being', 'come',
            'over', 'good', 'himself', 'am', 'never', 'on', 'old', 'here',
            'way', 'at', 'go', 'upon', 'have', 'had', 'without', 'my', 'day',
            'be', 'but', 'though', 'from', 'not', 'too', 'another', 'this',
            'even', 'still', 'her', 'yet', 'under', 'by', 'let', 'just',
            'all', 'because', 'we', 'always', 'off', 'yes', 'so', 'while',
            'why', 'which', 'me', 'are', 'or', 'no', 'if', 'an', 'also',
            'thus', 'who', 'cannot', 'she', 'whether',
        ]
        self.stop_words.update(stop_words_list)

    def load_record(self, pickle_fname):
        """Load and return the object stored in a pickle file.

        :param pickle_fname: path of the pickle file to read.
        :return: whatever object the file contains.
        """
        # NOTE: pickle.load can execute arbitrary code while unpickling;
        # only point this at files from a trusted source.
        with open(pickle_fname, 'rb') as f:
            return pickle.load(f)

    def estimate_word_difficulty(self, word):
        """Estimate the difficulty of a single word.

        Tokens containing anything other than ASCII letters score 0.0.
        Alphabetic words score the default (5.0) unless the model is loaded
        and contains the word, in which case the score is
        ``round(5.0 - 2 * average_similarity)`` clamped to [1.0, 10.0].
        Results are cached per word.

        :param word: the word to rate.
        :return: the difficulty as a float.
        """
        if word in self.difficulty_cache:
            return self.difficulty_cache[word]

        if not self._ALPHABETIC_WORD.match(word):
            difficulty = 0.0
        else:
            difficulty = self._DEFAULT_DIFFICULTY
            # word_vectors is None when loading failed in __init__; testing
            # membership on None would raise TypeError, so check it first.
            if self.word_vectors is not None and word in self.word_vectors:
                try:
                    similarity_scores = [
                        self.word_vectors.similarity(word, cw)
                        for cw in self._COMMON_WORDS
                    ]
                    avg_similarity = sum(similarity_scores) / len(similarity_scores)
                    # Higher similarity to common words means an easier word.
                    # float() keeps the cached value a plain float, since
                    # round() may return an int or a library scalar type.
                    difficulty = float(min(10.0, max(1.0, round(5.0 - avg_similarity * 2))))
                except Exception as e:
                    # Best-effort: report the problem and use the default.
                    print(f'Error estimating difficulty for word {word}: {e}')
                    difficulty = self._DEFAULT_DIFFICULTY

        self.difficulty_cache[word] = difficulty
        return difficulty

    def estimate_text_difficulty(self, text):
        """Estimate the vocabulary difficulty of a text.

        The text is lower-cased, stripped of punctuation and split into
        (word, frequency) pairs by ``freq``; each distinct non-stop word
        contributes its difficulty once to the average.

        :param text: English article text.
        :return: mean word difficulty as a float, or 0.0 when there is
            nothing to evaluate (empty/None text or only stop words).
        """
        if not text:
            return 0.0

        cleaned_text = remove_punctuation(text.lower())
        words_frequency = freq(cleaned_text)

        difficulties = []
        with ThreadPoolExecutor() as executor:
            # Submit one difficulty estimate per non-stop word.
            future_to_word = {
                executor.submit(self.estimate_word_difficulty, word): word
                for word, _ in words_frequency
                if word not in self.stop_words
            }
            for future, word in future_to_word.items():
                try:
                    difficulties.append(float(future.result()))
                except Exception as exc:
                    # Skip words that failed rather than aborting the text.
                    print(f'Error estimating difficulty for word {word}: {exc}')

        if not difficulties:
            return 0.0
        return float(sum(difficulties) / len(difficulties))

    def estimate_user_vocabulary_level(self, user_unknown_words):
        """Estimate a user's vocabulary level from the words they do not know.

        :param user_unknown_words: iterable of words unknown to the user;
            ``None`` is treated as empty.
        :return: mean difficulty of the non-stop words as a float, or 0.0
            when there are none.
        """
        unknown_words_difficulty = [
            self.estimate_word_difficulty(word)
            for word in (user_unknown_words or ())
            if word not in self.stop_words
        ]
        if not unknown_words_difficulty:
            return 0.0
        return float(sum(unknown_words_difficulty) / len(unknown_words_difficulty))


# Example usage
if __name__ == '__main__':
    estimator = VocabularyLevelEstimator(word_vectors_path='path_to_pretrained_word_vectors')

    # Suppose we have a list of words the user does not know.
    user_unknown_words = ['abandon', 'pursuit', 'knowledge']
    print("User vocabulary level:", estimator.estimate_user_vocabulary_level(user_unknown_words))

    # Read a text and estimate its difficulty; the encoding is explicit so
    # the result does not depend on the platform default.
    with open('wordlist.txt', 'r', encoding='utf-8') as f:
        text = f.read()
    print("Text difficulty level:", estimator.estimate_text_difficulty(text))