# forked from mrlan/EnglishPal
import pickle
import re
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor

from gensim.models import KeyedVectors

from wordfreqCMD import remove_punctuation, freq
class VocabularyLevelEstimator:
    """Estimate the difficulty of words/texts and a user's vocabulary level.

    Difficulty scores are derived from a pre-trained word-vector model:
    words whose vectors are similar to those of very common English words
    are rated easier. Stop words are excluded from all averaging.
    """

    def __init__(self, word_vectors_path='./wiki-news-300d-1M.vec', words_and_tests_pickle='words_and_tests.p'):
        """Load the word-vector model and the word/test record.

        :param word_vectors_path: path to a text-format (non-binary)
            word2vec/fastText ``.vec`` file.
        :param words_and_tests_pickle: path to a pickle file of word/test
            records (loaded via :meth:`load_record`).
        """
        print(f"Loading word vectors from: {word_vectors_path}")  # confirm which model file is used

        try:
            # Text word2vec format (binary=False), e.g. fastText .vec files.
            self.word_vectors = KeyedVectors.load_word2vec_format(word_vectors_path, binary=False)
        except Exception as e:
            # Degrade gracefully: with no model, estimate_word_difficulty
            # falls back to its default score instead of crashing.
            print(f"Failed to load word vectors with error: {e}")
            self.word_vectors = None

        self.words_and_tests = self.load_record(words_and_tests_pickle)
        self.difficulty_cache = {}  # word -> previously computed difficulty
        self.stop_words = set()     # very common words excluded from scoring
        self._initialize_stop_words()

    def _initialize_stop_words(self):
        """Populate ``self.stop_words`` with a fixed list of common English words."""
        stop_words_list = [
            'the', 'and', 'of', 'to', 'what', 'in', 'there', 'when', 'them', 'would', 'will', 'out',
            'his', 'mr', 'that', 'up', 'more', 'your', 'it', 'now', 'very', 'then', 'could', 'he',
            'any', 'some', 'with', 'into', 'you', 'our', 'man', 'other', 'time', 'was', 'than',
            'know', 'about', 'only', 'like', 'how', 'see', 'is', 'before', 'such', 'little', 'two',
            'its', 'as', 'these', 'may', 'much', 'down', 'for', 'well', 'should', 'those', 'after',
            'same', 'must', 'say', 'first', 'again', 'us', 'great', 'where', 'being', 'come', 'over',
            'good', 'himself', 'am', 'never', 'on', 'old', 'here', 'way', 'at', 'go', 'upon', 'have',
            'had', 'without', 'my', 'day', 'be', 'but', 'though', 'from', 'not', 'too', 'another',
            'this', 'even', 'still', 'her', 'yet', 'under', 'by', 'let', 'just', 'all', 'because',
            'we', 'always', 'off', 'yes', 'so', 'while', 'why', 'which', 'me', 'are', 'or', 'no',
            'if', 'an', 'also', 'thus', 'who', 'cannot', 'she', 'whether'
        ]
        self.stop_words.update(stop_words_list)

    def load_record(self, pickle_fname):
        """Load and return the object stored in *pickle_fname*.

        NOTE(review): ``pickle.load`` executes arbitrary code from the file —
        only use with trusted local data.
        """
        with open(pickle_fname, 'rb') as f:
            return pickle.load(f)

    def estimate_word_difficulty(self, word):
        """Estimate the difficulty of a single word.

        Non-alphabetic tokens score 0.0; purely alphabetic words default to
        5.0, refined to a 1-10 scale when the word is present in the loaded
        word-vector model (higher similarity to common words -> easier).
        Results are memoized in ``self.difficulty_cache``.
        """
        if word in self.difficulty_cache:
            return self.difficulty_cache[word]

        # Tokens containing digits, punctuation or non-Latin characters are
        # not scored.
        if not re.match(r'^[a-zA-Z]+$', word):
            difficulty = 0.0
        else:
            difficulty = 5.0  # default level for words absent from the model
            # BUG FIX: guard against a failed model load — the original code
            # evaluated `word in None` and raised TypeError.
            if self.word_vectors is not None and word in self.word_vectors:
                common_words = ['the', 'is', 'at', 'which', 'on']
                try:
                    similarity_scores = [self.word_vectors.similarity(word, cw) for cw in common_words]
                    avg_similarity = sum(similarity_scores) / len(similarity_scores)
                    # Map average similarity onto a clamped 1-10 scale
                    # (float() keeps the return type consistent across paths).
                    difficulty = float(min(10.0, max(1.0, round(5.0 - avg_similarity * 2))))
                except Exception as e:
                    print(f'Error estimating difficulty for word {word}: {e}')
                    difficulty = 5.0  # fall back to the default level

        self.difficulty_cache[word] = difficulty
        return difficulty

    def estimate_text_difficulty(self, text):
        """Estimate the vocabulary difficulty of a text.

        :param text: English article text.
        :return: mean word difficulty (float); 0.0 when no scorable words.
        """
        cleaned_text = remove_punctuation(text.lower())
        words_frequency = freq(cleaned_text)
        difficulties = []

        with ThreadPoolExecutor() as executor:
            # Estimate each non-stop-word's difficulty in parallel.
            future_to_word = {executor.submit(self.estimate_word_difficulty, word): word for word, _ in words_frequency if word not in self.stop_words}
            for future in future_to_word:
                try:
                    difficulties.append(float(future.result()))  # ensure float
                except Exception as exc:
                    print(f'Error estimating difficulty for word {future_to_word[future]}: {exc}')

        if difficulties:
            return float(sum(difficulties) / len(difficulties))
        else:
            return 0.0  # nothing scorable in the text

    def estimate_user_vocabulary_level(self, user_unknown_words):
        """Estimate a user's vocabulary level from their unknown words.

        :param user_unknown_words: iterable of words the user does not know.
        :return: mean difficulty of the non-stop-word entries; 0 when empty.
        """
        unknown_words_difficulty = [self.estimate_word_difficulty(word) for word in user_unknown_words if word not in self.stop_words]
        if unknown_words_difficulty:
            return sum(unknown_words_difficulty) / len(unknown_words_difficulty)
        else:
            return 0  # no scorable unknown words
# Example usage
if __name__ == '__main__':
    est = VocabularyLevelEstimator(word_vectors_path='path_to_pretrained_word_vectors')

    # A sample list of words the user reported as unknown.
    unknown = ['abandon', 'pursuit', 'knowledge']
    print("User vocabulary level:", est.estimate_user_vocabulary_level(unknown))

    # Read a text from disk and estimate its difficulty.
    with open('wordlist.txt', 'r') as f:
        sample_text = f.read()
    print("Text difficulty level:", est.estimate_text_difficulty(sample_text))