# EnglishPal/app/vocabulary.py

import pickle
import math


def load_record(pickle_fname):
    """Unpickle and return the object stored in the file *pickle_fname*.

    The file is opened in binary mode and is closed again before the
    function returns. Unpickling can execute arbitrary code, so this must
    only be pointed at trusted files.
    """
    with open(pickle_fname, 'rb') as pickle_file:
        return pickle.load(pickle_file)


class VocabularyLevelEstimator:
    """Base class shared by the vocabulary-level estimators.

    It carries the lookup tables used by subclasses. Its own ``level`` is a
    constant ``0``; subclasses override the property with a real estimate.
    """

    # Maps a word to the sources where it appears. The pickle is read once,
    # when this class body executes, from a path relative to the current
    # working directory.
    _test = load_record('words_and_tests.p')

    # Hand-assigned difficulty per lower-case word, from 0 up to 8.
    # Insertion order matches the order in which the entries were written.
    VOCAB_LEVELS = {
        "a": 1, "an": 1, "the": 1, "i": 1, "you": 1, "me": 1,
        "to": 1, "in": 1, "on": 1, "how": 1, "when": 1, "at": 1,
        "no": 1, "longer": 2, "give": 2, "them": 1, "work": 2, "open": 2,
        "source": 2, "software": 3, "project": 3, "run": 2, "free": 1,
        "producing": 3, "successful": 4, "blank": 3, "stare": 4, "tell": 2,
        "people": 2, "parties": 2, "that": 1, "is": 1, "of": 1,
        "origin": 4, "species": 5, "conceivable": 5, "naturalist": 6,
        "reflecting": 4, "philosophical": 6, "reasoning": 5, "sufficient": 5,
        "considering": 4,
        "pasture": 5, "putrid": 6, "frivolous": 5, "simple": 1, "apple": 1,
        "happy": 1,
        "dearth": 6, "prodigal": 6, "presumptuous": 7, "prehension": 7,
        "pied": 4, "pedunculated": 8, "parturition": 7, "ovigerous": 8,
        "ova": 5, "orifice": 6, "obliterate": 6, "niggard": 6, "neuter": 5,
        "locomotion": 6, "lineal": 5, "glottis": 7, "frena": 6,
        "flotation": 5, "ductus": 6, "dorsal": 5, "crustacean": 7,
        "cornea": 6, "contrivance": 6, "collateral": 6, "cirriped": 7,
        "canon": 5, "branchiae": 7, "auditory": 5, "articulata": 7,
        "alimentary": 7, "adduce": 6, "aberration": 7, "sessile": 6,
        "invalid_source": 0, "bbc": 4, "cet4": 3, "graduate": 5,
        "oxford3000": 4, "ielts": 5,
    }

    def __init__(self):
        """Do nothing; exists so subclasses can call ``super().__init__()``."""

    @property
    def level(self):
        """Return the placeholder level ``0``."""
        return 0


class ArticleVocabularyLevel(VocabularyLevelEstimator):
    """Estimate the vocabulary level of a piece of text.

    The score is the mean level of the lower-cased, whitespace-separated
    tokens (tokens missing from ``VOCAB_LEVELS`` count as 0) plus a boost
    that grows with the token count. Both the boost and the final score are
    capped according to the length tier the text falls into, and the result
    is rounded to two decimals.

    NOTE(review): tokens come from ``str.split`` alone, so a word with
    attached punctuation (e.g. ``"software."``) is looked up verbatim and
    scores 0 -- confirm whether punctuation should be stripped.
    """

    # Overrides the base-class table.
    # NOTE(review): every entry below also appears with the same level in
    # VocabularyLevelEstimator.VOCAB_LEVELS, but the harder words listed
    # there are missing here and therefore score 0 for articles -- confirm
    # that this shorter table is intended.
    VOCAB_LEVELS = {
        'a': 1, 'an': 1, 'the': 1, 'i': 1, 'you': 1, 'me': 1, 'to': 1, 'in': 1, 'on': 1,
        'how': 1, 'when': 1, 'at': 1, 'no': 1, 'longer': 2, 'give': 2, 'them': 1, 'work': 2,
        'open': 2, 'source': 2, 'software': 3, 'project': 3, 'run': 2, 'free': 1,
        'producing': 3, 'successful': 4, 'blank': 3, 'stare': 4, 'tell': 2, 'people': 2,
        'parties': 2, 'that': 1, 'is': 1, 'of': 1,
        'origin': 4, 'species': 5, 'conceivable': 5, 'naturalist': 6, 'reflecting': 4,
        'philosophical': 6, 'reasoning': 5, 'sufficient': 5, 'considering': 4,
    }

    # Length tiers, checked in order:
    # (max token count, boost per token beyond the first, boost cap, score cap)
    _LENGTH_TIERS = (
        (5, 0.08, 0.8, 6.0),
        (15, 0.10, 1.5, 6.0),
        (25, 0.18, 3.0, 7.0),
        (35, 0.25, 4.5, 7.5),
    )
    # (boost per token beyond the first, boost cap, score cap) for texts
    # longer than the last tier above.
    _LONG_TEXT_TIER = (0.30, 6.0, 8.0)

    def __init__(self, content):
        """Tokenise *content* and compute its level once.

        Args:
            content: the article text, either a single string or a list of
                strings that is joined with single spaces.
        """
        super().__init__()
        if isinstance(content, list):
            content = ' '.join(content)
        self.content = content
        self.word_lst = self.content.lower().split()
        self._level = self.calculate_level()

    def calculate_level(self):
        """Return the level of ``self.word_lst`` as a float rounded to 2 places.

        Returns ``0.0`` when there are no tokens.
        """
        word_count = len(self.word_lst)
        if word_count == 0:
            return 0.0  # nothing to average

        levels = [self.VOCAB_LEVELS.get(word, 0) for word in self.word_lst]
        base_level = sum(levels) / word_count

        # Pick the first tier whose length limit covers this text.
        boost_ratio, boost_cap, score_cap = next(
            (tier[1:] for tier in self._LENGTH_TIERS if word_count <= tier[0]),
            self._LONG_TEXT_TIER,
        )

        complexity_boost = min(boost_ratio * (word_count - 1), boost_cap)

        # Texts longer than 35 tokens always receive the full 6.0 boost
        # (0.30 * 35 = 10.5 already exceeds the cap), so with non-negative
        # levels their score can never be below 6.0; no separate floor for
        # long texts is required.
        return round(min(base_level + complexity_boost, score_cap), 2)

    @property
    def level(self):
        """Return the level computed when the instance was created."""
        return self._level


class UserVocabularyLevel(VocabularyLevelEstimator):
    """Estimate a user's vocabulary level from their most recent words."""

    def __init__(self, d):
        """Select the words the estimate is based on.

        Args:
            d: mapping of word -> indexable value whose first element is the
                sort key (presumably a timestamp; verify against callers).
        """
        super().__init__()
        self.d = d
        # Order entries from newest to oldest by the first element of each
        # value and keep the top three.
        newest_first = sorted(d.items(), key=lambda entry: entry[1][0], reverse=True)
        recent_entries = newest_first[:3]
        # NOTE(review): the alphabetic filter runs after the three-entry cut,
        # so a non-alphabetic key among the newest three shrinks the sample
        # instead of being replaced by an older word -- confirm intended.
        self.word_lst = [word for word, _ in recent_entries if word.isalpha()]

    @property
    def level(self):
        """Return the weighted level of the selected words, at most 8.0.

        Returns ``0.0`` when no valid word was selected.
        """
        if not self.word_lst:
            return 0.0  # no valid words to score

        # Look each word up in the level table; unknown words count as 0.
        word_levels = [self.VOCAB_LEVELS.get(word.lower(), 0) for word in self.word_lst]

        # Weighted average in which more recent words weigh more.
        weights = (3, 2, 1)[:len(word_levels)]
        weighted_total = sum(lvl * weight for lvl, weight in zip(word_levels, weights))
        weighted_average = weighted_total / sum(weights)

        # Floor: never report less than half of the hardest word's level.
        floor = max(word_levels) * 0.5
        estimate = max(weighted_average, floor)

        # Round to two decimals and cap the result at 8.0.
        return min(round(estimate, 2), 8.0)