'''
Estimate a user's vocabulary level given his vocabulary data.
Estimate an English article's difficulty level given its content.

Preliminary design

Hui, 2024-09-23
Last updated: 2024-09-25, 2024-09-30
'''

import pickle
import re
from collections import defaultdict


def load_record(pickle_fname):
    """Load and return the object stored in the given pickle file."""
    with open(pickle_fname, 'rb') as f:
        d = pickle.load(f)
    return d


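# NOTE (assumption, not verified here): the two pickle files this module reads
# are expected to hold
#   words_and_tests.p        -- {word: iterable of source tags, e.g. {'CET4', 'BBC'}}
#   frequency_mrlan85.pickle -- {word: [timestamps]}, as loaded in __main__ below
# A minimal sketch of how a compatible words_and_tests.p could be written:
#   with open('words_and_tests.p', 'wb') as f:
#       pickle.dump({'apple': {'CET4'}, 'serendipity': {'GRE'}}, f)

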
class VocabularyLevelEstimator:
    _test = load_record('words_and_tests.p')  # maps each word to its sources
    _source_levels = {  # maps each source to a difficulty score
        'BBC': 1,
        'CET4': 2,
        'CET6': 3,
        'GRADUATE': 4,
        'OXFORD3000': 1,
        'TOEFL': 5,
        'IELTS': 5,
        'GRE': 7
    }

    def get_word_level(self, word):
        """Return the difficulty score of a word."""
        if word in self._test:
            sources = self._test[word]
            word_levels = [
                self._source_levels[src]
                for src in sources
                if src in self._source_levels
            ]
            if word_levels:
                # use the highest score among the word's sources
                return max(word_levels)
        return 0  # unknown words score 0
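
    # Hypothetical example (actual results depend on words_and_tests.p): if
    # _test['ambiguous'] were {'CET6', 'TOEFL'}, get_word_level('ambiguous') would
    # return max(3, 5) == 5, while a word missing from _test scores 0.

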
class UserVocabularyLevel(VocabularyLevelEstimator):
    def __init__(self, d, recent_count=3):
        self.d = d
        # sort by timestamp, most recent first
        sorted_words = sorted(d.items(), key=lambda x: max(x[1]), reverse=True)
        # keep only the most recent words (3 by default)
        self.word_lst = [word for word, _ in sorted_words[:recent_count]]
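        # `d` is assumed (from the sorting key above) to map each word to a list of
        # timestamps, e.g. {'apple': [1695456000, 1695460000]}, so max(x[1]) is the
        # word's most recent occurrence.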

    @property
    def level(self):
        if not self.word_lst:
            return 0.0

        # use the highest score among the recent words
        max_score = 0
        for word in self.word_lst:
            score = self.get_word_level(word)
            if score > max_score:
                max_score = score
        return max_score


class ArticleVocabularyLevel(VocabularyLevelEstimator):
    def __init__(self, content):
        self.content = content
        # smarter tokenization that handles hyphens and abbreviations
        words = re.findall(r'\b[\w-]+\b', content.lower())

        # count each word's frequency and look up its score
        word_freq = defaultdict(int)
        word_scores = {}

        for word in words:
            if word.isalpha():
                word_freq[word] += 1
                if word not in word_scores:
                    word_scores[word] = self.get_word_level(word)

        # weighted score = frequency * score
        weighted_scores = []
        for word, score in word_scores.items():
            if score > 0:
                weighted_scores.append((score * word_freq[word], score, word))

        # no valid words: return early
        if not weighted_scores:
            self.difficult_words = []
            return

        # sort by weighted score, highest first
        weighted_scores.sort(reverse=True)

        # keep only the top 20% of the words (at least 5, at most 15)
        num_top_words = max(5, min(15, len(weighted_scores) // 5))
        self.difficult_words = [score for _, score, _ in weighted_scores[:num_top_words]]
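
    # Worked example with hypothetical scores: if 'molecule' scored 4 and appeared
    # twice while 'theory' scored 3 and appeared five times, the weighted entries
    # would be (8, 4, 'molecule') and (15, 3, 'theory'); sorting ranks 'theory'
    # first, yet `level` below still reports the highest raw score kept, 4.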

    @property
    def level(self):
        if not self.difficult_words:
            return 0.0

        # use the highest score among the difficult words
        return max(self.difficult_words)


if __name__ == '__main__':
    d = load_record('frequency_mrlan85.pickle')
    print(d)
    user = UserVocabularyLevel(d)
    print(user.level)  # level is a property
    article = ArticleVocabularyLevel('This is an interesting article')
    print(article.level)
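
    # Hedged usage sketch with in-memory data (hypothetical words and timestamps,
    # not taken from the project's pickle files); the record shape assumed here is
    # {word: [timestamps]}, the same shape UserVocabularyLevel sorts on.
    sample_record = {
        'apple': [1695456000],
        'ambiguous': [1695460000],
        'serendipity': [1695470000],
    }
    sample_user = UserVocabularyLevel(sample_record)
    print(sample_user.level)  # 0 unless these words appear in words_and_tests.p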