# EnglishPal/app/vocabulary.py

'''
   Estimate a user's vocabulary level given his vocabulary data
   Estimate an English article's difficulty level given its content
   Preliminary design

   Hui, 2024-09-23
   Last updated: 2024-09-25, 2024-09-30
'''

import pickle

import nltk

# Difficulty score assigned to each word-list source name; a larger number
# means a harder source. Sources missing from this table are scored 0 by the
# lookups in this module (they use ``DIFFICULTY_MAPPING.get(src, 0)``).
DIFFICULTY_MAPPING = {
    'BBC': 2,       # basic vocabulary
    'CET4': 3,      # CET-4 (College English Test, band 4)
    'CET6': 4,      # CET-6 (College English Test, band 6)
    'GRADUATE': 5,  # postgraduate entrance exam vocabulary
    'IELTS': 6,     # IELTS
    'OXFORD3000': 4, # Oxford 3000 core words
    'OXFORD5000': 7 # Oxford 5000 words
}


def load_record(pickle_fname):
    """Read a pickle file and return the object stored in it.

    Args:
        pickle_fname: Path of the pickle file to open (opened in binary mode).

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the file content is not a valid pickle.
    """
    # NOTE(review): unpickling can execute arbitrary code, so this should only
    # be pointed at files the application itself produced -- confirm callers.
    with open(pickle_fname, 'rb') as pickle_file:
        return pickle.load(pickle_file)


class VocabularyLevelEstimator:
    """Base class that turns a list of words into an average difficulty level.

    Subclasses are expected to set ``self.word_lst`` (the words to score) in
    their ``__init__``; this class only provides the scoring logic.
    """

    # Loaded once, when the class body is executed (i.e. at import time).
    # The path is relative, so it is resolved against the current working
    # directory of the process.
    _test = load_record('words_and_tests.p') # map a word to the sources where it appears

    @property
    def level(self):
        """Return the mean difficulty of the known words in ``self.word_lst``.

        A word's difficulty is the highest ``DIFFICULTY_MAPPING`` score among
        the sources it appears in (unknown sources score 0). Words that are
        not present in ``_test`` are ignored and do not affect the average.

        Returns:
            float: The average difficulty, or ``0.0`` when ``word_lst`` is
            empty or none of its words is present in ``_test``.
        """
        if not self.word_lst:  # nothing to score
            return 0.0
        total = 0.0
        valid_words = 0
        for word in self.word_lst:
            if word in self._test:
                sources = self._test[word]
                # ``default=0`` keeps a word with an empty source list from
                # raising ValueError; it is scored like an unknown source.
                total += max(
                    (DIFFICULTY_MAPPING.get(src, 0) for src in sources),
                    default=0,
                )
                valid_words += 1
        return total / valid_words if valid_words > 0 else 0.0


class UserVocabularyLevel(VocabularyLevelEstimator):
    """Estimate a user's vocabulary level from their most recent words."""

    def __init__(self, d, recent_n=3):
        """Select the ``recent_n`` most recently recorded words of the user.

        Args:
            d: Mapping from word to an indexable record whose last element is
                used as the sort key (presumably a list of timestamps with
                the newest last -- confirm against the code that writes the
                pickle).
            recent_n: How many of the most recent words to keep.
        """
        self.d = d
        self.recent_n = recent_n

        def latest_entry(word):
            # The last element of a word's record serves as its timestamp.
            return d[word][-1]

        # Order the words from newest to oldest, then keep the first recent_n.
        newest_first = list(d.keys())
        newest_first.sort(key=latest_entry, reverse=True)
        self.word_lst = newest_first[:recent_n]


class ArticleVocabularyLevel(VocabularyLevelEstimator):
    """Estimate an article's difficulty level from its hardest words."""

    def __init__(self, content, top_n=10):
        """Tokenise ``content`` and keep its ``top_n`` most difficult words.

        Args:
            content: The article text.
            top_n: How many of the hardest words to keep for scoring
                (defaults to 10, the previously hard-coded value).
        """
        import re

        self.content = content
        self.top_n = top_n
        # Pre-processing: lowercase, split into word tokens (this drops
        # punctuation), then remove English stop words.
        stop_words = self._load_stop_words()
        words = re.findall(r'\b\w+\b', content.lower())
        candidates = [word for word in words if word not in stop_words]
        # Rank by difficulty, hardest first, and keep only the top_n words.
        # Repeated words are not de-duplicated, so a word may occupy several
        # of the top_n slots.
        self.word_lst = sorted(
            candidates,
            key=self._get_difficulty,
            reverse=True
        )[:top_n]

    @staticmethod
    def _load_stop_words():
        """Return the NLTK English stop words as a set.

        The corpus is downloaded only when it is not already installed,
        instead of calling ``nltk.download`` on every instantiation.
        """
        from nltk.corpus import stopwords

        try:
            return set(stopwords.words('english'))
        except LookupError:
            # NLTK raises LookupError when the corpus is missing locally.
            nltk.download('stopwords')
            return set(stopwords.words('english'))

    def _get_difficulty(self, word):
        """Return the highest difficulty score among the sources of ``word``.

        Words absent from ``_test`` score 0, as do words whose source list is
        empty or contains only sources missing from ``DIFFICULTY_MAPPING``.
        """
        if word in self._test:
            # ``default=0`` avoids ValueError on an empty source list.
            return max(
                (DIFFICULTY_MAPPING.get(src, 0) for src in self._test[word]),
                default=0,
            )
        return 0


if __name__ == '__main__':
    # Manual smoke test: print a user's raw record and estimated level,
    # then the estimated level of a short sample article.
    separator = "======================================================"

    d = load_record('frequency_mrlan85.pickle')
    print(d)
    print(separator)

    user = UserVocabularyLevel(d)
    print(user.level)  # level is a property, so no call parentheses
    print(separator)

    article = ArticleVocabularyLevel('This is an interesting article')
    print(article.level)