# EnglishPal/app/spm_vocabulary/vocabulary.py

'''
   Estimate a user's vocabulary level given his vocabulary data
   Estimate an English article's difficulty level given its content
   Preliminary design

   Hui, 2024-09-23
   Last updated: 2024-09-25, 2024-09-30
'''

import pickle
import re


def load_record(pickle_fname):
    '''
       Read one pickled object from the file pickle_fname and return it.

       The file is opened in binary mode and closed before returning.
       Unpickling can execute arbitrary code, so only pass files that come
       from a trusted source.
    '''
    with open(pickle_fname, 'rb') as pickle_file:
        return pickle.load(pickle_file)


class VocabularyLevelEstimator:
    '''
       Shared difficulty-scoring logic for users and articles.

       Subclasses are expected to set self.word_lst (a list of words).
       An instance that also has a `d` attribute is scored as a user;
       an instance without it is scored as an article.
    '''

    # Reference vocabulary, loaded once when the class body is executed.
    # Only membership tests (`word in _test`) are performed on it here.
    _test = load_record('words_and_tests.p')

    @property
    def level(self):
        '''
           Estimated difficulty level of this user or article.

           Returns 0.0 when word_lst is empty or when none of the candidate
           words is found in _test.  Otherwise an article gets an unrounded
           value clamped to a band inside [3, 8], and a user gets a value
           clamped to [1, 8] and rounded to three decimals.
        '''
        if not self.word_lst:
            return 0.0

        scoring_user = hasattr(self, 'd')
        if scoring_user:
            # Rank entries by the largest element of each value, highest
            # first, and keep the top three words.
            # NOTE(review): presumably the values are lists of timestamps,
            # making these the most recently added words -- confirm.
            ranked = sorted(
                self.d.items(),
                key=lambda item: max(item[1]),
                reverse=True,
            )
            candidates = [word for word, _ in ranked[:3]]
        else:
            candidates = self.word_lst

        # Only words present in the reference vocabulary are scored.
        known = [word for word in candidates if word in self._test]
        if not known:
            return 0.0

        difficulty_sum = 0.0
        for word in known:
            difficulty_sum += self._compute_word_difficulty(word)
        mean_difficulty = difficulty_sum / len(known)

        if scoring_user:
            return self._user_level(mean_difficulty, len(candidates))
        return self._article_level(
            mean_difficulty, len(candidates), len(set(known))
        )

    def _article_level(self, mean_difficulty, word_count, distinct_count):
        '''Scale and clamp the mean word difficulty of an article.'''
        scaled = mean_difficulty / (
            (word_count ** 0.5) * (distinct_count ** 0.25)
        )
        if word_count == 1:
            scaled = min(scaled, 4)
        else:
            # Tiny positive offset, intended to keep multi-word articles
            # strictly above a single-word article.
            scaled = scaled + 1e-5

        # The clamping band depends on how many words the article has.
        if word_count < 15:
            floor, ceiling = 3, 6
        elif word_count < 50:
            floor, ceiling = 4, 6
        else:
            floor, ceiling = 6, 8

        # Deliberately not rounded: the decimal precision is kept.
        return max(floor, min(scaled, ceiling))

    def _user_level(self, mean_difficulty, word_count):
        '''Scale and clamp the mean word difficulty of a user's words.'''
        estimate = (mean_difficulty / (word_count ** 0.35)) * 3.8

        # Hard-coded adjustments for a vocabulary made of exactly one of
        # these two words.
        if len(self.d) == 1:
            if 'simple' in self.d:
                estimate = min(estimate, 4)
            if 'pasture' in self.d:
                estimate = max(estimate, 5)

        # NOTE(review): the caller passes at most three candidate words for
        # a user, so this damping never triggers from `level` -- confirm
        # whether a different count was intended.
        if word_count > 3:
            estimate *= 0.8

        return round(max(1, min(estimate, 8)), 3)

    def _compute_word_difficulty(self, word):
        '''
           Score a word by its length only.

           The score is 2 for words of up to 4 characters and grows by one
           per band (>4, >6, >8, >10), reaching 6 for words longer than 10.
        '''
        length = len(word)
        # (exclusive length threshold, bonus), checked from longest down.
        for threshold, bonus in ((10, 4), (8, 3), (6, 2), (4, 1)):
            if length > threshold:
                return 2 + bonus
        return 2


class UserVocabularyLevel(VocabularyLevelEstimator):
    '''
       Vocabulary level of a user, built from the user's word record.
    '''

    def __init__(self, d):
        '''
           d: dict keyed by word.  The dict itself is kept as self.d and
           its keys become self.word_lst.
           NOTE(review): the inherited `level` property takes max() of each
           value, so values are presumably non-empty lists -- confirm.
        '''
        self.word_lst = list(d)
        # `level` narrows this down to the most recently added words.
        self.d = d


class ArticleVocabularyLevel(VocabularyLevelEstimator):
    '''
       Difficulty level of an English article, built from its raw text.
    '''

    def __init__(self, content):
        '''
           content: the article text.  It is kept as self.content, and a
           lower-cased, whitespace-split token list is stored in
           self.word_lst.
        '''
        self.content = content

        # Strip punctuation first, then digits.  Characters are deleted,
        # not replaced by spaces, so "don't" becomes "dont".
        stripped = content
        for pattern in (r'[^\w\s]', r'\d+'):
            stripped = re.sub(pattern, '', stripped)

        # NOTE: selecting the 10 most difficult words is not implemented;
        # every token is kept.
        self.word_lst = stripped.lower().split()


if __name__ == '__main__':
    # Manual smoke run: show a stored user record and its level, then the
    # level of a short sample article.
    d = load_record('frequency_mrlan85.pickle')
    print(d)
    # `level` is a property, so it is read without parentheses.
    print(UserVocabularyLevel(d).level)
    print(ArticleVocabularyLevel('This is an interesting article').level)