1 changed files with 136 additions and 0 deletions
--- a/app/vocabulary.py
+++ b/app/vocabulary.py
@ -0,0 +1,136 @@
 ###########################################################################
 # Copyright 2019 (C) Hui Lan <hui.lan@cantab.net>
 # Written permission must be obtained from the author for commercial uses.
 ###########################################################################
 # Purpose: compute the difficulty level of an English text
 import pickle
 import math
 from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order, map_percentages_to_levels
 import snowballstemmer
 def load_record(pickle_fname):
     """Return the object stored in the pickle file named pickle_fname."""
     with open(pickle_fname, 'rb') as pickle_file:
         return pickle.load(pickle_file)
 # Module-level cache mapping a word to its difficulty level.  It starts out
 # empty, is replaced by convert_test_type_to_difficulty_level(), and is then
 # reused (and extended in place) by get_difficulty_level_for_user().
 ENGLISH_WORD_DIFFICULTY_DICT = {}
 def convert_test_type_to_difficulty_level(d):
     """
     Assign a difficulty level to the words of the word bank.

     The first matching rule wins: CET4 -> 4, OXFORD3000 -> 5,
     CET6/GRADUATE -> 6, OXFORD5000/IELTS -> 7, BBC -> 8.  A word that carries
     none of these test types is left out of the result.

     The result is also stored in the module-level
     ENGLISH_WORD_DIFFICULTY_DICT.

     :param d: dict holding the words of the word-bank pickle file; each value
               lists the test types (e.g., CET4, CET6, BBC) of that word
     :return: dict mapping a word to its level, e.g. {'apple': 4, ...}
     """
     global ENGLISH_WORD_DIFFICULTY_DICT
     # Checked top to bottom; only the first rule that matches a word is applied.
     rules = (
         (('CET4',), 4),  # CET4 word has level 4
         (('OXFORD3000',), 5),
         (('CET6', 'GRADUATE'), 6),
         (('OXFORD5000', 'IELTS'), 7),
         (('BBC',), 8),
     )
     levels = {}
     for word, test_types in d.items():
         for tags, level in rules:
             if any(tag in test_types for tag in tags):
                 levels[word] = level
                 break
     ENGLISH_WORD_DIFFICULTY_DICT = levels
     return levels
 def get_difficulty_level_for_user(d1, d2):
     """
     Return the word -> difficulty level dict, extended with the user's words.

     d2 comes from the 35511 tagged words of the word bank.
     d1 holds the words the user does not know.
     The user's words are added to the level dict built from d2; no new dict
     is created for them.
     """
     # TODO: convert_test_type_to_difficulty_level() should not be called every time.  Each word's difficulty level should be pre-computed.
     # Reuse the cached levels when available; otherwise rate d2 by its tags,
     # giving {'apple': 4, 'abandon': 4, ...}.
     levels = (ENGLISH_WORD_DIFFICULTY_DICT
               if ENGLISH_WORD_DIFFICULTY_DICT != {}
               else convert_test_type_to_difficulty_level(d2))
     stemmer = snowballstemmer.stemmer('english')
     for word in d1:  # the user's words
         if word in levels:
             # The word is present in the word bank as-is: already rated.
             continue
         # Rate the word by its stem when the stem is known; an unknown stem
         # is treated as level 3.
         levels[word] = levels.get(stemmer.stemWord(word), 3)
     return levels
 def revert_dict(d):
     '''
     Invert a word -> times mapping into a time -> words mapping.

     In d, word is the key, and value is a list of time strings such as
     '2021082019' (presumably YYYYMMDDHH).  For backward compatibility the
     value may also be an int: d used to be sth like {'word': 1}, where the
     number tells how often the word had been added to the new word book.

     In d2 (the returned value of this function), the first 10 characters of
     a time string are the key, and the value is the list of words picked at
     that time.  A word is listed once per time string that maps to the key.

     A value that is neither a list nor an int is skipped, so such a word
     never ends up filed under another word's times.
     '''
     d2 = {}
     for k, value in d.items():
         if isinstance(value, list):  # value is a list of time strings.
             lst = value
         elif isinstance(value, int):
             # Legacy format: expand the count into that many placeholder
             # entries.  The placeholder has no particular meaning; it is
             # the date on which the legacy-format bug was fixed.
             lst = value * ['2021082019']
         else:
             continue  # unsupported value type: nothing to record for k
         for time_info in lst:
             date = time_info[:10]  # until hour
             d2.setdefault(date, []).append(k)
     return d2
 class VocabularyLevelEstimator:
     """
     Base class of the vocabulary level estimators.

     The level property reads self.word_lst, which this class does not set
     itself; the subclasses in this module assign it in __init__.
     """
     # map a word to the sources where it appears (loaded when the class is created)
     _test = load_record('words_and_tests.p')
     @property
     def level(self):
         """
         Print every word of self.word_lst (with its sources when the word is
         known to _test) and return total / (number of words + 1).

         total is never updated yet, so the returned value is currently 0.0.
         """
         total = 0.0  # TODO: need to compute this number
         words_seen = 0
         for word in self.word_lst:
             words_seen += 1
             known = word in self._test
             print(f'{word} : {self._test[word]}' if known else f'{word}')
         # The divisor is one more than the word count, so it is never zero.
         return total / (words_seen + 1)
 class UserVocabularyLevel(VocabularyLevelEstimator):
     """Vocabulary level estimator whose words are the keys of a dict d."""
     def __init__(self, d):
         """Keep d and expose all of its keys as the word list."""
         # NOTE(review): the original note says "just look at the most
         # recently-added words", but every key of d is used at present.
         word_keys = d.keys()
         self.d = d
         self.word_lst = list(word_keys)
 class ArticleVocabularyLevel(VocabularyLevelEstimator):
     """Vocabulary level estimator whose words come from an article's text."""
     def __init__(self, content):
         """Keep content and split its lower-cased text on whitespace."""
         # NOTE(review): the original note says "select the 10 most difficult
         # words", but every whitespace-separated token is kept at present.
         lowered = content.lower()
         self.content = content
         self.word_lst = lowered.split()
 if __name__ == '__main__':
     # Manual run: print a stored user record, then the level computed for
     # that record and for a short sample article.
     record = load_record('frequency_mrlan85.pickle')
     print(record)
     print(UserVocabularyLevel(record).level)  # level is a property
     print(ArticleVocabularyLevel('This is an interesting article').level)