Added a vocabulary.py,fixed bug585...likely,at least it works

Bug585-WangXuan
王轩 2025-07-02 06:10:09 +08:00
parent d9512c929b
commit 72fa5127b4
1 changed files with 136 additions and 0 deletions

136
app/vocabulary.py Normal file
View File

@ -0,0 +1,136 @@
###########################################################################
# Copyright 2019 (C) Hui Lan <hui.lan@cantab.net>
# Written permission must be obtained from the author for commercial uses.
###########################################################################
# Purpose: compute difficulty level of a English text
import pickle
import math
from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order, map_percentages_to_levels
import snowballstemmer
def load_record(pickle_fname):
with open(pickle_fname, 'rb') as f:
d = pickle.load(f)
return d
ENGLISH_WORD_DIFFICULTY_DICT = {}
def convert_test_type_to_difficulty_level(d):
"""
对原本的单词库中的单词进行难度评级
:param d: 存储了单词库pickle文件中的单词的字典
:return:
"""
result = {}
L = list(d.keys()) # in d, we have test types (e.g., CET4,CET6,BBC) for each word
for k in L:
if 'CET4' in d[k]:
result[k] = 4 # CET4 word has level 4
elif 'OXFORD3000' in d[k]:
result[k] = 5
elif 'CET6' in d[k] or 'GRADUATE' in d[k]:
result[k] = 6
elif 'OXFORD5000' in d[k] or 'IELTS' in d[k]:
result[k] = 7
elif 'BBC' in d[k]:
result[k] = 8
global ENGLISH_WORD_DIFFICULTY_DICT
ENGLISH_WORD_DIFFICULTY_DICT = result
return result # {'apple': 4, ...}
def get_difficulty_level_for_user(d1, d2):
"""
d2 来自于词库的35511个已标记单词
d1 用户不会的词
在d2的后面添加单词没有新建一个新的字典
"""
# TODO: convert_test_type_to_difficulty_level() should not be called every time. Each word's difficulty level should be pre-computed.
if ENGLISH_WORD_DIFFICULTY_DICT == {}:
d2 = convert_test_type_to_difficulty_level(d2) # 根据d2的标记评级{'apple': 4, 'abandon': 4, ...}
else:
d2 = ENGLISH_WORD_DIFFICULTY_DICT
stemmer = snowballstemmer.stemmer('english')
for k in d1: # 用户的词
if k in d2: # 如果用户的词以原型的形式存在于词库d2中
continue # 无需评级,跳过
else:
stem = stemmer.stemWord(k)
if stem in d2: # 如果用户的词的词根存在于词库d2的词根库中
d2[k] = d2[stem] # 按照词根进行评级
else:
d2[k] = 3 # 如果k的词根都不在那么就当认为是3级
return d2
def revert_dict(d):
'''
In d, word is the key, and value is a list of dates.
In d2 (the returned value of this function), time is the key, and the value is a list of words picked at that time.
'''
d2 = {}
for k in d:
if type(d[k]) is list: # d[k] is a list of dates.
lst = d[k]
elif type(d[
k]) is int: # for backward compatibility. d was sth like {'word':1}. The value d[k] is not a list of dates, but a number representing how frequent this word had been added to the new word book.
freq = d[k]
lst = freq * ['2021082019'] # why choose this date? No particular reasons. I fix the bug in this date.
for time_info in lst:
date = time_info[:10] # until hour
if not date in d2:
d2[date] = [k]
else:
d2[date].append(k)
return d2
class VocabularyLevelEstimator:
_test = load_record('words_and_tests.p') # map a word to the sources where it appears
@property
def level(self):
total = 0.0 # TODO: need to compute this number
num = 1
for word in self.word_lst:
num += 1
if word in self._test:
print(f'{word} : {self._test[word]}')
else:
print(f'{word}')
return total/num
class UserVocabularyLevel(VocabularyLevelEstimator):
def __init__(self, d):
self.d = d
self.word_lst = list(d.keys())
# just look at the most recently-added words
class ArticleVocabularyLevel(VocabularyLevelEstimator):
def __init__(self, content):
self.content = content
self.word_lst = content.lower().split()
# select the 10 most difficult words
if __name__ == '__main__':
d = load_record('frequency_mrlan85.pickle')
print(d)
user = UserVocabularyLevel(d)
print(user.level) # level is a property
article = ArticleVocabularyLevel('This is an interesting article')
print(article.level)